def get_info():
    """Fetch product ids for every category listing URL.

    Reads the listing URLs produced by the category-splitting step
    (less_9800.csv plus split_more9800.csv) and fans them out over a
    15-worker pool calling ``handle``.  Workers write results through
    the module-global handles ``f_asins`` and ``f_success``, guarded
    by the module-global ``lock``.
    """
    global lock, f_success, f_asins, tool
    lock = Lock()
    tool = httptools.httptools()
    # Context managers ensure the input files are closed even on error.
    with open(config.category_listings_path + 'less_9800.csv') as f1:
        lines = f1.readlines()
    with open(config.category_listings_path + 'split_more9800.csv') as f2:
        lines.extend(f2.readlines())
    f_asins = open(config.get_ids_path + 'ids.csv', 'w')
    f_success = open(config.get_ids_path + 'success_url.csv', 'w')
    f_success.write('listings_count' + '\t' + 'id_count' + '\t' + 'url' + '\n')
    f_success.flush()
    pool = Pool(15)
    try:
        pool.map(handle, lines)
        pool.close()
        pool.join()
    finally:
        # Close shared output handles even if a worker raised.
        f_asins.close()
        f_success.close()
def handle():
    """Download the product page for every id listed in pro_ids.csv.

    Reads the id file in batches of 10000 lines to bound memory use and
    maps each batch over a 10-worker pool calling ``get_product_html``.
    Workers append extracted ids through the module-global
    ``success_file``, guarded by the module-global ``lock``.
    """
    global lock, tool, success_file
    tool = httptools.httptools()
    lock = Lock()
    # BUG FIX: 'aw' is not a valid open mode; append ('a') was intended.
    success_file = open(config.pro_html_path + 'ids_FromProHtml.txt', 'a')
    path = config.get_ids_path + 'pro_ids.csv'
    f1 = open(path)
    try:
        while True:
            # Read at most 10000 lines per batch to limit memory usage.
            lines = []
            for _ in range(10000):
                line = f1.readline()
                if not line:
                    break
                lines.append(line)
            if not lines:
                break
            pool = Pool(10)
            pool.map(get_product_html, lines)
            pool.close()
            pool.join()
            if not line:
                # A short batch means EOF was reached; stop after
                # processing the final partial batch.
                break
    finally:
        # Close both files even if a worker raised mid-run.
        f1.close()
        success_file.close()
def get_root_listings():
    """Return the listings count shown on the root category page.

    Fetches the root category URL and extracts the number of listings
    via ``regex_listing`` and ``regex_num``.  Returns the count as a
    comma-stripped string on success, or the integer 0 when the page
    does not contain a parseable count.
    """
    target_url = config.url.replace('[category]', config.root_category_id)
    page = httptools.httptools().gethtml(target_url)
    listing_match = regex_listing.search(page)
    if listing_match is None:
        return 0
    count_match = regex_num.search(listing_match.group(1))
    if count_match is None:
        return 0
    count = count_match.group().replace(',', '')
    if count == '':
        return 0
    return count
def handle():
    """Collect leaf category ids under the root category.

    Fetches the root category page info and maps the resulting category
    list over a 15-worker pool calling ``get_leaf_cate_id``.  Workers
    write each leaf id through the module-global file ``f``, guarded by
    the module-global ``lock``.
    """
    global tool, f, lock
    tool = httptools.httptools()
    lock = Lock()
    path = config.category_listings_path + 'leaf_cate_id.csv'
    f = open(path, 'w')
    try:
        root_id = config.root_category_id
        cate_list = get_current_page_info(root_id)
        pool = Pool(15)
        pool.map(get_leaf_cate_id, cate_list)
        pool.close()
        pool.join()
    finally:
        # Ensure the shared output file is closed even if a worker raises.
        f.close()
def get_suit_url():
    """Split category URLs with more than 9800 listings into smaller ones.

    Reads more_9800.csv and maps each line over a 15-worker pool calling
    ``handle_url``.  Workers write the split URLs through the
    module-global ``f_suit_url``, guarded by the module-global ``lock``.
    """
    global f_suit_url, lock, tool
    tool = httptools.httptools()
    lock = Lock()
    # Context manager ensures the input file is closed even on error.
    with open(config.category_listings_path + 'more_9800.csv') as f:
        lines = f.readlines()
    f_suit_url = open(config.category_listings_path + 'split_more9800.csv', 'w')
    try:
        pool = Pool(15)
        pool.map(handle_url, lines)
        pool.close()
        pool.join()
    finally:
        # Close the shared output file even if a worker raised.
        f_suit_url.close()
def get_status(): print '1 get spot status' print '2 pass' num_input = raw_input('input number:') if judge(num_input, 2) == 0: print 'Input is not legal, please re-enter.' get_status() if num_input == '2': return tool = httptools.httptools() root_url = config.url.replace('[category]', config.root_category_id) listings = get_listings(tool, root_url) except_space = int(listings) * 0.1745 / 1024 left_space = get_left_space() print 'listings:', listings print 'excepted space:', str(int(except_space)) + 'G' print 'left space:', str(left_space) + 'G' get_running_process()
def get_category_listings():
    """Sort leaf categories by listing count into two output files.

    Reads the leaf category ids from leaf_cate_id.csv and maps them over
    a 15-worker pool calling ``handle``.  Workers write each category to
    the module-global ``f_less`` or ``f_more`` (split at 9800 listings,
    per the file names), guarded by the module-global ``lock``.
    """
    global lock, f_less, f_more, tool
    tool = httptools.httptools()
    lock = Lock()
    # Context manager ensures the input file is closed even on error.
    with open(config.category_listings_path + 'leaf_cate_id.csv', 'r') as f:
        leafs = f.readlines()
    f_less = open(config.category_listings_path + 'less_9800.csv', 'w')
    f_more = open(config.category_listings_path + 'more_9800.csv', 'w')
    try:
        pool = Pool(15)
        pool.map(handle, leafs)
        pool.close()
        pool.join()
    finally:
        # Close both shared output files even if a worker raised.
        f_less.close()
        f_more.close()