def getPagesShopUrls(first_url):
    '''return all the shop urls within the first page's restriction'''
    global shop_urls, lock
    html_text = getHtmlText(first_url)
    max_page = getMaxPageNum(getSoup(html_text))
    # list pages are paginated by appending 'p<n>' to the list URL
    page_urls = [first_url + 'p' + str(n) for n in range(1, max_page + 1)]
    for page_url in page_urls:
        page_shop_urls = getPageShops(page_url)
        with lock:
            shop_urls.extend(page_shop_urls)
            print '[=] Shop urls length: ' + str(len(shop_urls))
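# getHtmlText, getSoup, getMaxPageNum and getPageShops are referenced above but
# defined elsewhere in the repo. A minimal sketch of the first three, assuming
# the project uses requests and BeautifulSoup (an assumption, not confirmed by
# this section) and guessing at the pagination markup:
import requests
from bs4 import BeautifulSoup

def getHtmlText(url):
    '''fetch a page, returning '' on any network failure'''
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        return ''

def getSoup(html_text):
    return BeautifulSoup(html_text, 'html.parser')

def getMaxPageNum(soup):
    '''largest page number in the pagination bar; the "page" class name is a
    guess at Dianping's markup, not something this section confirms'''
    pager = soup.find(class_='page')
    if pager is None:
        return 1
    nums = [int(a.text) for a in pager.find_all('a') if a.text.strip().isdigit()]
    return max(nums) if nums else 1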
def parse_file(file_name):
    global lock
    with open(file_name, 'r') as f:
        text = f.read()
    soup = getSoup(text)
    # filenames end in '_<shop_id>.html', so take the piece between the last
    # underscore and the extension
    shop_id = file_name.split('_')[-1].split('.')[-2]
    d = parse_soup(soup)
    d['shop_id'] = shop_id
    with lock:
        db.bjstore.insert_one(d)
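# The functions here lean on module-level shared state that never appears in
# this section. A minimal sketch of that setup, assuming pymongo and the
# standard threading module; the database name 'dianping' is a placeholder,
# since only the collection names (bjstore, bjfoodstore) show up here:
import threading
import pymongo

lock = threading.Lock()    # guards shop_urls and the MongoDB writes
shop_urls = []             # filled by getPagesShopUrls across threads
db = pymongo.MongoClient('localhost', 27017).dianping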
def change_page_crawled(fpath):
    global origin_is_page_crawled
    l = fpath.split('/')[-1].split('.')[0].split('_')
    num = l[-1][-1]    # last character only, so this assumes single-digit page numbers
    sid = l[1]
    url = 'https://www.dianping.com/shop/' + sid + '/review_more?pageno=' + num
    origin_is_page_crawled[url] = True
    with open(fpath, 'r') as f:
        html_text = f.read()
    soup = getSoup(html_text)
    max_page_num = getMaxPageNum(soup)
    if max_page_num > 1:
        l = generatePageList(url, max_page_num)
        print "[+] Appending %d pages" % len(l)
        for u in l:
            origin_is_page_crawled[u] = False
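# generatePageList is defined elsewhere in the repo. Judging from how it is
# used above, a plausible sketch is one review_more URL per remaining page;
# whether the real helper skips the already-crawled page is an open guess:
def generatePageList(url, max_page_num):
    '''enumerate review_more URLs for pages 2..max_page_num, assuming url
    ends with "?pageno=<n>" as constructed in change_page_crawled'''
    base = url.split('?')[0]
    return [base + '?pageno=' + str(n) for n in range(2, max_page_num + 1)]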
def parse_file(file_name):
    with open(file_name, 'r') as f:
        text = f.read()
    soup = getSoup(text)
    shop_id = file_name.split('_')[-2]
    try:
        items = soup.find(class_="comment-list").find_all(
            id=re.compile("rev_"))
    except AttributeError:
        # no comment list on this page; a bare `yield` here would emit a
        # spurious None and then crash on the undefined `items`, so stop
        # the generator instead
        return
    for item in items:
        try:
            d = parse_item(item)
            d['shop_id'] = shop_id
            yield d
        except Exception:
            pass
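# Because this variant of parse_file is a generator, a caller can stream the
# comment dicts into MongoDB one at a time instead of batching them. A usage
# sketch (the file path is illustrative, matching the '_<shop_id>_<page>'
# naming the [-2] index implies):
for d in parse_file('htmls/shop_12345678_1.html'):
    db.bjfoodstore.insert_one(d)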
def parse_file(file_name):
    global lock
    with open(file_name, 'r') as f:
        text = f.read()
    soup = getSoup(text)
    shop_id = file_name.split('_')[-2]
    try:
        items = soup.find(class_="comment-list").find_all(
            id=re.compile("rev_"))
    except AttributeError:
        return

    def lambda_add_shop_id(d):
        d['shop_id'] = shop_id
        return d

    # the original mapped an undefined `lambda_item`; parse_item is the
    # per-comment parser used by the sibling versions of this function
    all_comment_dict = map(parse_item, items)
    all_comment_dict = map(lambda_add_shop_id, all_comment_dict)
    with lock:
        try:
            db.bjfoodstore.insert_many(all_comment_dict)
        except TypeError:
            pass
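# parse_file only takes the global lock for the final insert_many, so the
# parsing itself can run in parallel. A sketch of a simple four-thread driver
# (the htmls/ directory and the thread count are assumptions):
import os
import threading

def worker(paths):
    for fp in paths:
        try:
            parse_file(fp)
        except Exception:
            print "[-] Failed on " + fp

all_files = []
for root, dirs, files in os.walk("htmls/"):
    for name in files:
        all_files.append(os.path.join(root, name))

threads = [threading.Thread(target=worker, args=(all_files[i::4],))
           for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()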
# (tail of parse_item; the fields parsed earlier in the function fall outside
# this section)
        heart_num = 0
    d['heart_num'] = heart_num
    # number of replies
    try:
        recomment_num = eval(item.find(class_="J_rtl").text)
    # the original `except AttributeError, TypeError:` only caught
    # AttributeError (binding it to the name TypeError); a tuple catches both
    except (AttributeError, TypeError):
        recomment_num = 0
    d['recomment_num'] = recomment_num
    return d


# ad-hoc smoke test on a single file; file_name must be defined beforehand
with open(file_name, 'r') as f:
    text = f.read()
soup = getSoup(text)
shop_id = file_name.split('_')[-2]
items = soup.find(class_="comment-list").find_all(id=re.compile("rev_"))
d = parse_item(items[0])

#count = 0
#
#for root, dirs, files in os.walk("/Users/xuegeng/Spider_Workspace/crawler/htmls/", topdown=False):
#    for name in files:
#        fp = os.path.join(root, name)
#        try:
#            parse_file(fp)
#            print "[+] Processing "+fp+" Complete: " + format(count/7980.0,".5f") + "%"
#            count += 1
#        except AttributeError:
#            print "[-] It might be a lousy store: "+fp
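# eval() on scraped text will execute whatever a malformed or hostile page
# puts in that tag. A safer drop-in (safe_int is a suggested helper, not part
# of the original code):
def safe_int(text, default=0):
    '''int() instead of eval(); falls back to default on anything non-numeric'''
    try:
        return int(text.strip())
    except (ValueError, AttributeError):
        return default

# usage inside parse_item:
#     tag = item.find(class_="J_rtl")
#     recomment_num = safe_int(tag.text if tag else '')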