def getPagesShopUrls(first_url):
    '''return all the shop urls within the first page's restriction'''
    global shop_urls, lock
    html_text = getHtmlText(first_url)
    max_page = getMaxPageNum(getSoup(html_text))
    # page numbers are 1-based: first_url + 'p1', 'p2', ..., 'p<max_page>'
    page_urls = [first_url + 'p' + str(x) for x in range(1, max_page + 1)]
    for page_url in page_urls:
        page_shop_urls = getPageShops(page_url)
        with lock:
            shop_urls.extend(page_shop_urls)
            print '[=] Shop urls length: ' + str(len(shop_urls))
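
# --- Hedged sketch of the module-level state these snippets assume ---
# The functions in this listing reference shop_urls, lock, db, getSoup and
# getHtmlText without defining them.  Only the names come from the snippets;
# the connection details, database name and HTTP handling below are
# assumptions, and site-specific helpers such as getMaxPageNum and
# getPageShops are not sketched here.
import threading

import requests
import pymongo
from bs4 import BeautifulSoup

shop_urls = []                      # shared result list, guarded by `lock`
lock = threading.Lock()             # serialises list appends and Mongo writes
db = pymongo.MongoClient('localhost', 27017).dianping   # database name is a guess


def getHtmlText(url):
    # fetch a page as text (headers / retries / encoding handling omitted)
    return requests.get(url, timeout=10).text


def getSoup(html_text):
    # parse raw HTML into a BeautifulSoup tree
    return BeautifulSoup(html_text, 'html.parser')
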
def parse_file(file_name):
    global lock

    with open(file_name, 'r') as f:
        text = f.read()
    soup = getSoup(text)
    shop_id = file_name.split('_')[-1].split('.')[-2]

    d = parse_soup(soup)
    d['shop_id'] = shop_id

    with lock:
        db.bjstore.insert_one(d)
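
# Hedged example of the filename layout this parse_file assumes: the shop id
# sits in the last '_'-separated token before the extension.  The sample path
# below is made up for illustration.
sample = '/tmp/htmls/shop_510025.html'
print(sample.split('_')[-1].split('.')[-2])   # -> 510025
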
def change_page_crawled(fpath):
    global origin_is_page_crawled
    # the saved filename is assumed to look like '<prefix>_<shop_id>_p<N>.html';
    # note that only the last character is kept, so page numbers >= 10 are truncated
    l = fpath.split('/')[-1].split('.')[0].split('_')
    num = l[-1][-1]
    sid = l[1]
    url = 'https://www.dianping.com/shop/' + sid + '/review_more?pageno=' + num
    origin_is_page_crawled[url] = True
    with open(fpath, 'r') as f:
        html_text = f.read()
    soup = getSoup(html_text)
    max_page_num = getMaxPageNum(soup)
    if max_page_num > 1:
        l = generatePageList(url, max_page_num)
        print "[+] Appending %d pages" % len(l)
        for u in l:
            origin_is_page_crawled[u] = False
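
# Hedged illustration of the filename -> review-page URL mapping performed by
# change_page_crawled above.  The sample path is made up; the URL pattern is
# the one used in the function.
sample = '/tmp/htmls/review_510025_p3.html'
parts = sample.split('/')[-1].split('.')[0].split('_')    # ['review', '510025', 'p3']
print('https://www.dianping.com/shop/' + parts[1] + '/review_more?pageno=' + parts[-1][-1])
# -> https://www.dianping.com/shop/510025/review_more?pageno=3
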
def parse_file(file_name):
    with open(file_name, 'r') as f:
        text = f.read()
    soup = getSoup(text)
    shop_id = file_name.split('_')[-2]
    try:
        items = soup.find(class_="comment-list").find_all(
            id=re.compile("rev_"))
    except AttributeError:
        # no comment list on this page: stop the generator early
        return

    for item in items:
        try:
            d = parse_item(item)
            d['shop_id'] = shop_id
            yield d
        except Exception:
            # skip malformed review entries
            pass
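
# Hedged usage sketch for the generator variant of parse_file above.  Here the
# shop id is taken from the second-to-last '_' token, so a saved file named
# like 'shop_510025_3.html' is assumed; the path and the bjfoodstore collection
# follow the other snippets rather than anything documented.
for review in parse_file('/tmp/htmls/shop_510025_3.html'):
    db.bjfoodstore.insert_one(review)
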
def parse_file(file_name):
    global lock

    with open(file_name, 'r') as f:
        text = f.read()
    soup = getSoup(text)
    shop_id = file_name.split('_')[-2]
    try:
        items = soup.find(class_="comment-list").find_all(
            id=re.compile("rev_"))
    except AttributeError:
        return

    def lambda_add_shop_id(d):
        d['shop_id'] = shop_id
        return d

    # lambda_item is presumably the per-review parser defined elsewhere (cf. parse_item)
    all_comment_dict = map(lambda_item, items)
    all_comment_dict = map(lambda_add_shop_id, all_comment_dict)
    with lock:
        try:
            db.bjfoodstore.insert_many(all_comment_dict)
        except TypeError:
            pass
    # tail of the per-review parser parse_item(item): fills in the review dict d
        heart_num = 0
    d['heart_num'] = heart_num

    # reply count
    try:
        recomment_num = eval(item.find(class_="J_rtl").text)
    except (AttributeError, TypeError):
        recomment_num = 0
    d['recomment_num'] = recomment_num

    return d


with open(file_name, 'r') as f:
    text = f.read()
soup = getSoup(text)
shop_id = file_name.split('_')[-2]
items = soup.find(class_="comment-list").find_all(id=re.compile("rev_"))
d = parse_item(items[0])

#count = 0
#
#for root, dirs, files in os.walk("/Users/xuegeng/Spider_Workspace/crawler/htmls/", topdown=False):
#    for name in files:
#        fp = os.path.join(root, name)
#        try:
#            parse_file(fp)
#            print "[+] Processing "+fp+" Complete: " + format(count/7980.0,".5f") + "%"
#            count += 1
#        except AttributeError:
#            print "[-] It might be a lousy store: "+fp