# Example no. 1
def crawl_dianping_shop(shop_id):
    """Download one dianping.com shop page by numeric id, save the raw HTML
    under BASE_PATH/shop/<shop_id>, and parse it with BeautifulSoup.

    NOTE(review): this function appears truncated in this view -- the parsing
    loop at the bottom ends in an unterminated triple-quoted string, so the
    full original body is not visible here.
    """
    global shop_count
    url = "http://www.dianping.com/shop/%d" % int(shop_id)
    #url = "http://www.dianping.com/shop/531684"
    #url = 'http://www.dianping.com/shop/2744077'
    print "shop url:%s" % url
    # Global progress counter of shop pages crawled so far.
    shop_count = shop_count + 1
    print "shop_count=%d" % shop_count

    # Time the download; DownloadManager is a project-level helper.
    tstart = datetime.now() 
    downloader = DownloadManager(None, None, None)
    error_msg, url, redirected_url, html =  downloader.download(url)
    tend = datetime.now() 
    c = tend - tstart 
    print c    
    
    # Best-effort: a failed download is logged and skipped, not raised.
    if html is None:
        print "download error"
        return None


    # write file to local folder
    # NOTE(review): the builtin name `file` is shadowed below, and there is no
    # try/finally, so the handle leaks if write() raises -- worth fixing.
    file_path = BASE_PATH + "shop/" + shop_id
    file = open(file_path,"wb")
    file.write(html)
    file.close()

    # Sniff the page's declared charset from its <meta ... charset=...> tag.
    html_encoding_match  = None
    regexp = re.compile('<\s*meta[^>]+charset=[\'"]?([^>]*?)[;\'">]', re.I)
    html_encoding_match = regexp.search(html)
    if html_encoding_match is not None:
        html_encoding = html_encoding_match.groups()[0].lower()

    # NOTE(review): if no charset meta tag matched, `html_encoding` is unbound
    # here and this comparison raises NameError -- confirm and add a default.
    if html_encoding == "gb2312":
        # GB18030 is a superset of GB2312, so decoding with it is safe.
        soup = BeautifulSoup(html, fromEncoding='GB18030')
    else:
        soup = BeautifulSoup(html)

    
    # basic info block
    shop_info_inner_blocks = soup.findAll(True, {'class': re.compile(r'\bshop-info-inner\b')}) 
    for  shop_info_inner_block in shop_info_inner_blocks:
        pass
        """shop_name_tag = shop_info_inner_block.findNext('h1')
# Example no. 2
def crawl_top_category_list(url): 
    global category_count
    # download this link for iphone5 
    # http://tech.sina.com.cn/z/iphone5/index.shtml
    #url = "http://www.dianping.com/search/category/9/10/g473"
    proxy = {'http' : '79.127.144.2:8080'}
    #downloader = DownloadManager(None, None, proxy)
    tstart = datetime.now() 
    downloader = DownloadManager(None, None, None)
    error_msg, url, redirected_url, html =  downloader.download(url)
    tend = datetime.now() 
    c = tend - tstart 
    print "download time"
    print c
    
    category_count = category_count + 1
    print "category_count=%d" % category_count
    
    encoding_bug = None
    print "get list"
    soup = BeautifulSoup(html)

    shop_lists= soup.find("div", {"id":"searchList"})
    #for item in shop_lists.dl:
    #    print item
    #get all shops from 
    shop_anchor_list = shop_lists.findAll('a', href=re.compile('/shop/(\d+)', re.I))
    for link in shop_anchor_list:
        p = re.compile('/shop/(\d+)', re.I)
        m = p.match(link['href'])
        g = m.group(0)
        g = m.group(1)
        print "shop id:%s" % g
        crawl_dianping_shop(g)
        print link['href']

    #get category
    #http://www.dianping.com/search/category/9/10/g473p2
    #http://www.dianping.com/search/category/9/10/g473r45/g10g473r45
    category_lists= soup.findAll("a", href=re.compile('/search/category/.+', re.I))
    for link in category_lists:
        url = "http://www.dianping.com" + link['href']
        crawl_top_category_list(url)