Example #1
def crawl_dianping_shop(shop_id):
    global shop_count
    url = "http://www.dianping.com/shop/%d" % int(shop_id)
    #url = "http://www.dianping.com/shop/531684"
    #url = 'http://www.dianping.com/shop/2744077'
    print "shop url:%s" % url
    shop_count = shop_count + 1
    print "shop_count=%d" % shop_count

    tstart = datetime.now()
    downloader = DownloadManager(None, None, None)
    error_msg, url, redirected_url, html = downloader.download(url)
    tend = datetime.now()
    print "download time"
    print tend - tstart
    
    if html is None:
        print "download error"
        return None


    # write the raw html to a local file
    file_path = BASE_PATH + "shop/" + shop_id
    with open(file_path, "wb") as f:
        f.write(html)

    # detect the declared page encoding from the <meta charset> tag
    html_encoding = None
    regexp = re.compile(r'<\s*meta[^>]+charset=[\'"]?([^>]*?)[;\'">]', re.I)
    html_encoding_match = regexp.search(html)
    if html_encoding_match is not None:
        html_encoding = html_encoding_match.groups()[0].lower()

    # pages declared as gb2312 are parsed as GB18030, its superset,
    # to survive characters outside the gb2312 range
    if html_encoding == "gb2312":
        soup = BeautifulSoup(html, fromEncoding='GB18030')
    else:
        soup = BeautifulSoup(html)

    
    # basic info block
    shop_info_inner_blocks = soup.findAll(True, {'class': re.compile(r'\bshop-info-inner\b')})
    for shop_info_inner_block in shop_info_inner_blocks:
        pass
        # shop_name_tag = shop_info_inner_block.findNext('h1')
        # (the rest of this parsing code is truncated in the excerpt)
Example #2
def crawl_top_category_list(url): 
    global category_count
    # download this link for iphone5 
    # http://tech.sina.com.cn/z/iphone5/index.shtml
    #url = "http://www.dianping.com/search/category/9/10/g473"
    proxy = {'http' : '79.127.144.2:8080'}
    #downloader = DownloadManager(None, None, proxy)
    tstart = datetime.now()
    downloader = DownloadManager(None, None, None)
    error_msg, url, redirected_url, html = downloader.download(url)
    tend = datetime.now()
    print "download time"
    print tend - tstart
    
    category_count = category_count + 1
    print "category_count=%d" % category_count
    
    if html is None:
        print "download error"
        return

    print "get list"
    soup = BeautifulSoup(html)

    shop_lists = soup.find("div", {"id": "searchList"})
    if shop_lists is None:
        print "no search list on this page"
        return
    #for item in shop_lists.dl:
    #    print item
    # get all shop links from the search list
    shop_anchor_list = shop_lists.findAll('a', href=re.compile('/shop/(\d+)', re.I))
    shop_id_pattern = re.compile('/shop/(\d+)', re.I)
    for link in shop_anchor_list:
        m = shop_id_pattern.match(link['href'])
        shop_id = m.group(1)  # numeric shop id captured from the href
        print "shop id:%s" % shop_id
        crawl_dianping_shop(shop_id)
        print link['href']

    #get category
    #http://www.dianping.com/search/category/9/10/g473p2
    #http://www.dianping.com/search/category/9/10/g473r45/g10g473r45
    category_lists = soup.findAll("a", href=re.compile('/search/category/.+', re.I))
    for link in category_lists:
        url = "http://www.dianping.com" + link['href']
        # recurse into each category page; see the visited-set sketch below
        crawl_top_category_list(url)
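
crawl_top_category_list recurses into every /search/category/ link it finds, and category pages link back to one another, so the crawl above can revisit pages indefinitely. A sketch of one possible guard, not present in the original code, using a hypothetical module-level visited_urls set checked at the top of the function:

visited_urls = set()  # hypothetical: category URLs already crawled

def crawl_top_category_list(url):
    if url in visited_urls:  # already crawled: break the cycle
        return
    visited_urls.add(url)
    # ... body as above: download the page, crawl its shops, then
    # recurse into the /search/category/ links, which now hit this guard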
Example #3
    def __init__(self, user, pwd):
        self.BUY_SYSTEM_GOODS_PRICE_MAX = 0
        self.BUY_SYSTEM_GOODS_PRICE_MIN = 999999999
        self.STALL_GOODS_QUANTITY_MAX = 0
        self.BUY_USER_GOODS_PRICE_MAX = 0
        self.BUY_USER_GOODS_PRICE_MIN = 999999999
        self.STORE_GOODS_QUANTITY_MAX = 0
        self.MONEY_KEEP = 0

        self.URL_KAIXIN_HOME = "http://kaixin001.com/"
        self.URL_KAIXIN_LOGIN = '******'  # masked in the source listing

        self.downloader = DownloadManager()
        self.kaixindb = WebpageDB(user + '.db')
        self.load_settings(user + '.cfg')
        self.MY_STALL_ID = None
        self.MY_TOTAL_MONEY = None
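
A hedged instantiation sketch for the constructor above; the class name is not visible in this excerpt, so KaixinBot below is hypothetical. The constructor expects a matching <user>.cfg file and opens a <user>.db database:

bot = KaixinBot('someuser', 'somepass')  # hypothetical class name; loads someuser.cfg, opens someuser.db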
Example #4
import time
import json
import random
import re
import sys

from crawler.downloader import DownloadManager  # python-crawler
from crawler.webpage import WebPage             # python-crawler
from crawler.database import WebpageDB          # python-crawler

import lxml.html  # python-lxml
downloader = DownloadManager()
item_prices = {}

def login(user, pwd):
    # fetch the home page to obtain the login form
    url = "http://kaixin001.com/"
    error_msg, url, redirected_url, html = downloader.download(url)

    # fill the first form on the page with the credentials
    page = WebPage(url, html)
    action, fields = page.get_form(0)
    fields['email'] = user
    fields['password'] = pwd
    fields['remember'] = 0

    # submit the credentials to the login endpoint
    url = 'http://www.kaixin001.com/login/login.php'
    error_msg, url, redirected_url, html = downloader.download(url, fields)
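
A minimal usage sketch for the module above; the credentials are placeholders, and since login returns nothing, a caller would have to inspect the resulting page itself to confirm success:

if __name__ == '__main__':
    # hypothetical placeholder credentials
    login('[email protected]', 'secret')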