예제 #1
0
파일: Item.py 프로젝트: xzhoutxd/tb
    def initItem(self):
        """Initialise all crawl, item and deal-record attributes to their starting values."""
        # Crawl settings: the crawl timestamp is taken now; the start time,
        # date and hour of this crawl begin as empty strings.
        self.crawling_time = Common.now()
        self.crawling_begintime = self.crawling_beginDate = self.crawling_beginHour = ''

        # Item attributes: ID, name, price, link and SPU ID.
        self.item_id = self.item_name = self.item_price = ''
        self.item_url = self.item_spuId = ''
        self.item_sellCount = 0  # monthly sales count

        # Brand and category identifiers.
        self.brand_name = self.brand_id = self.category_id = ''

        # Item main page.
        self.item_page = None

        # Item links (list) and item web pages (dict; an earlier version
        # of this code used a list here).
        self.item_urls = []
        self.item_pages = {}

        # Deal records.
        self.deal_url = ''
        self.deal_stopCrawl = False
        # deal_deadLine:  latest deal-record time from the previous crawl
        # deal_deadLine2: earliest deal-record time from this crawl
        self.deal_deadLine = self.deal_deadLine2 = 0.0
예제 #2
0
파일: bagItem.py 프로젝트: xzhoutxd/brand
    def __init__(self, home_url, brand_type):
        """Create the crawler, stamp the crawl time and blank every item field.

        Args:
            home_url: link to the brand's official site; stored as-is.
            brand_type: brand type value; stored as-is.
        """
        # Crawl settings.
        self.crawler = MyCrawler()

        # Current crawl time, plus the local date and hour derived from it.
        self.crawling_time = Common.now()
        crawl_local = time.localtime(self.crawling_time)
        self.crawling_beginDate = time.strftime("%Y-%m-%d", crawl_local)
        self.crawling_beginHour = time.strftime("%H", crawl_local)

        # Brand official-site link and brand type.
        self.home_url = home_url
        self.brand_type = brand_type

        # Series / item fields, all empty until populated.
        self.serie_title = ''
        self.item_title = ''
        self.item_name = ''
        self.item_price = ''
        self.item_unit = ''
        self.item_size = ''
        self.item_url = ''
        self.item_img = ''
        self.item_number = ''
예제 #3
0
    def initItem(self):
        """Give every crawl, item and deal-record attribute its initial value.

        Only ``crawling_time`` receives a computed value (``Common.now()``);
        everything else starts out empty, zero, ``None`` or ``False``.
        """
        # ---- crawl settings ----
        self.crawling_time = Common.now()
        # start time of this crawl
        self.crawling_begintime = ''
        # date of this crawl
        self.crawling_beginDate = ''
        # hour of this crawl
        self.crawling_beginHour = ''

        # ---- item attributes ----
        # item ID
        self.item_id = ''
        # item name
        self.item_name = ''
        # item price
        self.item_price = ''
        # item link
        self.item_url = ''
        # SPU ID
        self.item_spuId = ''
        # monthly sales count
        self.item_sellCount = 0

        # ---- brand / category ----
        self.brand_name = ''
        self.brand_id = ''
        self.category_id = ''

        # ---- pages ----
        # item main page
        self.item_page = None
        # list of item links
        self.item_urls = list()
        # item web pages, held in a dict
        self.item_pages = dict()

        # ---- deal records ----
        self.deal_url = ''
        self.deal_stopCrawl = False
        # latest deal-record time from the previous crawl
        self.deal_deadLine = 0.0
        # earliest deal-record time from this crawl
        self.deal_deadLine2 = 0.0
예제 #4
0
 def getPage(self, url):
     position = 1
     i = 1
    
     i_url = url
     refers = self.home_url
     max_page = 10
     size_page = 48
     while i <= max_page:
         page = self.crawler.getData(i_url, refers)
         refers = i_url
         i_url = url + '&bcoffset=1&s=%s' % str(i*size_page)
         i += 1
         if not page or page == '':
             print 'not find data url:',i_url
             time.sleep(4)
             continue
         m = re.search(r'<script>\s+g_page_config = ({.+?});.+?</script>', page, flags=re.S)
         if m:
             page_config = m.group(1)
             page_config_s = re.sub(r'\n+','',page_config)
             data = json.loads(page_config_s)
             if data.has_key("mods"):
                 if data["mods"].has_key("itemlist"):
                     itemlist = data["mods"]["itemlist"]
                     if itemlist.has_key("data"):
                         itemlist_data = itemlist["data"]
                         if itemlist_data.has_key("auctions"):
                             for item in itemlist_data["auctions"]:
                                 item_id = position
                                 m = re.search(r'id=(\d+)', item["detail_url"], flags=re.S)
                                 if m:
                                     item_id = m.group(1)
                                 item_sales = item["view_sales"]
                                 m = re.search(r'(\d+)', item["view_sales"], flags=re.S)
                                 if m:
                                     item_sales = m.group(1)
                                 print Common.time_s(Common.now()), position, item_id, item["raw_title"], item["view_price"], item_sales, item["user_id"], item["nick"], "http:" + item["detail_url"], "http:" + item["shopLink"]
                                 self.mysqlAccess.insert_item((Common.time_s(Common.now()), str(item_id), str(position), str(item["raw_title"]), str(item["view_price"]), str(item_sales), "http:" + item["detail_url"], item["user_id"], str(item["nick"]), "http:" + item["shopLink"]))
                                 position += 1
         time.sleep(4)
예제 #5
0
파일: TMCrawler.py 프로젝트: xzhoutxd/tb
    def __init__(self):
        """Create the crawler and MySQL helpers and start with empty crawl lists."""
        # Crawl settings: RetryCrawler is used here (MyCrawler was used
        # by an earlier version of this code).
        self.crawler = RetryCrawler()

        # MySQL access.
        self.mysqlAccess = MysqlAccess()

        # Official-site link.
        self.home_url = 'http://www.taobao.com'
        self.refers = None

        # Items to crawl.
        self.link_list, self.items = [], []

        self.begin_time = Common.now()
예제 #6
0
파일: TMCrawler.py 프로젝트: xzhoutxd/tb
    def __init__(self):
        """Build the helper objects and initialise the crawl bookkeeping.

        ``self.crawler`` is a ``RetryCrawler`` and ``self.mysqlAccess`` a
        ``MysqlAccess``; the link and item lists start empty and
        ``self.begin_time`` is taken from ``Common.now()``.
        """
        # -- crawl settings --
        self.crawler = RetryCrawler()

        # -- db --
        self.mysqlAccess = MysqlAccess()

        # -- official-site link --
        self.home_url = 'http://www.taobao.com'
        self.refers = None

        # -- item list to crawl --
        self.link_list = list()
        self.items = list()

        self.begin_time = Common.now()
예제 #7
0
파일: bagItem.py 프로젝트: xzhoutxd/brand
    def __init__(self, home_url, brand_type):
        """Prepare a crawler, record when the crawl happens and clear the item fields.

        Args:
            home_url: link to the brand's official site.
            brand_type: brand type value.
        """
        # Crawl settings.
        self.crawler = MyCrawler()

        # Current crawl time and the local date / hour it falls in.
        self.crawling_time = Common.now()
        date_fmt, hour_fmt = "%Y-%m-%d", "%H"
        self.crawling_beginDate = time.strftime(date_fmt, time.localtime(self.crawling_time))
        self.crawling_beginHour = time.strftime(hour_fmt, time.localtime(self.crawling_time))

        # Brand official-site link.
        self.home_url = home_url

        # Brand type.
        self.brand_type = brand_type

        # Series title and item fields all start as empty strings.
        self.serie_title = self.item_title = self.item_name = ''
        self.item_price = self.item_unit = self.item_size = ''
        self.item_url = self.item_img = self.item_number = ''