예제 #1
0
    def crawl(self):
        """Worker loop: drain the task queue, crawling and persisting one bag
        item per task.

        Exits when the queue is empty. Any other failure is handed to
        crawlRetry() and the task is still marked done so queue.join() can
        complete.
        """
        while True:
            _data = None
            try:
                try:
                    # Pop the next task from the queue.
                    _data = self.get_q()
                except Empty as e:
                    # Queue drained -- terminate this worker.
                    #print '# queue is empty', e
                    break

                _val = _data[1]
                item = BagItem(self.home_url, self.brand_type)
                item.antPage(_val)
                self.push_back(self.items, item.outItem())

                sql = item.outTuple()
                self.mysqlAccess.insert_item(sql)

                # Throttle between tasks.
                time.sleep(0.1)
                # Tell the queue this task is finished.
                self.queue.task_done()

            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()
                # Hand the failed task back for another attempt.
                self.crawlRetry(_data)
                # Still mark the task done so queue.join() can complete.
                self.queue.task_done()
                time.sleep(5)
예제 #2
0
 def checkPage(self, url, data):
     """Validate a fetched page body, raising on known site error pages.

     Raises Common.DenypageException for the site's 403 deny page and
     Common.NoPageException for the page-not-found template; returns None
     for a normal page.
     """
     # Error case 1: the site's 403 deny page.
     if re.search(r'<TITLE>403拒绝访问</TITLE>', data) is not None:
         raise Common.DenypageException("# Deny page: 403拒绝访问错误, url=%s" %url)
     # Error case 2: the page does not exist.
     if re.search(r'<div class=".+?">很抱歉,您查看的页面找不到了!</div>', data) is not None:
         raise Common.NoPageException("# No page: 很抱歉,您查看的页面找不到了!, url=%s" %url)
예제 #3
0
    def initItem(self):
        """Reset all crawl-state and item attributes to their defaults."""
        # Crawl bookkeeping
        self.crawling_time = Common.now()
        self.crawling_begintime = ''  # start time of this crawl run
        self.crawling_beginDate = ''  # date of this crawl run
        self.crawling_beginHour = ''  # hour of this crawl run

        # Item attributes
        self.item_id = ''  # item ID
        self.item_name = ''  # item name
        self.item_price = ''  # item price
        self.item_url = ''  # item link
        self.item_spuId = ''  # SPU ID
        self.item_sellCount = 0  # monthly sales count

        self.brand_name = ''
        self.brand_id = ''
        self.category_id = ''

        # Item page
        self.item_page = None  # item front page

        # item html urls
        self.item_urls = []  # list of item links

        # item html pages
        #self.item_pages      = []   # list of item pages
        self.item_pages = {}  # map of item pages

        # Deal (transaction) records
        self.deal_url = ''
        self.deal_stopCrawl = False
        self.deal_deadLine = 0.0  # latest deal time seen in the previous crawl
        self.deal_deadLine2 = 0.0  # earliest deal time seen in this crawl
예제 #4
0
 def outItemSql(self):
     """Return this item's fields as a tuple, ordered for the item-info
     SQL replace-insert (see insert_parser_item_info)."""
     return (Common.time_s(self.crawling_time), self.item_id,
             self.item_name, self.item_price, self.item_sellCount,
             self.item_url, self.seller_id, self.seller_name, self.shop_id,
             self.shop_name, self.shop_url, self.brand_id, self.brand_name,
             self.category_id, self.crawling_beginDate,
             self.crawling_beginHour)
예제 #5
0
파일: Item.py 프로젝트: xzhoutxd/tb
    def initItem(self):
        """Reset all crawl-state and item attributes to their defaults."""
        # Crawl bookkeeping
        self.crawling_time   = Common.now()
        self.crawling_begintime = '' # start time of this crawl run
        self.crawling_beginDate = '' # date of this crawl run
        self.crawling_beginHour = '' # hour of this crawl run

        # Item attributes
        self.item_id         = ''   # item ID
        self.item_name       = ''   # item name
        self.item_price      = ''   # item price
        self.item_url        = ''   # item link
        self.item_spuId      = ''   # SPU ID
        self.item_sellCount  = 0    # monthly sales count

        self.brand_name      = ''
        self.brand_id        = ''
        self.category_id     = ''

        # Item page
        self.item_page       = None # item front page

        # item html urls
        self.item_urls       = []   # list of item links

        # item html pages
        #self.item_pages      = []   # list of item pages
        self.item_pages      = {}   # map of item pages

        # Deal (transaction) records
        self.deal_url        = ''
        self.deal_stopCrawl  = False
        self.deal_deadLine   = 0.0  # latest deal time seen in the previous crawl
        self.deal_deadLine2  = 0.0  # earliest deal time seen in this crawl
예제 #6
0
파일: bagItem.py 프로젝트: xzhoutxd/brand
    def __init__(self, home_url, brand_type):
        """Set up the crawler, crawl timestamps and empty item fields.

        home_url   -- brand site base URL, used to absolutize item links
        brand_type -- brand identifier carried into the output record
        """
        # Crawl setup
        self.crawler = MyCrawler()

        self.crawling_time = Common.now()  # current crawl timestamp
        self.crawling_beginDate = time.strftime(
            "%Y-%m-%d", time.localtime(self.crawling_time))  # crawl date
        self.crawling_beginHour = time.strftime(
            "%H", time.localtime(self.crawling_time))  # crawl hour

        # Brand site base URL
        self.home_url = home_url

        # Brand type
        self.brand_type = brand_type

        self.serie_title = ''
        self.item_title = ''
        self.item_name = ''
        self.item_price = ''
        self.item_unit = ''
        self.item_size = ''
        self.item_url = ''
        self.item_img = ''
        self.item_number = ''
예제 #7
0
    def bagPage(self, url):
        """Crawl the women's handbag menu, paginate each tab and collect items.

        Items with a link are queued on self.link_list for later detail
        crawling; items without one are parsed immediately into self.items.
        """
        page = self.crawler.getData(url, self.home_url)
        if not page or page == '': return

        # Extract the level-3 tab list under the women's handbag mega-menu.
        tab_list_info = ''
        m = re.search(r'<li class="mega-menu-item">\s*<div.+?data-megaMenu="women".*?>\s*<span class="onlyML">手袋</span>.+?</div>\s*<div class="mega-menu-content">.+?<ul class="level3">(.+?)</ul>', page, flags=re.S)
        if m:
            tab_list_info = m.group(1).strip()

        # (tab name, absolute tab URL) pairs.
        tab_list = []
        p = re.compile(r'<li class="mm-block">.+?<a.+?href="(.+?)">\s*<p class="mm-push-p">(.+?)</p>\s*</a>.+?</li>', flags=re.S)
        for tab_info in p.finditer(tab_list_info):
            tab_list.append((tab_info.group(2).strip(),self.home_url+tab_info.group(1)))
            print tab_info.group(2).strip(),self.home_url+tab_info.group(1)

        i = 0
        for tab in tab_list:
            refers = url
            tab_name, tab_url = tab
            print '# tab:',tab_name, tab_url
            tab_page = self.crawler.getData(tab_url, refers)
            # Follow "next page" buttons until the last page; only the final
            # page's markup is parsed below.
            m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">',tab_page,flags=re.S)
            while m:
                refers = tab_url
                tab_url = re.sub(r'/to-\d+', '', tab_url)  + "/to-%s"%m.group(1)
                tab_page = self.crawler.getData(tab_url, refers)
                m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">',tab_page,flags=re.S)

            # Parse each product card: link, name, price markup.
            p = re.compile(r'<a id="sku_.*?" href="(.+?)".+?>.+?<div class="description">\s*<div class="productName toMinimize">(.+?)</div>\s*<div class="productPrice.+?data-htmlContent="(.+?)">\s*</div>\s*</div>', flags=re.S)
            for item in p.finditer(tab_page):
                i_url, i_name, s_price = item.group(1),item.group(2),item.group(3)
                print self.home_url+i_url, i_name, s_price
                # A yen sign in the raw price marks a CNY amount.
                i_unit = ""
                if s_price.find("¥") != -1:
                    i_unit = "CNY"
                i_price = s_price.replace('¥','').strip()

                if i_url and i_url != '':
                    if Common.isBag(i_name):
                        self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_price,i_unit))
                else:
                    # No link: build the item record directly.
                    # NOTE(review): this rebinds the counter `i` to a BagItem.
                    if Common.isBag(i_name):
                        i = BagItem(self.brand_type)
                        i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, '')
                        self.items.append(i.outItem())
예제 #8
0
파일: Item.py 프로젝트: xzhoutxd/tb
    def TMItem(self):
        if self.item_url != '':
            page = self.crawler.getData(self.item_url, self.refers)
            if not page or page == '':
                raise Common.InvalidPageException("# TMItem: not find item page,itemid:%s,item_url:%s"%(str(self.item_id), self.item_url))

            m = re.search(r'sellerId:"(\d+)",', page, flags=re.S)
            if m:
                self.seller_id = m.group(1)
            m = re.search(r'shopId:"(\d+)",', page, flags=re.S)
            if m:
                self.shop_id = m.group(1)
            m = re.search(r'<div class="slogo">\s*<a class="slogo-shopname" href="(.+?)".+?><strong>(.+?)</strong></a>', page, flags=re.S)
            if m:
                self.shop_url, self.shop_name = Common.fix_url(m.group(1)), m.group(2).strip()

            m = re.search(r'TShop\.Setup\((.+?)\);', page, flags=re.S)
            if m:
                TShop_s = m.group(1).strip()
                m = re.search(r'"brand":"(.+?)",', TShop_s, flags=re.S)
                if m:
                    self.brand_name = Common.htmlDecode(m.group(1).strip())
                m = re.search(r'"brandId":"(\d+)",', TShop_s, flags=re.S)
                if m:
                    self.brand_id = m.group(1)
                m = re.search(r'"categoryId":"(\d+)",', TShop_s, flags=re.S)
                if m:
                    self.category_id = m.group(1)
                m = re.search(r'"sellerNickName":"(.+?)",', TShop_s, flags=re.S)
                if m:
                    self.seller_name = Common.urlDecode(m.group(1).strip())

                m = re.search(r'"initApi":"(.+?)",', TShop_s, flags=re.S)
                if m:
                    ts = "&callback=setMdskip&timestamp=%s" % str(int(time.time()*1000))
                    initapi_url = Common.fix_url(m.group(1).strip()) + ts + "&ref=%s" % Common.urlCode(self.refers)
                    init_page = self.crawler.getData(initapi_url, self.item_url)
                    if not init_page and init_page == '':
                        print '# init page is null..'
                    else:
                        m = re.search(r'"sellCountDO":{"sellCount":(\d+),', init_page, flags=re.S)
                        if m:
                            self.item_sellCount = m.group(1)
예제 #9
0
파일: TMCrawler.py 프로젝트: xzhoutxd/tb
    def getPage(self, url, shop_home_url):
        """Walk every page of a shop's async item list, parse each item card
        and persist it via mysqlAccess.

        url           -- first shop list page
        shop_home_url -- shop home, used as initial referer and async base
        """
        position = 1
        i = 1
        max_page = 0

        asyn_url = ''
        i_url = url
        refers = shop_home_url
        result_s = self.get_asyn_data(i_url, refers, shop_home_url)
        # Total page count from the "current/total" pager widget.
        m = re.search(r'<b class="ui-page-s-len">\d+/(\d+)</b>',
                      result_s,
                      flags=re.S)
        if m:
            max_page = int(m.group(1))
        print '# page num:', max_page
        while i <= max_page:
            # Item-card region between the list container and the pager.
            m = re.search(
                r'<div class="J_TItems">(.+?)<div class="pagination">',
                result_s,
                flags=re.S)
            if m:
                items_s = m.group(1)
                p = re.compile(
                    r'<dl class=".+?".+?data-id="(.+?)">.+?<dd class="detail">\s*<a class="item-name".+?href="(.+?)".+?>(.+?)</a>\s*<div class="attribute">\s*<div class="cprice-area">\s*<span class="symbol">(.+?)</span>\s*<span\s*class="c-price">(.+?)</span>\s*</div>.+?</dl>'
                )
                j = 1
                for item in p.finditer(items_s):
                    item_id, url_s, item_name, price_symbol, price = item.group(
                        1), item.group(2), Common.htmlDecode(
                            item.group(3).strip()), item.group(
                                4).strip(), item.group(5).strip()
                    # Protocol-relative hrefs need an explicit scheme.
                    if url_s.find('http') == -1:
                        item_url = 'http:' + url_s
                    else:
                        item_url = url_s
                    print '### item ###'
                    print '# item val:', item_id, item_name, price, item_url
                    # NOTE(review): rebinds the regex-match loop var `item`.
                    item = Item()
                    item.parserTM((item_id, item_name, price, item_url, i_url,
                                   self.begin_time))
                    print '# item info:', item.outItemSql()
                    self.mysqlAccess.insert_parser_item_info(item.outItemSql())
                    time.sleep(2)

            # Build the next page URL by injecting/replacing pageNo.
            refers = i_url
            if i_url.find('pageNo=') == -1:
                i_url = re.sub(r'&tsearch=y',
                               '&pageNo=%d&tsearch=y#anchor' % i, refers)
            else:
                i_url = re.sub(r'&pageNo=\d+&', '&pageNo=%d&' % i, refers)

            i += 1
            time.sleep(2)
            result_s = self.get_asyn_data(i_url, refers, shop_home_url)
예제 #10
0
 def getPage(self, url):
     """Page through a taobao search result (up to 10 pages of 48 items),
     parse the embedded g_page_config JSON and persist every auction row.
     """
     position = 1
     i = 1

     i_url = url
     refers = self.home_url
     max_page = 10
     size_page = 48
     while i <= max_page:
         page = self.crawler.getData(i_url, refers)
         refers = i_url
         # Next page via the `s` offset parameter (items per page * page no).
         i_url = url + '&bcoffset=1&s=%s' % str(i*size_page)
         i += 1
         if not page or page == '':
             print 'not find data url:',i_url
             time.sleep(4)
             continue
         # Search results are embedded as a g_page_config JSON literal.
         m = re.search(r'<script>\s+g_page_config = ({.+?});.+?</script>', page, flags=re.S)
         if m:
             page_config = m.group(1)
             page_config_s = re.sub(r'\n+','',page_config)
             data = json.loads(page_config_s)
             if data.has_key("mods"):
                 if data["mods"].has_key("itemlist"):
                     itemlist = data["mods"]["itemlist"]
                     if itemlist.has_key("data"):
                         itemlist_data = itemlist["data"]
                         if itemlist_data.has_key("auctions"):
                             for item in itemlist_data["auctions"]:
                                 # Prefer the numeric id from detail_url,
                                 # fall back to the list position.
                                 item_id = position
                                 m = re.search(r'id=(\d+)', item["detail_url"], flags=re.S)
                                 if m:
                                     item_id = m.group(1)
                                 # Sales figure: first number in view_sales.
                                 item_sales = item["view_sales"]
                                 m = re.search(r'(\d+)', item["view_sales"], flags=re.S)
                                 if m:
                                     item_sales = m.group(1)
                                 print Common.time_s(Common.now()), position, item_id, item["raw_title"], item["view_price"], item_sales, item["user_id"], item["nick"], "http:" + item["detail_url"], "http:" + item["shopLink"]
                                 self.mysqlAccess.insert_item((Common.time_s(Common.now()), str(item_id), str(position), str(item["raw_title"]), str(item["view_price"]), str(item_sales), "http:" + item["detail_url"], item["user_id"], str(item["nick"]), "http:" + item["shopLink"]))
                                 position += 1
         time.sleep(4)
예제 #11
0
 def run_items(self, items_info, tab_name, tab_url):
     """Parse product cards out of *items_info* HTML and collect bag items.

     Cards with a link are queued on self.link_list; cards without one are
     turned into BagItem records appended to self.items.
     """
     p = re.compile(r'<.+?>\s*<a.+?>\s*<img.+?src="(.+?)".*?>\s*</a><a href="(.+?)">.+?<div class="infoDescription">(.+?)</div>\s*(<div class="infoPrice">.+?</div>).+?</a>\s*<.+?>', flags=re.S)
     for item in p.finditer(items_info):
         i_img, i_url, s_name, price_info = item.group(1),item.group(2),item.group(3),item.group(4)
         # Strip residual markup from the display name.
         i_name = re.sub(r'<.+?>','',s_name)
         i_price, i_unit = '', ''
         m = re.search(r'<div.+?class="newprice">(.+?)</div>', price_info, flags=re.S)
         if m:
             s_price = re.sub(r'<.+?>','',m.group(1))
             # A yen sign marks a CNY amount.
             if s_price.find("¥") != -1:
                 i_unit = "CNY"
             i_price = s_price.replace('¥','').strip()

         if i_url and i_url != '':
             if Common.isBag(i_name) or Common.isBag(unquote(i_url)):
                 print self.home_url+i_url, i_img, i_name, i_price, i_unit
                 self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_img,i_price,i_unit))
         else:
             # No link: build the record directly (rebinds `i` locally).
             if Common.isBag(i_name) or Common.isBag(unquote(i_url)):
                 print self.home_url+i_url, i_img, i_name, i_price, i_unit
                 i = BagItem(self.brand_type)
                 i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img)
                 self.items.append(i.outItem())
예제 #12
0
파일: TMCrawler.py 프로젝트: xzhoutxd/tb
 def get_asyn_data(self, i_url, refers, shop_home_url):
     """Fetch the shop's async item-list fragment and return its unescaped
     HTML, or '' when the async URL or jsonp payload cannot be found.
     """
     page = self.crawler.getData(i_url, refers)
     hit = re.search(r'<input id="J_ShopAsynSearchURL".+?value="(.+?)"\s*/>', page, flags=re.S)
     if not hit:
         return ''
     # Cache-busting timestamp plus jsonp callback, spliced in at the '?'.
     ts = '?_ksTS=%s&callback=jsonp135&' % (str(int(time.time()*1000)) + '_' + str(random.randint(100,999)))
     asyn_url = re.sub('\?', ts, shop_home_url + Common.htmlDecode(hit.group(1)))
     payload = self.crawler.getData(asyn_url, i_url)
     hit = re.search(r'jsonp135\("(.+?)"\)', payload, flags=re.S)
     if not hit:
         return ''
     # Undo the jsonp escaping of double quotes.
     return re.sub(r'\\"', '"', hit.group(1))
예제 #13
0
    def TMItem(self):
        if self.item_url != '':
            page = self.crawler.getData(self.item_url, self.refers)
            if not page or page == '':
                raise Common.InvalidPageException(
                    "# TMItem: not find item page,itemid:%s,item_url:%s" %
                    (str(self.item_id), self.item_url))

            m = re.search(r'sellerId:"(\d+)",', page, flags=re.S)
            if m:
                self.seller_id = m.group(1)
            m = re.search(r'shopId:"(\d+)",', page, flags=re.S)
            if m:
                self.shop_id = m.group(1)
            m = re.search(
                r'<div class="slogo">\s*<a class="slogo-shopname" href="(.+?)".+?><strong>(.+?)</strong></a>',
                page,
                flags=re.S)
            if m:
                self.shop_url, self.shop_name = Common.fix_url(
                    m.group(1)), m.group(2).strip()

            m = re.search(r'TShop\.Setup\((.+?)\);', page, flags=re.S)
            if m:
                TShop_s = m.group(1).strip()
                m = re.search(r'"brand":"(.+?)",', TShop_s, flags=re.S)
                if m:
                    self.brand_name = Common.htmlDecode(m.group(1).strip())
                m = re.search(r'"brandId":"(\d+)",', TShop_s, flags=re.S)
                if m:
                    self.brand_id = m.group(1)
                m = re.search(r'"categoryId":"(\d+)",', TShop_s, flags=re.S)
                if m:
                    self.category_id = m.group(1)
                m = re.search(r'"sellerNickName":"(.+?)",',
                              TShop_s,
                              flags=re.S)
                if m:
                    self.seller_name = Common.urlDecode(m.group(1).strip())

                m = re.search(r'"initApi":"(.+?)",', TShop_s, flags=re.S)
                if m:
                    ts = "&callback=setMdskip&timestamp=%s" % str(
                        int(time.time() * 1000))
                    initapi_url = Common.fix_url(m.group(1).strip(
                    )) + ts + "&ref=%s" % Common.urlCode(self.refers)
                    init_page = self.crawler.getData(initapi_url,
                                                     self.item_url)
                    if not init_page and init_page == '':
                        print '# init page is null..'
                    else:
                        m = re.search(r'"sellCountDO":{"sellCount":(\d+),',
                                      init_page,
                                      flags=re.S)
                        if m:
                            self.item_sellCount = m.group(1)
예제 #14
0
파일: TMCrawler.py 프로젝트: xzhoutxd/tb
 def get_asyn_data(self, i_url, refers, shop_home_url):
     """Fetch the shop's async item-list fragment and return its unescaped
     HTML, or '' when the async URL or jsonp payload cannot be found.
     """
     page = self.crawler.getData(i_url, refers)
     hit = re.search(r'<input id="J_ShopAsynSearchURL".+?value="(.+?)"\s*/>',
                     page,
                     flags=re.S)
     if not hit:
         return ''
     # Cache-busting timestamp plus jsonp callback, spliced in at the '?'.
     ts = '?_ksTS=%s&callback=jsonp135&' % (str(int(
         time.time() * 1000)) + '_' + str(random.randint(100, 999)))
     asyn_url = re.sub('\?', ts, shop_home_url + Common.htmlDecode(hit.group(1)))
     payload = self.crawler.getData(asyn_url, i_url)
     hit = re.search(r'jsonp135\("(.+?)"\)', payload, flags=re.S)
     if not hit:
         return ''
     # Undo the jsonp escaping of double quotes.
     return re.sub(r'\\"', '"', hit.group(1))
예제 #15
0
파일: TMCrawler.py 프로젝트: xzhoutxd/tb
    def __init__(self):
        """Set up the retrying crawler, DB access and empty result lists."""
        # Crawl setup
        #self.crawler     = MyCrawler()
        self.crawler = RetryCrawler()

        # db
        self.mysqlAccess = MysqlAccess()  # mysql access

        # Site base URL
        self.home_url = 'http://www.taobao.com'
        self.refers = None

        # Collected item links and parsed items
        self.link_list = []
        self.items = []

        self.begin_time = Common.now()
예제 #16
0
파일: TMCrawler.py 프로젝트: xzhoutxd/tb
    def __init__(self):
        """Set up the retrying crawler, DB access and empty result lists."""
        # Crawl setup
        #self.crawler     = MyCrawler()
        self.crawler     = RetryCrawler()

        # db
        self.mysqlAccess  = MysqlAccess() # mysql access

        # Site base URL
        self.home_url   = 'http://www.taobao.com'
        self.refers     = None

        # Collected item links and parsed items
        self.link_list  = []
        self.items      = []

        self.begin_time = Common.now()
예제 #17
0
    def bagPage(self, url):
        """Crawl the women's accessories menu, fetch each tab's ajax lookbook
        fragment and queue bag items on self.link_list.
        """
        page = self.crawler.getData(url, self.home_url)
        if not page or page == '': return

        # Tab list under the women's accessories level-4 menu.
        tab_list = []
        m = re.search(r'<p>女装配饰</p>\s+<ul class="menu-level-4">(.+?)</ul>', page, flags=re.S)
        if m:
            tabs_list_info = m.group(1)

            p = re.compile(r'<li class="submenu-item">\s+<a.+?href="(.+?)">\s+<span lang="fr" class="lang-fr">(.+?)</span>(.+?)</a>\s+</li>', flags=re.S)
            for tab in p.finditer(tabs_list_info):
                tab_list.append((tab.group(2)+tab.group(3).strip(),tab.group(1)))

        for tab in tab_list:
            tab_name,tab_url = tab
            print '# tab:',tab_name,tab_url
            tab_page = self.crawler.getData(tab_url, url)

            # The item list is served as an ajax HTML fragment.
            m = re.search(r'<div id="layer-1".+?>\s+<a href="(.+?)" class="layer-links">.+?</a>', tab_page, flags=re.S)
            if m:
                ajax_url = self.home_url + m.group(1) + "?ajax=true&fragment=true"
                ajax_data = self.crawler.getData(ajax_url, tab_url)

                if ajax_data:
                    #data = json.loads(ajax_data)
                    #if data and data.has_key("html"):
                    #    print data["html"].decode("unicode-escape")
                    # Unescape the JSON-embedded HTML payload.
                    r_data = ajax_data.decode("unicode-escape")
                    if r_data:
                        m = re.search(r'"html":"(.+?)"}', r_data, flags=re.S)
                        if m:
                            data_html = m.group(1).replace("\/","/")
                            #print data_html
                            #break
                            # One lookbook card per item: name, sku ref, link, image.
                            p = re.compile(r'<li class="lookbook-item line" data-idlook="\d+">\s+<div class="disp-n">.+?<div class="look-info article">\s+<p>(.+?)</p>.+?<p class="look-ref">(.+?)</p>.+?</div>.+?</div>\s+<a href="(.+?)".+?>.+?<img .+?data-src="(.+?)".*?/>\s+</li>', flags=re.S)
                            for item in p.finditer(data_html):
                                i_url, i_img, s_number, i_name = self.home_url+item.group(3), item.group(4), item.group(2), re.sub(r'<.+?>','',item.group(1)).strip()
                                i_number = ''
                                m = re.search(r'<span class="look-ref-sku">\s*<span.+?>(.+?)</span>\s*</span>', s_number, flags=re.S)
                                if m:
                                    i_number = m.group(1)
                                print i_url, i_img, i_name, i_number
                                if Common.isBag(i_name):
                                    self.link_list.append((tab_name, tab_url, i_name, i_url, i_img, i_number))
예제 #18
0
파일: bagItem.py 프로젝트: xzhoutxd/brand
    def __init__(self, home_url, brand_type):
        """Set up the crawler, crawl timestamps and empty item fields.

        home_url   -- brand site base URL, used to absolutize item links
        brand_type -- brand identifier carried into the output record
        """
        # Crawl setup
        self.crawler     = MyCrawler()

        self.crawling_time = Common.now() # current crawl timestamp
        self.crawling_beginDate = time.strftime("%Y-%m-%d", time.localtime(self.crawling_time)) # crawl date
        self.crawling_beginHour = time.strftime("%H", time.localtime(self.crawling_time)) # crawl hour

        # Brand site base URL
        self.home_url    = home_url

        # Brand type
        self.brand_type = brand_type

        self.serie_title = ''
        self.item_title  = ''
        self.item_name   = ''
        self.item_price  = ''
        self.item_unit   = ''
        self.item_size   = ''
        self.item_url    = ''
        self.item_img    = ''
        self.item_number = ''
예제 #19
0
    def getData(self, url, refers='', decode=True, terminal='1'):
        """Fetch *url* and return the (optionally transcoded) page body.

        refers   -- Referer header value
        decode   -- transcode the body from the detected charset to self.t_coder
        terminal -- '1' pc / '2' wap responses also get error-page checking
        Returns None for an empty or non-http url.
        """
        # when null url, exit function
        if not url or not re.search(r'http://', url):
            return None

        # To build header
        _header = self.buildHeader(refers, terminal)

        # To forge vip cookie
        _cookie = self.session_cookie if self.use_cookie else self.crawl_cookie

        # Open the connection and fetch the response
        r = self.crawler.session.get(url, headers=_header, cookies=_cookie, timeout=self.timeout)

        # Page body
        data = r.content

        # Record whether the request was redirected
        self.forward = (len(r.history) > 0) 

        # Track cookies handed out by the server
        if not self.use_cookie and len(r.cookies) > 0: self.crawl_cookie = Common.cookieJar2Dict(r.cookies)

        # Page charset from the response headers
        self.f_coder = self.charset(r.headers.get('content-type'))

        # Close the response
        r.close()

        # Normalize the page encoding to the target charset
        if decode and self.f_coder != self.t_coder: data = data.decode(self.f_coder,'ignore').encode(self.t_coder,'ignore')

        # Raise on known pc/wap error pages
        if terminal in ['1', '2']: self.checkPage(url, data)

        # Return the fetched body
        return data
예제 #20
0
파일: MysqlAccess.py 프로젝트: xzhoutxd/tb
 def insert_item(self, args):
     """Replace-insert one taobao search-item row; errors are logged, not raised."""
     try:
         sql = 'replace into nd_tb_parser_item(crawl_time,item_id,position,item_name,item_price,item_sale,item_url,seller_id,seller_name,shop_url) values(%s)' % Common.aggregate(10)
         self.db.execute(sql, args)
     except Exception, e:
         print '# insert tb item exception:', e
예제 #21
0
파일: MysqlAccess.py 프로젝트: xzhoutxd/tb
 def insert_parser_item_info(self, args):
     """Replace-insert one shop-item info row (16 fields, see Item.outItemSql);
     errors are logged, not raised."""
     try:
         sql = 'replace into nd_tb_parser_item_info(crawl_time,item_id,item_name,item_price,item_sale,item_url,seller_id,seller_name,shop_id,shop_name,shop_url,brand_id,brand_name,category_id,c_begindate,c_beginhour) values(%s)' % Common.aggregate(16)
         self.db.execute(sql, args)
     except Exception, e:
         print '# insert tb shop item exception:', e
예제 #22
0
 def insert_item(self, args):
     """Replace-insert one brand-item row (13 fields); errors are logged,
     not raised."""
     try:
         sql = 'replace into nd_brand_parser_item(crawl_time,brand_name,serie_title,item_type,item_name,item_price,item_unit,item_size,item_url,item_img_url,item_number,c_begindate,c_beginhour) values(%s)' % Common.aggregate(13)
         self.brand_db.execute(sql, args)
     except Exception, e:
         print '# insert brand item exception:', e
예제 #23
0
파일: TMCrawler.py 프로젝트: xzhoutxd/tb
    def getPage(self, url, shop_home_url):
        """Walk every page of a shop's async item list, parse each item card
        and persist it via mysqlAccess.

        url           -- first shop list page
        shop_home_url -- shop home, used as initial referer and async base
        """
        position = 1
        i = 1
        max_page = 0

        asyn_url = ''
        i_url = url
        refers = shop_home_url
        result_s = self.get_asyn_data(i_url, refers, shop_home_url)
        # Total page count from the "current/total" pager widget.
        m = re.search(r'<b class="ui-page-s-len">\d+/(\d+)</b>', result_s, flags=re.S) 
        if m:
            max_page = int(m.group(1))
        print '# page num:', max_page
        while i <= max_page:
            # Item-card region between the list container and the pager.
            m = re.search(r'<div class="J_TItems">(.+?)<div class="pagination">', result_s, flags=re.S)
            if m:
                items_s = m.group(1)
                p = re.compile(r'<dl class=".+?".+?data-id="(.+?)">.+?<dd class="detail">\s*<a class="item-name".+?href="(.+?)".+?>(.+?)</a>\s*<div class="attribute">\s*<div class="cprice-area">\s*<span class="symbol">(.+?)</span>\s*<span\s*class="c-price">(.+?)</span>\s*</div>.+?</dl>')
                j = 1
                for item in p.finditer(items_s):
                    item_id, url_s, item_name, price_symbol, price = item.group(1), item.group(2), Common.htmlDecode(item.group(3).strip()), item.group(4).strip(), item.group(5).strip()
                    # Protocol-relative hrefs need an explicit scheme.
                    if url_s.find('http') == -1:
                        item_url = 'http:' + url_s
                    else:
                        item_url = url_s
                    print '### item ###'
                    print '# item val:', item_id, item_name, price, item_url
                    # NOTE(review): rebinds the regex-match loop var `item`.
                    item = Item()
                    item.parserTM((item_id, item_name, price, item_url, i_url, self.begin_time))
                    print '# item info:',item.outItemSql()
                    self.mysqlAccess.insert_parser_item_info(item.outItemSql())
                    time.sleep(2)

            # Build the next page URL by injecting/replacing pageNo.
            refers = i_url
            if i_url.find('pageNo=') == -1:
                i_url = re.sub(r'&tsearch=y','&pageNo=%d&tsearch=y#anchor' % i, refers)
            else:
                i_url = re.sub(r'&pageNo=\d+&','&pageNo=%d&' % i, refers)

            i += 1
            time.sleep(2)
            result_s = self.get_asyn_data(i_url, refers, shop_home_url)
예제 #24
0
            r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, '', '', i_size, i_url, i_img,
                   i_number)
        print '# itemPage:', i.outItem()
        #self.items.append(i.outItem())
        #print '# itemPage :', serie_title, i_name, i_url, i_img, i_size

    def outItems(self, f):
        """Write the collected item lines to file *f*, prefixed with a
        pipe-delimited header row.

        NOTE: mutates self.items by inserting the header at index 0.
        """
        header = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as out:
            self.items.insert(0, header)
            out.write('\n'.join(self.items))


if __name__ == '__main__':
    # Crawl the Bottega Veneta women's handbag listing and dump the items
    # to a dated text file, printing start/end timestamps.
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = BottegavenetaBag()
    b_url = "http://www.bottegaveneta.com/wy/%E5%A5%B3%E5%A3%AB/onlineboutique/%E6%89%8B%E8%A2%8B"
    b.bagPage(b_url)
    b.bagItems()

    f = Config.dataPath + 'bottegaveneta_%s.txt' % Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
예제 #25
0
파일: bossBage.py 프로젝트: xzhoutxd/brand
        m = re.search(
            r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print '# itemPage:', i.outItem()
        #self.items.append(i.outItem())

    def outItems(self, f):
        """Write the collected item lines to file *f* with a pipe-delimited
        header row.

        NOTE: mutates self.items by inserting the header at index 0.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            self.items.insert(0, s)
            f_item.write('\n'.join(self.items))


if __name__ == '__main__':
    # Crawl the Hugo Boss bag listing and dump the items to a dated text
    # file, printing start/end timestamps.
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = BossBag()
    b_url = "http://store.hugoboss.cn/category.php?id=3835&form_nav"
    b.bagPage(b_url)
    b.bagItems()

    f = Config.dataPath + 'boss_%s.txt' % Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
예제 #26
0
                
    def bagItems(self):
        """Crawl all queued item links with a bounded worker pool and
        collect the parsed records into self.items.
        """
        #for link in self.link_list: self.itemPage(link)
        max_th = 10
        # One worker per queued link, capped at max_th.
        th_num = len(self.link_list)
        if th_num > max_th:
            th_num = max_th
        m_itemsObj = BagItemM(self.home_url, self.brand_type, th_num)
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def outItems(self, f):
        """Write the collected item lines to file *f* with a pipe-delimited
        header row.

        NOTE: mutates self.items by inserting the header at index 0.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            self.items.insert(0, s)
            f_item.write('\n'.join(self.items))

if __name__ == '__main__':
    # Crawl the Givenchy site and dump bag items to a dated text file,
    # printing start/end timestamps.
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = GivenchyBag()
    b_url = 'http://www.givenchy.com/cn/'
    b.bagPage(b_url)
    b.bagItems()

    f = Config.dataPath + 'givenchy_%s.txt' %Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
 
예제 #27
0
파일: armaniBag.py 프로젝트: xzhoutxd/brand
        i_number = ''
        m = re.search(
            r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>',
            page,
            flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem()
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        self.items.append(i.outItem)
        #print '# itemPage :', serie_title, i_name, i_price, i_unit, i_size, i_url, i_img

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))


if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = ArmaniBag()
    b.bagPage()
    b.bagItems()

    f = Config.dataPath + 'armani_%s.txt' % Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
예제 #28
0
파일: armaniBag.py 프로젝트: xzhoutxd/brand
            #i_size = "".join(size_str.split())
            i_size = re.sub(r'\s*','',size_str)
            print "".join(i_size.split())

        i_number = ''
        m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem()
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        self.items.append(i.outItem)    
        #print '# itemPage :', serie_title, i_name, i_price, i_unit, i_size, i_url, i_img

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))

if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = ArmaniBag()
    b.bagPage()
    b.bagItems()
    
    f = Config.dataPath + 'armani_%s.txt' %Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    
예제 #29
0
        self.crawler = MyCrawler()

        # 品牌官网链接
        self.home_url = 'http://www.mcmworldwide.com'
        self.women_url = self.home_url + '/en/women'
        self.bag_url = self.women_url + '/bags'
        self.backpack_url = self.women_url + '/backpacks'
        self.leather_url = self.women_url + '/small-leather-goods'
        self.refers = None

        # 抓取商品列表
        self.links = []
        self.items = []

    def bagPage(self):
        """Fetch the first page of the bag listing.

        Fixed: the original read ``self.bug_url`` — a typo for
        ``self.bag_url`` that raised AttributeError — and then never
        used the built URL, fetching the bare ``self.bag_url`` instead.
        The paging/sort fragment is now actually applied.
        """
        url = self.bag_url + '#start=0&sz=32&srule=New'
        page = self.crawler.getData(url, self.women_url)
        if not page or page == '': return


if __name__ == '__main__':
    b = ChanelBag()

    b_url = 'http://www.chanel.com/zh_CN/fashion/products/handbags/g.spring-summer-2015.c.15S.html'
    b.bagPage(b_url)
    b.bagItems()

    f = Config.dataPath + 'chanel_%s.txt' % Common.today_ss()
    print f
    b.outItems(f)
예제 #30
0
        m = re.search(r'<h2 class="sku reading-and-link-text">(.+?)</h2>', page, flags=re.S)
        if m:
            i_number = m.group(1).strip()
        else:
            m = re.search(r'<meta itemprop="identifier" content="sku:(.+?)"/>', page, flags=re.S)
            if m:
                i_number = m.group(1).strip()

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        print '# itemPage:',i.outItem()
        #self.items.append(i.outItem()) 

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))

if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = LouisvuittonBag()
    b_url = "http://www.louisvuitton.cn/zhs-cn/homepage"
    b.bagPage(b_url)
    b.bagItems()
    
    f = Config.dataPath + 'louisvuitton_%s.txt' %Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    
예제 #31
0
        m = re.search(r'<div class="itemDimensions">.+?<span class="dimensions">(.+?)</span></div>', page, flags=re.S)
        if m:
            i_size = m.group(1)

        i_number
        m = re.search(r'<div class="styleIdDescription">货号.+?<span.*?>(.+?)</span></div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        print '# itemPage:',i.outItem()
        #self.items.append(i.outItem()) 

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))

if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = YslBag()
    b_url = "http://www.ysl.com/wy/shop-product/%E5%A5%B3%E5%A3%AB"
    b.bagPage(b_url)
    b.bagItems()
    
    f = Config.dataPath + 'ysl_%s.txt' %Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
예제 #32
0
파일: MysqlAccess.py 프로젝트: xzhoutxd/tb
 def insert_parser_item_info(self, args):
     """Replace-insert one parsed item-info row into nd_tb_parser_item_info.

     args: 16-value sequence matching the column list in the SQL below.
     Failures are printed and swallowed (best-effort insert).
     """
     try:
         # Common.aggregate(16) presumably yields 16 comma-separated
         # %s placeholders — TODO confirm against Common's definition.
         sql = 'replace into nd_tb_parser_item_info(crawl_time,item_id,item_name,item_price,item_sale,item_url,seller_id,seller_name,shop_id,shop_name,shop_url,brand_id,brand_name,category_id,c_begindate,c_beginhour) values(%s)' % Common.aggregate(
             16)
         self.db.execute(sql, args)
     except Exception, e:
         # Python-2 except syntax; errors are deliberately only logged.
         print '# insert tb shop item exception:', e
예제 #33
0
                    m = re.search(r'"currency-symbol":"(.+?)"', data, flags=re.S)
                    if m: unit = m.group(1)
                if self.item_price != '':
                    if price: i_price += '-' + price
                else:
                    if price: i_price = price
                    if unit: i_unit  = unit
        
        i = BagItem(self.brand_type)
        i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        #print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))


if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = ChanelBag()
    b_url = 'http://www.chanel.com/zh_CN/fashion/products/handbags.html'
    b.bagPage(b_url)
    b.bagItems()
    
    f = Config.dataPath + 'chanel_%s.txt' %Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    
예제 #34
0
파일: MysqlAccess.py 프로젝트: xzhoutxd/tb
 def insert_item(self, args):
     """Replace-insert one crawled item row into nd_tb_parser_item.

     args: 10-value sequence matching the column list in the SQL below.
     Failures are printed and swallowed (best-effort insert).
     """
     try:
         # Common.aggregate(10) presumably yields 10 comma-separated
         # %s placeholders — TODO confirm against Common's definition.
         sql = 'replace into nd_tb_parser_item(crawl_time,item_id,position,item_name,item_price,item_sale,item_url,seller_id,seller_name,shop_url) values(%s)' % Common.aggregate(
             10)
         self.db.execute(sql, args)
     except Exception, e:
         # Python-2 except syntax; errors are deliberately only logged.
         print '# insert tb item exception:', e
예제 #35
0
    def bagItems(self):
        """Crawl every collected item link concurrently.

        Spawns at most 10 worker threads (fewer when there are fewer
        links), queues the links, runs the workers, and extends
        self.items with the results they produced.
        """
        max_th = 10
        # min() replaces the original if/else thread-count selection.
        m_itemsObj = BagItemM(self.home_url, self.brand_type,
                              min(max_th, len(self.link_list)))
        m_itemsObj.createthread()
        m_itemsObj.putItems(self.link_list)
        m_itemsObj.run()
        self.items.extend(m_itemsObj.items)

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))


if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = GivenchyBag()
    b_url = 'http://www.givenchy.com/cn/'
    b.bagPage(b_url)
    b.bagItems()

    f = Config.dataPath + 'givenchy_%s.txt' % Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
예제 #36
0
파일: bagItem.py 프로젝트: xzhoutxd/brand
 def outTuple(self):
     """Return this bag item's fields as a flat tuple for DB insertion."""
     fields = (
         Common.time_s(self.crawling_time),
         self.brand_type,
         self.serie_title,
         self.item_title,
         self.item_name,
         self.item_price,
         self.item_unit,
         self.item_size,
         self.item_url,
         self.item_img,
         self.item_number,
         self.crawling_beginDate,
         self.crawling_beginHour,
     )
     return fields
예제 #37
0
파일: bossBage.py 프로젝트: xzhoutxd/brand
            m = re.search(r'<span.+?>尺寸大小:</span>(.+?)</span>', page, flags=re.S)
            if m:
                i_size = re.sub(r'<.+?>','',m.group(1))

        i_number = ''
        m = re.search(r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        print '# itemPage:',i.outItem()
        #self.items.append(i.outItem()) 

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))

if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = BossBag()
    b_url = "http://store.hugoboss.cn/category.php?id=3835&form_nav"
    b.bagPage(b_url)
    b.bagItems()
    
    f = Config.dataPath + 'boss_%s.txt' %Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
예제 #38
0
        m = re.search(
            r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>',
            page,
            flags=re.S)
        if m:
            i_number = m.group(1).split(':')[1].strip()

        i = BagItem(self.brand_type)
        i.initItem('', item_title, i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print '# itemPage:', i.outItem()
        #self.items.append(i.outItem())

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))


if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = DolcegabbanaBag()
    b_url = "http://www.dolcegabbana.com.cn/cn/dolce-gabbana/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"
    b.bagPage(b_url)
    b.bagItems()

    f = Config.dataPath + 'dolcegabbana_%s.txt' % Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
예제 #39
0
파일: bagItem.py 프로젝트: xzhoutxd/brand
 def outTuple(self):
     """Return this bag item's fields as a flat tuple for DB insertion."""
     fields = (
         Common.time_s(self.crawling_time),
         self.brand_type,
         self.serie_title,
         self.item_title,
         self.item_name,
         self.item_price,
         self.item_unit,
         self.item_size,
         self.item_url,
         self.item_img,
         self.item_number,
         self.crawling_beginDate,
         self.crawling_beginHour,
     )
     return fields
예제 #40
0
            i_size += m.group(1) + ":" + m.group(2) + ";"

        i_number
        m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S)
        if m:
            i_number = m.group(1)

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, '', '', i_size, i_url, i_img, i_number)
        print '# itemPage:',i.outItem()
        #self.items.append(i.outItem())    
        #print '# itemPage :', serie_title, i_name, i_url, i_img, i_size

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))

if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = BottegavenetaBag()
    b_url = "http://www.bottegaveneta.com/wy/%E5%A5%B3%E5%A3%AB/onlineboutique/%E6%89%8B%E8%A2%8B"
    b.bagPage(b_url)
    b.bagItems()
    
    f = Config.dataPath + 'bottegaveneta_%s.txt' %Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    
예제 #41
0
            r'<div class="columns-wrapper">.+?<div class="column">.*?<div class="reference">\s*<p>(.+?)</p>\s*</div>',
            page,
            flags=re.S)
        if m:
            s_number = m.group(1)
            i_number = s_number.split('-')[1].strip()

        i = BagItem()
        i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size,
                   i_url, i_img, i_number)
        self.items.append(i.outItem)
        print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))


if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = DiorBag()
    b_url = 'http://www.dior.cn/couture/zh_cn/%E5%A5%B3%E5%A3%AB%E6%97%B6%E8%A3%85/%E7%9A%AE%E5%85%B7'
    b.bagPage(b_url)
    b.bagItems()

    f = Config.dataPath + 'dior_%s.txt' % Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
예제 #42
0
    def bagPage(self, url):
        """Crawl the women's-accessories menu and each tab's lookbook.

        Fetches the landing page, extracts the level-4 menu tabs under
        the '女装配饰' (women's accessories) section, and for every tab
        pulls an AJAX lookbook fragment; items whose name passes
        Common.isBag are appended to self.link_list as
        (tab_name, tab_url, name, url, img, number) tuples.
        """
        page = self.crawler.getData(url, self.home_url)
        if not page or page == '': return

        # (tab display name, tab url) pairs harvested from the menu.
        tab_list = []
        m = re.search(r'<p>女装配饰</p>\s+<ul class="menu-level-4">(.+?)</ul>',
                      page,
                      flags=re.S)
        if m:
            tabs_list_info = m.group(1)

            p = re.compile(
                r'<li class="submenu-item">\s+<a.+?href="(.+?)">\s+<span lang="fr" class="lang-fr">(.+?)</span>(.+?)</a>\s+</li>',
                flags=re.S)
            for tab in p.finditer(tabs_list_info):
                # Display name is the French span plus the trailing text.
                tab_list.append(
                    (tab.group(2) + tab.group(3).strip(), tab.group(1)))

        for tab in tab_list:
            tab_name, tab_url = tab
            print '# tab:', tab_name, tab_url
            tab_page = self.crawler.getData(tab_url, url)

            # The first layer link on the tab page is the AJAX endpoint
            # that serves the lookbook fragment.
            m = re.search(
                r'<div id="layer-1".+?>\s+<a href="(.+?)" class="layer-links">.+?</a>',
                tab_page,
                flags=re.S)
            if m:
                ajax_url = self.home_url + m.group(
                    1) + "?ajax=true&fragment=true"
                ajax_data = self.crawler.getData(ajax_url, tab_url)

                if ajax_data:
                    #data = json.loads(ajax_data)
                    #if data and data.has_key("html"):
                    #    print data["html"].decode("unicode-escape")
                    # Response is JSON-wrapped, unicode-escaped HTML; it is
                    # unescaped here and mined with a regex instead of
                    # json.loads (see commented-out attempt above).
                    r_data = ajax_data.decode("unicode-escape")
                    if r_data:
                        m = re.search(r'"html":"(.+?)"}', r_data, flags=re.S)
                        if m:
                            data_html = m.group(1).replace("\/", "/")
                            #print data_html
                            #break
                            p = re.compile(
                                r'<li class="lookbook-item line" data-idlook="\d+">\s+<div class="disp-n">.+?<div class="look-info article">\s+<p>(.+?)</p>.+?<p class="look-ref">(.+?)</p>.+?</div>.+?</div>\s+<a href="(.+?)".+?>.+?<img .+?data-src="(.+?)".*?/>\s+</li>',
                                flags=re.S)
                            for item in p.finditer(data_html):
                                # Strip any markup out of the item name.
                                i_url, i_img, s_number, i_name = self.home_url + item.group(
                                    3), item.group(4), item.group(2), re.sub(
                                        r'<.+?>', '', item.group(1)).strip()
                                i_number = ''
                                m = re.search(
                                    r'<span class="look-ref-sku">\s*<span.+?>(.+?)</span>\s*</span>',
                                    s_number,
                                    flags=re.S)
                                if m:
                                    i_number = m.group(1)
                                print i_url, i_img, i_name, i_number
                                # NOTE(review): presumably keeps only
                                # bag-like product names — confirm
                                # Common.isBag's semantics.
                                if Common.isBag(i_name):
                                    self.link_list.append(
                                        (tab_name, tab_url, i_name, i_url,
                                         i_img, i_number))
예제 #43
0
파일: Item.py 프로젝트: xzhoutxd/tb
 def outItemSql(self):
     """Return this item's fields as a flat tuple for DB insertion."""
     row = (
         Common.time_s(self.crawling_time),
         self.item_id,
         self.item_name,
         self.item_price,
         self.item_sellCount,
         self.item_url,
         self.seller_id,
         self.seller_name,
         self.shop_id,
         self.shop_name,
         self.shop_url,
         self.brand_id,
         self.brand_name,
         self.category_id,
         self.crawling_beginDate,
         self.crawling_beginHour,
     )
     return row
예제 #44
0
        i_number = ''
        m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>',
                      page,
                      flags=re.S)
        if m:
            i_size, i_number = m.group(1).strip(), m.group(2).strip()

        i = BagItem(self.brand_type)
        i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url,
                   i_img, i_number)
        print '# itemPage:', i.outItem()
        #self.items.append(i.outItem())

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))


if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = FerragamoBag()
    b_url = "http://www.ferragamo.cn/woman/handbags/"
    b.bagPage(b_url)
    b.bagItems()

    f = Config.dataPath + 'ferragamo_%s.txt' % Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
예제 #45
0
            i_img = m.group(1)

        i_size = ""
        i_number = ""
        m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>', page, flags=re.S)
        if m:
            i_size, i_number = m.group(1).strip(), m.group(2).strip()

        i = BagItem(self.brand_type)
        i.initItem(serie_title, "", i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        print "# itemPage:", i.outItem()
        # self.items.append(i.outItem())

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = "#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号"
        with open(f, "w") as f_item:
            f_item.write("\n".join([s] + self.items))


if __name__ == "__main__":
    print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
    b = FerragamoBag()
    b_url = "http://www.ferragamo.cn/woman/handbags/"
    b.bagPage(b_url)
    b.bagItems()

    f = Config.dataPath + "ferragamo_%s.txt" % Common.today_ss()
    b.outItems(f)
    print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
예제 #46
0
            m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺码.+?)</li>', page, flags=re.S)
            if m:
                i_size = m.group(1)

        i_number = ''
        m = re.search(r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>', page, flags=re.S)
        if m:
            i_number = m.group(1).split(':')[1].strip()

        i = BagItem(self.brand_type)
        i.initItem('', item_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        print '# itemPage:',i.outItem()
        #self.items.append(i.outItem()) 

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))

if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = DolcegabbanaBag()
    b_url = "http://www.dolcegabbana.com.cn/cn/dolce-gabbana/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"
    b.bagPage(b_url)
    b.bagItems()
    
    f = Config.dataPath + 'dolcegabbana_%s.txt' %Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
예제 #47
0
파일: diorBag.py 프로젝트: xzhoutxd/brand
                if m: i_size = m.group(1).strip()

        i_number = ''
        m = re.search(r'<div class="columns-wrapper">.+?<div class="column">.*?<div class="reference">\s*<p>(.+?)</p>\s*</div>', page, flags=re.S)
        if m:
            s_number = m.group(1)
            i_number = s_number.split('-')[1].strip()
                
        i = BagItem()
        i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
        self.items.append(i.outItem)    
        print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img

    def outItems(self, f):
        """Write collected items to file *f*, one per line, after a
        '#'-prefixed header row describing the field order.

        Fixed: the original inserted the header into self.items itself,
        so a second call duplicated the header and the item list was
        permanently mutated.
        """
        s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
        with open(f, 'w') as f_item:
            f_item.write('\n'.join([s] + self.items))

if __name__ == '__main__':
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = DiorBag()
    b_url = 'http://www.dior.cn/couture/zh_cn/%E5%A5%B3%E5%A3%AB%E6%97%B6%E8%A3%85/%E7%9A%AE%E5%85%B7'
    b.bagPage(b_url)
    b.bagItems()
    
    f = Config.dataPath + 'dior_%s.txt' %Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))