Example #1
 def crawl(self):
     url = "http://mobile.gome.com.cn/mobile/product/allCategorys.jsp"
     jsons = ProcessData.get_json_data(url)
     if jsons == {}:
         return {}
     category1 = jsons['firstLevelCategories']
     for first_item in category1:
         name1 = first_item['goodsTypeName']  # first-level category name
         try:
             category2 = first_item['goodsTypeList']
         except:
             pass
         for second_item in category2:
             name2 = second_item['goodsTypeName']
             try:
                 category3 = second_item['goodsTypeList']
             except:
                 pass
             for third_item in category3:
                 try:
                     third_id = third_item['goodsTypeId']
                     name3 = third_item['goodsTypeLongName']
                 except:
                     pass
                 priorcategory = []
                 priorcategory.append(name1)
                 priorcategory.append(name2)
                 priorcategory.append(name3)
                 data = {'priorcategory': priorcategory}
                 # if name3 != u"冰箱" and name3 != u"空调":
                 #     continue
                 Scheduler.schedule(ListCrawler.type,
                                    key=third_id,
                                    data=data,
                                    interval=86400)
Example #2
    def run(self):
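        # Worker loop: pull a task from the queue, build the matching crawler via
        # Crawler().create, run it, log timing, and report the outcome to Scheduler.finish.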
        while True:
            if not self.runValue.value:
                print "%s stops" % self.name
                break
            self.signalget()
            start_t = time.time()
            self.ctask = self.taskqueue.get()
            if self.ctask.empty:
                time.sleep(10)
                continue
            end_t = time.time()
            self.log_wait_task(end_t - start_t)
            self.log_get_task()
            start_t = time.time()

            c = Crawler().create(self.ctask.type, self.ctask.key, self.ctask.data)
            if c:
                try:
                    c.crawl()
                    success = True
                    logger.info("CRAWL SUCCEED - <%s> %s" % (self.taskqueue.queueid, c))
                    end_t = time.time()
                    self.log_done_task(end_t - start_t)
                except Exception:
                    msg = get_exception_info()
                    success = False
                    logger.error("CRAWL FAILED - <%s> %s, %s" % (self.taskqueue.queueid, c, msg))
            else:
                logger.error("CRAWL FAILED - <%s> %s" % (self.taskqueue.queueid, self.ctask))
                success = False

            Scheduler.finish(self.ctask.type, self.ctask.key, c.data if c else {}, success)
Example #3
    def init(conf=None):

        from xlutils.copy import copy
        import xlrd
        import os

        SRC_PATH = os.path.dirname(__file__)

        bk = xlrd.open_workbook(os.path.join(SRC_PATH,
                                             "../../file/weixin.xls"))
        sh = bk.sheet_by_name('Sheet1')
        nrows = sh.nrows
        ncols = sh.ncols
        for i in xrange(1, nrows):
            data = {}
            data = {
                'publisher': sh.cell_value(i, 0).strip(),
                'province': sh.cell_value(i, 1).strip(),
                'city': sh.cell_value(i, 2).strip(),
                'district': sh.cell_value(i, 3).strip()
            }
            key = sh.cell_value(i, 6).strip()
            Scheduler.schedule(FirstCrawler.type,
                               key=key,
                               data=data,
                               interval=28800,
                               reset=True)
Example #4
    def crawl(self):
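        # Detail crawl: combine the JSON status/is_Bbc fields with the parsed HTML info,
        # export an EcDetailModel, then schedule the comment crawler for the same key.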
        json_data = ProcessData.get_json_data(self.get_json_url(self.key))
        is_Bbc = self.get_is_Bbc(json_data)
        status = self.get_status(json_data)
        response = self.get_response(self.key)
        tree = etree.HTML(response.text)
        info = self.get_info(tree)
        crawl_data = {
            "source": self.data["source"],
            "source_id": self.key,
            "status": status,
            "comment": {
                "is_Bbc": is_Bbc,
            },
        }
        crawl_data.update(info)
        crawl_data.update(extract_category(self))
        crawl_data.update(get_ctime())
        model = EcDetailModel(crawl_data)
        export(model)

        comment_data = {
            "uuid": model["id"],
            "status": model["status"],
            "version": model["version"],
            "series": model["series"],
            "brand": model["brand"],
            "is_Bbc": model["comment"]["is_Bbc"],
        }
        Scheduler.schedule(CommentCrawler.type,
                           key=self.key,
                           data=comment_data)
Example #5
 def init(conf=None):
     from xlutils.copy import copy
     import xlrd
     import os
     SRC_PATH = os.path.dirname(__file__)
     bk = xlrd.open_workbook(os.path.join(SRC_PATH,
                              "../../file/newyuqing.xls"))
     sh = bk.sheet_by_name('Sheet1')
     nrows = sh.nrows
     ncols = sh.ncols
     for i in range(1,nrows):
         data = {}
         types = sh.cell_value(i,1).strip()
         province = sh.cell_value(i,2).strip()
         city = sh.cell_value(i,3).strip()
         district = sh.cell_value(i,4).strip()
         data = {
             'type': types,
             'province': province,
             'city': city,
             'district': district,
             'publisher': (province+city+district+types)
         }
         key =  sh.cell_value(i,5).strip()    
         # print data['publisher'].encode('utf-8') 
         if key == '':
             continue
         Scheduler.schedule(FirstCrawler.type ,key=key, 
                             data=data, interval=14800, reset=True)
Example #6
    def crawl(self):
        global COOKIE
        category_data = extract_category(self)
        response = self.get_response(self.key)
        if COOKIE != response.headers.get("set-cookie", ""):
            COOKIE = response.headers.get("set-cookie", "")
        tree = etree.HTML(response.text)
        info = self.get_info(tree)

        crawl_data = {
            'source': "amazon",
            'source_id': self.key,
            'status': 1,
        }

        crawl_data.update(info)
        crawl_data.update(category_data)
        crawl_data.update(get_ctime())
        model = EcDetailModel(crawl_data)
        export(model)
        comment_data = {
            "uuid": model["id"],
            "brand": model["brand"],
            "version": model["version"],
            "series": model["series"],
            "is_Bbc": model["comment"]["is_Bbc"],
            'status': model["status"],
        }
        Scheduler.schedule(CommentCrawler.type,
                           key=self.key,
                           data=comment_data)
Example #7
    def crawl(self):
        # fid = '1662'
        # priorcategory = ["家居家装","清洁用品","衣物清洁"]
        # presentcategory = ['1','2','3']
        fid = self.key
        category_data = extract_category(self)

        count = 3 # page count, initial value 3
        pages = 1 # start from the first page

        while pages <= count:
            url = self.get_url(fid,pages)
            try:
                jsons = ProcessData.get_json_data(url)
                if pages==1 : count = math.ceil(int(jsons['wareCount'])/100)
                lists = jsons['wareInfo']
            except Exception,e:
                self.logger.error(url)
                self.logger.error(e)
                print 'error ',url
                return
            if lists == []:
                return {}
            for i in range(len(lists)):
                ids = uuid.uuid1() # cassandra primary key
                wareId = lists[i]['wareId']

                try:
                    f = lambda x: int(x[:-1])/100.00
                    ecsumscores = float(f(lists[i]['good'])) # overall product rating
                except:
                    ecsumscores = 0

                crawl_data = {
                    # 'id': uuid.uuid1(),
                    'source_id': wareId,
                    'source': self.data.get('source'),
                    'summary': {},
                    'title': lists[i]['wname'],
                    'adword': lists[i]['adword'],
                    'price': float(lists[i]['jdPrice']),
                    'original_price': float(lists[i]['martPrice']),
                    'score': ecsumscores
                }
                crawl_data.update(category_data)
                data = {
                    # 'uuid': ids,
                    'priorcategory': self.data['priorcategory'],
                    'presentcategory': self.data['priorcategory']
#                    'presentcategory': self.data['presentcategory']
                }

                model = EcBasicModel(crawl_data)
                export(model)
                data["uuid"] = model["id"]
                Scheduler.schedule(DetailCrawler.type, key=wareId, data=data)
                Scheduler.schedule(CommentCrawler.type, key=wareId, data=data)


            pages += 1
Example #8
 def init(conf=None):
     from xlutils.copy import copy
     import xlrd
     import os
     SRC_PATH = os.path.dirname(__file__)
     bk = xlrd.open_workbook(
         os.path.join(SRC_PATH, "../../file/newyuqing.xls"))
     sh = bk.sheet_by_name('Sheet1')
     nrows = sh.nrows
     ncols = sh.ncols
     for i in range(1, nrows):
         data = {}
         types = sh.cell_value(i, 1).strip()
         province = sh.cell_value(i, 2).strip()
         city = sh.cell_value(i, 3).strip()
         district = sh.cell_value(i, 4).strip()
         data = {
             'type': types,
             'province': province,
             'city': city,
             'district': district,
             'publisher': (province + city + district + types)
         }
         key = sh.cell_value(i, 5).strip()
         # print data['publisher'].encode('utf-8')
         if key == '':
             continue
         Scheduler.schedule(FirstCrawler.type,
                            key=key,
                            data=data,
                            interval=14800,
                            reset=True)
Example #9
    def crawler_data(self,tree):
        category_data = extract_category(self)

        XPATH = self.search_list_xpath
        if len(tree.xpath(XPATH('list'))) == 0:
            XPATH = self.product_list_xpath
        dom = tree.xpath(XPATH('list'))
        for item in dom:
            crawl_data = {}
            craw = [
                'title','adword',
                'price','original_price',
                'source_id','score',
            ]

            for value in craw: 
                crawl_data[value] = self.mackining(item.xpath(XPATH(value)))
            crawl_data['price'] = float(crawl_data['price'])
            try:
                f = lambda x: int(x[:-1])/100.00
                crawl_data['score'] = float(f(crawl_data['score']))
            except:
                crawl_data['score'] = 0
            crawl_data.update(category_data)
            crawl_data['source'] = 'yhd'
            model = EcBasicModel(crawl_data)
            export(model)
            data = {
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']           
            }            
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=str(self.key), data=data)
Example #10
    def crawl(self):
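        # Page through the category's goods list; unseen goods are handed to the
        # DetailCrawler, while known goods are re-exported as refreshed EcBasicModel records.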
        catId = str(self.key)

        category_data = extract_category(self)
        totalpage = self.get_page(catId)
        if totalpage == 0:
            return {}
        for i in range(1, totalpage + 1):
            url = self.get_url(catId, i)
            jsons = ProcessData.get_json_data(url)
            try:
                goodsList = jsons['goodsList']
            except Exception, e:
                self.logger.error(url)
                self.logger.error(e)
                print "get goodsList fail"

            for j in range(len(goodsList)):
                goods = goodsList[j]
                goodsNo = goods['goodsNo']
                goodsName = goods['goodsName']
                skuID = goods['skuID']

                goods_find = self.has_goods(goodsNo)
                if not goods_find:
                    data = {
                        'priorcategory': self.data['priorcategory'],
                        'skuID': skuID,
                    }
                    Scheduler.schedule(DetailCrawler.type,
                                       key=goodsNo,
                                       data=data)
                    continue
                adword = self.extract_adword(goods['ad'])
                crawl_data = {
                    'id': goods_find['uuid'],
                    'source_id': goodsNo,
                    'source': self.data.get('source'),
                    'title': goods['goodsName'],
                    'adword': adword,
                    'status': goods_find['status'],
                    'price': float(goods['lowestSalePrice']),
                    'brand': goods_find['brand'],
                    'version': goods_find['version'],
                    'series': goods_find['series'],
                    'comment': {
                        'is_Bbc': goods_find['isBbc'],
                        'skuId': goods_find['skuID'],
                    },
                }
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
Example #11
    def crawl(self):
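        # Amazon list crawl: walk the result pages, retry after a short sleep when a
        # captcha page is served, schedule DetailCrawler for new goods and re-export known ones.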
        global COOKIE
        keyid = self.key
        category_data = extract_category(self)
        priorcategory = self.data["priorcategory"]
        count = 3
        page = 1  # 从第一页开始
        while page <= count:
            url = self.get_url(keyid, page)
            html_stream = ProcessData.get_web_data(url)
            if COOKIE != html_stream.headers.get("set-cookie", ""):
                COOKIE = html_stream.headers.get("set-cookie", "")
            html = etree.HTML(html_stream.content)
            if page == 1:
                count = self.getPageSize(html)
            items = html.xpath(self.xpath["item"])
            if not len(items):
                if html.xpath("//input[@id='captchacharacters']"):
                    time.sleep(random.randint(1, 3))
                    continue
                else:
                    self.remove_task(keyid)

            for item in items:
                source_id = self.get_source_id(item)
                task_data = self.has_goods(source_id)
                if not task_data:
                    data = {
                        'priorcategory': priorcategory,
                    }
                    Scheduler.schedule(DetailCrawler.type,
                                       key=source_id,
                                       data=data)
                else:
                    info = self.get_info(item)
                    crawl_data = {
                        'id': task_data["uuid"],
                        'source_id': source_id,
                        'source': "amazon",
                        'brand': task_data["brand"],
                        'version': task_data["version"],
                        'series': task_data["series"],
                        'status': task_data["status"],
                        "comment": {
                            "is_Bbc": task_data["is_Bbc"],
                        }
                    }
                    crawl_data.update(info)
                    crawl_data.update(category_data)
                    crawl_data.update(get_ctime())
                    model = EcBasicModel(crawl_data)
                    export(model)
            page += 1
Example #12
 def crawl(self):
     homepage = "http://www.jxzj.gov.cn/jxzj/index.html"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item,homepage)
         text = '^(http|https).+(news).+\.(htm|html|net)$'
         url_t = re.match(text, item)
         data = {}
         if url_t != None:
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #13
    def getchildurl(self, url, data={}):
        html_stream = _get_url(url)

        for item in HandleUrl.get_url(html_stream.text):
            text = '^(http|https).+(news)\/(zjpd|xfpd|zhuanti|zgzlb).+\d\.(htm|html|net)$'
            url_t = re.match(text, item)
            if url_t != None:
                # ContentCrawler(key=item).crawl()
                # print item
                Scheduler.schedule(ContentCrawler.type, key=item, data=data)
            else:
                pass
Example #14
 def getchildurl(self, url,data={}):
     html_stream = _get_url(url)
 
     for item in HandleUrl.get_url(html_stream.text):
         text = '^(http|https).+(news)\/(zjpd|xfpd|zhuanti|zgzlb).+\d\.(htm|html|net)$'
         url_t = re.match(text, item)
         if url_t != None:
             # ContentCrawler(key=item).crawl()
             # print item
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #15
 def crawl(self):
     homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         text = '^(http|https).+(zwpd|qypd|smpd).+[^(Index)]\.(htm|html|net)$'
         url_t = re.match(text, item)
         data = {}
         if url_t != None:
             #  ContentCrawler(key=item).crawl()
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #16
 def crawl(self):
     homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item,homepage)
         text = '^(http|https).+(zwpd|qypd|smpd).+[^(Index)]\.(htm|html|net)$'
         url_t = re.match(text, item)
         data = {}
         if url_t != None:
           #  ContentCrawler(key=item).crawl()
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #17
 def crawl(self):
     homepage = "http://www.hbzljd.gov.cn/"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item,homepage)
         text = '^(http|https).+\d\.(htm|html|net)$'
         url_t = re.match(text, item)
         data = {}
         if url_t != None:
             # print item.encode('utf-8')
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #18
 def crawl(self):
     homepage = "http://www.gzq.gov.cn/"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         text = '^(http|https).+(public).+.+\d$'
         url_t = re.match(text, item)
         data = {}
         if url_t != None:
             # print item.encode('utf-8')
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #19
 def crawl(self):
     homepage = self.key
     data = self.data
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item,homepage)
         if re.search('(ndex)',item):
             continue
         text = '^(http|https).+\d\.(htm|html|net|php)$'
         url_t = re.match(text, item)
         if url_t != None:
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #20
 def crawl(self):
     homepage = self.key
     data = self.data
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         if re.search('(ndex)', item):
             continue
         text = '^(http|https).+\d\.(htm|html|net|php)$'
         url_t = re.match(text, item)
         if url_t != None:
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #21
 def crawl(self):
     homepage = "http://www.fsjsjd.gov.cn/"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         xp_putime = "//a[@href='%s']/parent::*/text()"%item
         pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
         item = HandleUrl.judge_url(item,homepage)
         text = '^(http|https).+\d\.(htm|html|net)$'
         url_t = re.match(text, item)
         data = {}
         if url_t != None:
             data['pubtime'] = pubtime
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #22
 def crawl(self):
     homepage = "http://www.fsjsjd.gov.cn/"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         xp_putime = "//a[@href='%s']/parent::*/text()" % item
         pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
         item = HandleUrl.judge_url(item, homepage)
         text = '^(http|https).+\d\.(htm|html|net)$'
         url_t = re.match(text, item)
         data = {}
         if url_t != None:
             data['pubtime'] = pubtime
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #23
 def crawl(self):
     homepage = "http://www.bjtsb.gov.cn/"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
        # print '----',item
         item = HandleUrl.judge_url(item,homepage)
       #  print '====',item
         text = ur'(http).+(infoview).+\d{3,8}$'
         url_t = re.match(text, item)
         data = {}
         if url_t != None:
             # print item.encode('utf-8')
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
         else:
             pass
Example #24
class ThirdCrawler(Crawler):
    type = "ecommerce.yhd.thirdlvl"

    def crawl(self):
        cid = str(self.key)
        categorys = self.data['priorcategory']
        url = "http://interface.m.yhd.com/\
               mcategory/servlet/CentralMobileFacadeJsonServlet/\
               getNavCategoryWithKeywordByRootCategoryId?rootCategoryId=\
               %s&categoryNavId=0&provinceId=1" %(cid)
        try:
            jsons = ProcessData.get_json_data(url.replace(' ',''))
            data = jsons['data']
        except Exception,e:
            self.logger.error(url)
            self.logger.error(e)
            print 'error ',url
        for item in data:
            priorcategory = []
            priorcategory.extend(categorys)
            priorcategory.append(item['categoryName'])            
            if item.has_key('boundCategoryId'):
                keys = item['boundCategoryId']
            else:
                continue
            data = {
                'priorcategory':priorcategory,
            }  
            # if priorcategory[2] != u"冰箱" and priorcategory[2] != u"空调":
            #     continue 
            Scheduler.schedule(ListCrawler.type, key=keys, data=data, interval=86400)
Example #25
 def is_detail_done(self):
     terms = {
         "type": DetailCrawler.type,
         "$or": [{"status": 1},{"status": 0}],
     }
     result = Scheduler.find_one(DetailCrawler.type, terms)
     return False if result else True
Example #26
 def crawl(self):
     world = self.key
     data = self.data
     #  world = str(self.key)
     data.update({'type': u'元搜索', 'origin_source': u'微信搜索', 'key': world})
     homepage = "http://weixin.sogou.com/weixinwap?ie=utf8&w=&\
                 type=2&t=1427703547684&s_t=&fr=sgsearch&\
                 query=" + world + "&pg=webSearchList"
     homepage = clear_space(homepage)
     html_stream = _get_url(homepage)
     list_url = []
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item)
         if item == '':
             continue
         else:
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
Example #27
 def is_first(self, key):
     terms = {
         "type": ListCrawler.type,
         "key": key,
         "lastrun": datetime.min,
     }
     result = Scheduler.find_one(ListCrawler.type, terms)
     return True if result else False
Example #28
    def save_list(self, items, **args):
        for item in items:
            source_id = self.mackining(item.xpath(self.search_list_xpath('source_id')))
            if not source_id:
                continue
            task_data = self.has_goods(source_id)
            if not task_data:
                data = {
                    'priorcategory': self.data['priorcategory'],
                }
                Scheduler.schedule(DetailCrawler.type, key=source_id, data=data)
                continue                

            crawl_data = self.get_crawl_data(item, category_data=args['category_data']
                , source_id=source_id, task_data=task_data)
            model = EcBasicModel(crawl_data)
            export(model)
Example #29
 def crawl(self):
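     # Query Baidu News for the keyword, parse title/publisher/pubtime/duplicate-count
     # from each result item, and schedule a ContentCrawler task per result URL.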
     worlds = str(self.key)
     world = '+'.join(worlds.split(','))
     data = self.data
     homepage = "http://news.baidu.com/ns?ct=0&rn=20&ie=utf-8&bs=" + world + "&\
                 rsv_bp=1&sr=0&cl=2&f=8&prevct=no&tn=news&word=" + world
     # homepage = "http://news.baidu.com/ns?ct=0&rn=20&ie=utf-8&bs=intitle:\
     #             ("+world+")&rsv_bp=1&sr=0&cl=2&f=8&\
     #             prevct=no&tn=newstitle&word="+world
     homepage = clear_space(homepage)
     html_stream = _get_url(str(homepage))
     xp_content = "//div[@id='content_left']/ul/li"
     items = HandleContent.get_item(html_stream, xp_content)
     xp_title = "h3[@class='c-title']//text()"
     xp_str = "div//p[@class='c-author']/text()"
     #xp_str = "div[@class='c-title-author']/text()"
     xp_url = "h3[@class='c-title']/a/@href"
     xp_count = "div//span[@class='c-info']/a[@class='c-more_link']/text()"
     for item in items:
         date = new_time()
         title = HandleContent.get_context(item, xp_title, text=True)
         pt_text = HandleContent.get_context(item, xp_str, text=True)
         publisher = HandleContent.get_author(pt_text, xp_text='', STR=True)
         pubtime = HandleContent.find_pubtime(pt_text)
         pubtime = local2utc(pubtime) if pubtime else date.get('utctime')
         url = HandleContent.get_context(item, xp_url, text=True)
         count = HandleContent.get_context(item, xp_count, text=True)
         try:
             count = int(count.split(u'条相同新闻', 1)[0]) if count else 0
         except:
             count = 0
         crawl_data = {}
         crawl_data = {
             #    'url': url,
             'title': title,
             'pubtime': pubtime,
             'source': u'baidu',
             'publisher': publisher,
             'count': str(count),
             'key': world,
             'source_type': data.get('source_type', ''),
         }
         # print title,url
         Scheduler.schedule(ContentCrawler.type, key=url, data=crawl_data)
Example #30
 def crawl(self):
     key = str(self.key)
     data = self.data
     homepage = "http://m.weibo.cn/p/index?containerid=100103type%3D36%26q%3D"+key+\
                 "%26weibo_type%3Dlongwb&title=%E9%95%BF%E5%BE%AE%E5%8D%9A"
     homepage = clear_space(homepage)
     html_stream = _get_url(homepage)
     time.sleep(random.randint(0, 5))
     url_list = re.findall(r"(?<=scheme\":\").+?(?=\")", html_stream.text)
     data.update({'key': key})
     for item in url_list:
         item = unquote(item)
         cid = re.search(r'.+\/p\/(.+?)\?.+', item)
         if cid:
             Scheduler.schedule(TopicCrawler.type,
                                key=cid.group(1),
                                data=data)
         else:
             continue
Example #31
 def has_goods(self, key):
     terms = {
         "type": CommentCrawler.type,
         "$and":[
             {"data.source_id": key},
             {"data.brand": {"$exists": True}},
         ],
     }
     result = Scheduler.find_one(CommentCrawler.type, terms)
     return result["data"] if result else None
Example #32
 def crawl(self):
     catId = str(self.key)
     category_data = extract_category(self)
     totalpage = self.get_page(catId)
     if totalpage == 0:
         return {}
     for i in range(1, totalpage + 1):
         url = self.get_url(catId, i)
         jsons = ProcessData.get_json_data(url)
         try:
             goodsList = jsons['goodsList']
         except Exception, e:
             self.logger.error(url)
             self.logger.error(e)
             print "get goodsList fail"
         for j in range(len(goodsList)):
             goods = goodsList[j]
             goodsName = goods['goodsName']
             goodsNo = goods['goodsNo']
             skuID = goods['skuID']
             # print goodsNo
             # print skuID
             crawl_data = {
                 # 'id': uuid.uuid1(),
                 'source_id': goodsNo,
                 'source': self.data.get('source'),
                 'title': goods['goodsName'],
                 'adword': goods['ad'],
                 'price': float(goods['lowestSalePrice']),
                 'original_price': float(goods['highestSalePrice']),
                 #'score': ecsumscores
             }
             crawl_data.update(category_data)
             model = EcBasicModel(crawl_data)
             export(model)
             data = {
                 'priorcategory': self.data['priorcategory'],
                 'presentcategory': self.data['priorcategory']
             }
             data["uuid"] = model["id"]
             Scheduler.schedule(DetailCrawler.type, key=goodsNo, data=data)
             Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=data)
Example #33
 def crawl(self):
     key = str(self.key)
     data = self.data
     homepage = "http://m.weibo.cn/p/index?containerid=100103type%3D36%26q%3D"+key+\
                 "%26weibo_type%3Dlongwb&title=%E9%95%BF%E5%BE%AE%E5%8D%9A"
     homepage = clear_space(homepage)
     html_stream = _get_url(homepage)
     time.sleep(random.randint(0,5))
     url_list = re.findall(r"(?<=scheme\":\").+?(?=\")", 
             html_stream.text)
     data.update({
         'key': key
     })
     for item in url_list:
         item = unquote(item)
         cid = re.search(r'.+\/p\/(.+?)\?.+', item)
         if cid:
             Scheduler.schedule(TopicCrawler.type, key=cid.group(1), data=data)
         else:
             continue
Example #34
    def crawl(self):
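        # Fetch the Weibo profile API for this uid, pull the weibo tab's containerid,
        # and schedule a ContentCrawler task on it with a 3-hour interval.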
        world = self.key
        data = self.data
        homepage = "http://api.weibo.cn/2/profile?\
                    gsid=_2A254IZdKDeTxGeRM7lUR8CnKyT2IHXVZdq2CrDV6PUJbrdAKLUf7kWptw4_No8F1OjQMCarBH4hZxZcrwA..&\
                    wm=3333_2001&i=27bd163&b=1&from=1051293010&c=iphone&v_p=18&skin=default&\
                    v_f=1&s=d2672a12&luicode=10000194&uid="+str(world)
        # homepage = "http://api.weibo.cn/2/profile?gsid=4wMJ47123kZuG0fKGxlRC15McKa50&uid="+str(world)+"&\
        #             wm=3333_2001&i=27bd163&b=0&from=1052093010&checktoken=c54259b09129d101b9669b5d93a04c0e&\
        #             c=iphone&v_p=18&skin=default&v_f=1&s=8a12fc6c&did=38d63734cc7427ebb2cb77612c1948cf&\
        #             lang=zh_CN&ua=iPhone7,2__weibo__5.2.0__iphone__os8.2&uicode=10000198&uid="+str(world)+\
        #             "&featurecode=10000085&luicode=10000003"

        homepage = clear_space(homepage)
        html_stream = _get_url(homepage)
        json_stream = change_to_json(str(html_stream.text))
        containerid = json_stream['tabsInfo']['tabs'][1]['containerid']
        data['id'] = str(world)
        Scheduler.schedule(ContentCrawler.type, key=containerid, data=data,
                             reset=True, interval=10800)
Example #35
 def crawl(self):
     # category_data is assumed to come from extract_category(self), as in the sibling
     # crawlers; it is otherwise undefined before crawl_data.update(category_data) below.
     category_data = extract_category(self)
     page_size = 0
     page = 0
     while page <= page_size:
         url = self.get_url(self.key, page)
         json_data = ProcessData.get_json_data(url)
         if page == 0:
             page_size = self.get_page_size(json_data)
         for goods in json_data["goods"]:
             source_id = goods["partnumber"]
             task_data = self.has_goods(self.key)
             if not task_data:
                 data = {
                     "priorcategory": self.data["priorcategory"],
                     "status": 1 if int(goods["saleStatus"]) == 0 else 0,
                 }
                 Scheduler.schedule(DetailCrawler.type,
                                    key=source_id,
                                    data=data)
             else:
                 crawl_data = {
                     "id": task_data["uuid"],
                     "source": self.data["source"],
                     "source_id": source_id,
                     "title": goods["catentdesc"],
                     "adword":
                     extract_adword(goods.get("auxdescription", "")),
                     "price": float(goods["price"]),
                     'status': task_data['status'],
                     'brand': task_data['brand'],
                     'version': task_data['version'],
                     'series': task_data['series'],
                     'comment': {
                         'is_Bbc': task_data['is_Bbc'],
                     },
                 }
                 crawl_data.update(category_data)
                 crawl_data.update(get_ctime())
                 model = EcBasicModel(crawl_data)
                 export(model)
         page += 1
Example #36
    def crawl(self):
        wareId = str(self.key)
        url = "http://item.yhd.com/item/%s"%wareId
        html_stream = ProcessData.get_web_data(url)
        tree = etree.HTML(html_stream.text)
        crawl_data = self.crawler_data(tree)
        product_id = self.parse_productId(tree)
        model = EcDetailModel(crawl_data)
        export(model)

        comment_data = {
            'uuid': model['id'],
            'status': crawl_data['status'],
            # brand/series/version are not defined in this snippet; presumably they
            # come from the parsed detail data, so read them from crawl_data here.
            'brand': crawl_data['brand'],
            'series': crawl_data['series'],
            'version': crawl_data['version'],
            'is_Bbc': crawl_data['comment']['is_Bbc'],
            'priorcategory': self.data['priorcategory'],
            'source_id': wareId,
        }
        Scheduler.schedule(CommentCrawler.type, key=product_id, data=comment_data)
Example #37
    def init(conf=None):

        from xlutils.copy import copy
        import xlrd
        import os
        SRC_PATH = os.path.dirname(__file__)
        bk = xlrd.open_workbook(os.path.join(SRC_PATH,
                                 "../../file/weibo.xls"))
        sh = bk.sheet_by_name('Sheet1')
        nrows = sh.nrows
        ncols = sh.ncols
        for i in xrange(1,nrows):
            data = {}
            data = {
                'publisher': sh.cell_value(i,3).strip(),
                'province': sh.cell_value(i,0).strip(),
                'city': sh.cell_value(i,1).strip(),
                'district': sh.cell_value(i,2).strip()
            }
            key = str(int(sh.cell_value(i,5))).strip()        
            Scheduler.schedule(FirstCrawler.type ,key=key, 
                                data=data, interval=3600, reset=True)
Example #38
 def crawl(self):
     world = self.key
     data = self.data
   #  world = str(self.key)
     data.update({
             'type': u'元搜索',
             'origin_source': u'微信搜索',
             'key': world
     })
     homepage = "http://weixin.sogou.com/weixinwap?ie=utf8&w=&\
                 type=2&t=1427703547684&s_t=&fr=sgsearch&\
                 query="+world+"&pg=webSearchList"
     homepage = clear_space(homepage)
     html_stream = _get_url(homepage)
     list_url = []
     for item in HandleUrl.get_url(html_stream.text):
         item  = HandleUrl.judge_url(item)
         if item == '':
             continue
         else:
             Scheduler.schedule(ContentCrawler.type, key=item,
                                  data=data)
Example #39
 def crawl(self):
     url = "http://mobile.gome.com.cn/mobile/product/allCategorys.jsp"
     jsons = ProcessData.get_json_data(url)
     if jsons == {}:
         return {}
     category1 = jsons['firstLevelCategories']
     for first_item in category1:
         name1 = first_item['goodsTypeName']  # first-level category name
         try:
             category2 = first_item['goodsTypeList']
         except:
             pass
         for second_item in category2:
             name2 = second_item['goodsTypeName']
             #print name
             try:
                 category3 = second_item['goodsTypeList']
             except:
                 pass
             for third_item in category3:
                 try:
                     third_id = third_item['goodsTypeId']
                     name3 = third_item['goodsTypeLongName']
                 except:
                     pass
                 # print third_id
                 # print name3.encode('utf-8')
                 priorcategory = []
                 priorcategory.append(name1)
                 priorcategory.append(name2)
                 priorcategory.append(name3)
                 #presentcategory = priorcategory
                 data = {
                     'priorcategory': priorcategory
                     #'presentcategory':presentcategory
                 }
                 Scheduler.schedule(ListCrawler.type,
                                    key=third_id,
                                    data=data)
Example #40
    def run(self):
        while True:
            if not self.runValue.value:
                print "%s stops" % self.name
                break
            self.signalget()
            start_t = time.time()
            self.ctask = self.taskqueue.get()
            if self.ctask.empty:
                time.sleep(10)
                continue
            end_t = time.time()
            self.log_wait_task(end_t - start_t)
            self.log_get_task()
            start_t = time.time()

            c = Crawler().create(self.ctask.type, self.ctask.key,
                                 self.ctask.data)
            if c:
                try:
                    c.crawl()
                    success = True
                    logger.info("CRAWL SUCCEED - <%s> %s" %
                                (self.taskqueue.queueid, c))
                    end_t = time.time()
                    self.log_done_task(end_t - start_t)
                except Exception:
                    msg = get_exception_info()
                    success = False
                    logger.error("CRAWL FAILED - <%s> %s, %s" %
                                 (self.taskqueue.queueid, c, msg))
            else:
                logger.error("CRAWL FAILED - <%s> %s" %
                             (self.taskqueue.queueid, self.ctask))
                success = False

            Scheduler.finish(self.ctask.type, self.ctask.key,
                             c.data if c else {}, success)
Example #41
 def crawl(self):
     CatID = self.key
     category_data = extract_category(self)
     page = 1
     page_count = 1
     while page <= page_count:
         jsons = self.get_response(CatID, page)
         if page == 1: page_count = self.get_page_count(jsons)
         for goods in jsons['ProductListItems']:
             source_id = goods["Code"]
             task_data = self.has_goods(source_id)
             if task_data:
                 crawl_data = {
                     "id": task_data["uuid"],
                     "title": goods["Title"],
                     "price": goods["Price"]["CurrentPrice"],
                     "source_id": source_id,
                     "source": self.data["source"],
                     "status": task_data["status"],
                     "brand": task_data["brand"],
                     "version": task_data["version"],
                     "series": task_data["series"],
                     "comment": {
                         "is_Bbc": task_data["isBbc"],
                     },
                 }
                 crawl_data.update(category_data)
                 crawl_data.update(get_ctime())
                 model = EcBasicModel(crawl_data)
                 export(model)
             else:
                 detail_data = {
                     "priorcategory": self.data["priorcategory"],
                 }
                 Scheduler.schedule(DetailCrawler.type,
                                    key=source_id,
                                    data=detail_data)
         page += 1
Example #42
 def crawl(self):
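     # Maintain a shared cookie pool for Sogou Weixin: refresh it when empty (rate-limited)
     # or every other hour, then pick a random cookie, fetch the gzhcb JSON-P feed and
     # schedule a ContentCrawler task for each article URL.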
     global cookies
     global clocking
     global STATUS_CK
     TIME = time.time()
     hour = time.strftime('%H',time.localtime(TIME))
     if cookies == [] and TIME > STATUS_CK:
         print 'wait-----------To obtain cookie one '
         STATUS_CK = TIME + 35200
         clocking = hour
         cookies = get_cookies()
     elif not cookies:
         print 'Gets a cookies failure'
         # STATUS_CK = TIME + 35200
         return 
     elif int(hour)%2 == 0 and clocking != hour:
         print 'wait-----------To obtain cookie'
         clocking = hour
         cookies = []
         cookies = get_cookies()
     
     cookie = random.choice(cookies)
     world = self.key
     data = self.data
     homepage = "http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=%s&repp=1"%str(world)
     html_stream = _get_url(homepage ,cookie=cookie)
     # re.findall(")")
     mtre = "sogou.weixin.gzhcb\((.*)\)"
     match = re.search(mtre, html_stream.text).group(1)
     all_xml = change_to_json(str(match)).get('items',{})
     for item in all_xml:
         item = item.replace('\"gbk\"','\"utf-8\"')
         root = ElementTree.fromstring(item)
         geturl = root.getiterator('url')[0]
         Scheduler.schedule(ContentCrawler.type, key=geturl.text,
                              data=data)
     time.sleep(random.randint(30,100))
Example #43
    def crawl(self):

        # fid = '1620'
        # categorys = ["家居家装"]

        fid = self.key
        categorys = self.data['priorcategory']

        start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
        sencond_urls = {
            'catelogyId': str(fid),
            'isDescription': 'true',
            'isIcon': 'true',
            'level':'1'
        }
        url = start_urls + quote(str(sencond_urls))
        #print 'url ',url
        try:
            jsons = ProcessData.get_json_data(url)
            lists = jsons['catelogyList']
        except:
            print 'error ',url
            return
        if lists == []:
            return {}
        for i in range(len(lists)):

            cid = lists[i]['cid']
#            presentcategory = []
            priorcategory = []
            priorcategory.extend(categorys)
            priorcategory.append(extract_title(lists[i]['name']))
            data = {
                'priorcategory':priorcategory,
#                'presentcategory':presentcategory
            }
            Scheduler.schedule(ThirdCrawler.type, key=cid, data=data)
Example #44
    def crawl(self):
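        # Amazon search-list crawl: page through the result grid, extract title, price,
        # original price and score per item, export EcBasicModel records and schedule
        # Detail/Comment crawlers keyed by the product href.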
        # get the key info
        # keyid="/%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91/b?ie=UTF8&node=106200071"
        keyid = self.key
        source = "amazon"
        score = 0  # rating
        # extract the original category
        category_data = extract_category(self)
        # priorcategory
        priorcategory = self.data["priorcategory"]
        presentcategory = self.data["presentcategory"]

        count = getPageSize(self.get_url(keyid, 1))  # total page count
        page = 1  # start from the first page

        content = "//div[@id='mainResults']/div"

        while page <= count:
            # build the url for this page
            url = self.get_url(keyid, page)

            # print url
            # fetch the page for this url
            html_stream = ProcessData.get_web_data(url)

            # self.logger.info("processing page: " + url)
            # parse the product-list html
            html = etree.HTML(html_stream.text)

            # get each product node from the listing; returns a list
            itempath = html.xpath(content)

            if itempath != None and itempath != []:
                # print itempath
                for item in itempath:
                    title = item.xpath("h3[@class='newaps']/a")
                # crawl_data=[]  # data storage
                # jg=item.xpath("")
                    # price
                    pric = item.xpath(
                        "ul[@class='rsltGridList grey']/li[@class='newp']/div")

                    if pric == None:

                        pric = item.xpath("ul/li[@class='newp']/div")

                    # product rating
                    socreitmem = item.xpath(
                        "ul[@class='rsltGridList grey']/li[@class='rvw']/span/span/a")

                    if socreitmem != []:
                        scoreinfo = socreitmem[0].get('alt')
                        if scoreinfo != None:
                            score = float(scoreinfo[2:-1])

                    for t in title:
                        # get the product title and url
                        original_price = u"¥0.00"

                        if pric == None or pric == []:
                            price = u"¥0.00"
                        else:
                            try:
                                price = pric[0].xpath("a/span")[0].text
                            except:
                                print url
                                print "出错价格" + pric

                        if pric != None and pric != [] and pric[0].xpath("a/del") != []:
                            # original price available
                            original_price = pric[0].xpath("a/del")[0].text
                        else:
                            # no original price, fall back to the current price
                            original_price = price

                # i+=1
                    # store the info in mongodb
                        data = {
                            'priorcategory': priorcategory,
                            'presentcategory': presentcategory
                        }

                        if price != None and price.strip() != '' and pric != [] and pric[0] != '':

                            # self.logger.info("price: " + price)
                            # store the info in cassandra
                            try:
                                float(price.strip()[1:].replace(",", ""))
                                # float(original_price.strip()[1:].replace(",","")
                            except:
                                self.logger.error("错误price:" + price)
                                self.logger.error("错误price:" + original_price)

                            crawl_data = {
                                # 'id': uuid.uuid1(),
                                'source_id': t.get("href"),
                                'source': source,
                                'summary': {},
                                'title': t.xpath("span")[0].text,
                                'adword': '',
                                'price': float(price.strip()[1:].replace(",", "")),
                                'original_price': float(original_price.strip()[1:].replace(",", "")),
                                'score': 0
                            }

                            crawl_data.update(category_data)
                # save crawl_data (with category_data) to the cassandra database
                            model = EcBasicModel(crawl_data)
                            export(model)
                            data["uuid"] = model["id"]

                            # print "执行存储cassandra...."
                            Scheduler.schedule(
                                DetailCrawler.type, key=t.get("href"), data=data)
                            Scheduler.schedule(
                                CommentCrawler.type, key=t.get("href"), data=data)
                    # print repr(json.dumps(crawl_data))
            page += 1
Example #45
    def crawl(self):
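        # Walk the Amazon site-directory page, save the 1st/2nd/3rd level category tree
        # into MongoDB as crawler tasks, and schedule a ListCrawler for every leaf category.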

        url = "http://www.amazon.cn/gp/site-directory"
        # fetch the page for this url
        html_stream = ProcessData.get_web_data(url)
        # parse the html
        html = etree.HTML(html_stream.text)

        # xpath covering the whole first/second/third level category block
        xpath = "//div[@id='siteDirectory']/div[@class='a-row']/div[@class='a-row a-spacing-small a-spacing-top-medium']"

        dom = html.xpath(xpath)

        # first-level categories
        onexpath = "div[@class='a-row a-spacing-extra-large a-spacing-top-small']/span/a"

        # binali
        tmp = "div[@class='a-row a-spacing-none a-spacing-top-mini sd-addPadding']/div[@class='a-column a-span3 sd-colMarginRight']"

        # second-level categories
        twoxpath = "div[@class='a-column a-span12 sd-columnSize']/div[@class='a-row a-spacing-small']/span[@class='sd-fontSizeL2 a-text-bold']/a"

        threexpath = "div[@class='a-column a-span12 sd-columnSize']/div[@class='a-row a-spacing-small']/div[@class='a-row']/ul/li/span/span/a"

        # connect to mongodb
        conmn = pymongo.Connection(MONGO_CONN_STR)

        for item in dom:
            # first-level categories  a-row a-spacing-extra-large a-spacing-top-small
            oneitem = item.xpath(onexpath)
            oneinfo = ""

            # print oneitem

            for one in oneitem:
                oneinfo += one.text + ";"

            # first-level category name
            oneinfo = oneinfo[:-1]
            # store the first-level category in mongodb
            conmn.crawler.ecommerce.save({'priority': 1, 'status': 1, 'timeout': 3600, 'key': '', 'data':
                                          {
                                              'priorcategory': [oneinfo],
                                              'presentcategory': {"1": ''}
                                          },
                                          "interval": 0,
                                          "type": "ecommerce.amazon.firstlvl"})

            tmpxpath = item.xpath(tmp)

            for itemtmp in tmpxpath:
                twoitem = itemtmp.xpath(twoxpath)
                i = 0
                for two in twoitem:

                    conmn.crawler.ecommerce.save({'priority': 1, 'status': 1, 'timeout': 3600, 'key': two.get("href"), 'data':
                                                  {
                        'priorcategory': [oneinfo, two.text],
                        'presentcategory': {"1": '', "2": ''}
                    },
                        "interval": 0,
                        "type": "ecommerce.amazon.goodsdetail"})

                    threeitem = itemtmp.xpath(
                        "div[@class='a-column a-span12 sd-columnSize']/div[@class='a-row a-spacing-small']/div[@class='a-row']")

                    tmpc = threeitem[i].xpath("ul/li/span/span/a")
                    for t in tmpc:
                        conmn.crawler.ecommerce.save({'priority': 1, 'status': 1, 'timeout': 3600, 'key': t.get("href"), 'data':
                                                      {
                            'priorcategory': [oneinfo, two.text, t.text],
                            'presentcategory': {"1": '', "2": '', "3": ''}
                        },
                            "interval": 0,
                            "type": "ecommerce.amazon.firstlvl"})

                        # schedule the list crawler
                        Scheduler.schedule(ListCrawler.type, key=t.get("href"), data={
                            'priorcategory': [oneinfo, two.text, t.text],
                            'presentcategory': {"1": '', "2": '', "3": ''}
                        })

                    i = i + 1
Example #46
 def handle(self,id,priorcategory):
     data = {
         'priorcategory':priorcategory
     }
 
     Scheduler.schedule(ThirdCrawler.type, key=id, data=data)
Example #47
 def init(conf=None):
    # pass
     Scheduler.schedule(FirstCrawler.type, interval=10800, reset=True)
Example #48
 def handle(self,CatID,priorcategory):
     data = {
         'priorcategory':priorcategory
     }
     Scheduler.schedule(ListCrawler.type, key=CatID, data=data)
Example #49
 def crawl(self): 
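     # Fetch the Weibo cardlist API for this containerid, build a WeiboArticleModel per
     # card (title, pubtime, repost/comment/attitude counts), and on successful export
     # schedule an AgainCrawler task keyed by mid with a 6-hour interval.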
     key = str(self.key)
     data = self.data
     homepage = "http://api.weibo.cn/2/cardlist?\
                 gsid=_2A254IZdKDeTxGeRM7lUR8CnKyT2IHXVZdq2CrDV6PUJbrdAKLUf7kWptw4_No8F1OjQMCarBH4hZxZcrwA..&\
                 wm=3333_2001&i=27bd163&b=1&from=1051293010&c=iphone&v_p=18&skin=default&\
                 v_f=1&s=d2672a12&lang=zh_CN&ua=iPhone7,2__weibo__5.1.2__iphone__os8.1.3&\
                 uicode=10000198&featurecode=10000085&luicode=10000003&count=20&\
                 extparam=100103type=1&cuid=2257007621&sid=t_wap_ios&category=1&\
                 pos=1_-1&wm=3333_2001&containerid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&\
                 fid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&lfid=100103type%3D1&\
                 sourcetype=page&lcardid=user&page=1"
     # homepage = "http://api.weibo.cn/2/guest/cardlist?gsid=4wMJ47123kZuG0fKGxlRC15McKa50&uid=1001503246310&\
     #             wm=3333_2001&i=27bd163&b=0&from=1052093010&checktoken=c54259b09129d101b9669b5d93a04c0e&c=iphone&\
     #             v_p=18&skin=default&v_f=1&s=8a12fc6c&did=38d63734cc7427ebb2cb77612c1948cf&lang=zh_CN&ua=iPhone7,\
     #             2__weibo__5.2.0__iphone__os8.2&uid=1001503246310&extparam=100103\
     #             type%3D1%26q%3D%E5%8C%97%E4%BA%AC%E5%AE%89%E7%9B%91%26t%3D0%26sid%3Dt_wap_ios%26category%3D1%26pos%3D1_-1%26wm%3D3333_2001&\
     #             count=20&luicode=10000003&containerid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&featurecode=10000085&\
     #             uicode=10000198&fid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&checktoken=\
     #             c54259b09129d101b9669b5d93a04c0e&did=38d63734cc7427ebb2cb77612c1948cf&page=1"
     homepage = clear_space(homepage)
     html_stream = _get_url(homepage)     
     json_stream = change_to_json(str(html_stream.text))
     cards = json_stream['cards']
     for item in cards:
         scheme = re.search(r'=(.+?)$', item.get('scheme',''))
         scheme = scheme.group(1) if scheme else ''
         url = "http://weibo.com/%s/%s?type=comment"%(data.get('id', ''),
                  scheme)
         item = item.get('mblog',{})
         item = item.get('retweeted_status',item)
         text = item.get('text','')
         title = re.search(ur'【(.+?)】', text)
         title = title.group(1) if title else ''
         if not title:
             title = re.search(ur'#(.+?)#', text)
             title = title.group(1) if title else text[0:20]+'...'
         subtitle = re.search(ur'#(.+?)#', text)           
         subtitle = subtitle.group(1) if subtitle else ''
         pubtime = item.get('created_at', '')
         pubtime = HandleContent.strformat(str(pubtime))
         reposts_count = item.get('reposts_count', '')
         comments_count = item.get('comments_count', '')
         attitudes_count = item.get('attitudes_count', '')
         thumbnail_pic = item.get('thumbnail_pic', '')
         bmiddle_pic = item.get('bmiddle_pic', '')
         original_pic = item.get('original_pic', '')
         mid = item.get('mid', '')
         author = item.get('user',{}).get('name','')
         comment = {}
         comment = {
             'reposts_count': str(reposts_count),
             'attitudes_count': str(attitudes_count),
             'comments_count': str(comments_count)
         }
         crawl_data = {}
         subtitles = []
         subtitles.append(subtitle)
         date = new_time()
         crawl_data = {
             'province': self.data.get('province',''),
             'city': self.data.get('city',''),
             'district': self.data.get('district',''),
             'url': url,
             'title': title,
             'subtitle': subtitles,
             'content': text,
             'pubtime': pubtime,
             'crtime_int': date.get('crtime_int'),
             'crtime': date.get('crtime'),
             'source': 'weibo',
             'publisher': self.data.get('publisher',''),
             'author': author,
             'origin_source': u'新浪微博',
             'type': u'微博',
             'comment': comment
         }
         model = WeiboArticleModel(crawl_data)
         if export(model):
             againt_data = {}
             againt_data = {
                 'wid': model['id'],
                 'type': u'微博',
                 'expire': date.get('crtime_int')/1000000 + 604800,
             }
             Scheduler.schedule(AgainCrawler.type, key=mid, data=againt_data,
                              reset=True, interval=21600)
         else:
             pass