Code Example #1
File: cqn.py  Project: xxguo/crawler
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('div','Index_ShowDetail_Content')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='Index_ShowDetail_Title']/h1/text()"    
        xp_putime = "//div[@class='Index_ShowDetail_Time']//text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'全国',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'publisher': u'中国质量新闻网',
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': 'cqn',
            'source_type': u'中国质量新闻网',
           # 'origin_source': u'中国质量新闻网',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
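
Note on the shared helpers: every article crawler on this page follows the same pipeline (fetch the page, isolate the article node, pull the title and publish time via XPath, stamp the crawl time, export the model), and all of them lean on project helpers that are not shown here. The sketch below is only a guess at what _get_url, clear_space and new_time do, inferred from how these snippets call them; the names match the snippets, but the bodies are assumptions rather than the project's actual code.

# Hypothetical sketches of helpers from xxguo/crawler, reconstructed only
# from how the snippets on this page use them.
import re
import time
import requests

def _get_url(url, timeout=30):
    # The snippets only read .text from the returned object, so a plain
    # requests response is enough for this sketch.
    return requests.get(url, timeout=timeout)

def clear_space(text):
    # Applied both to extracted article text and to multi-line URL string
    # literals, so it presumably strips all whitespace and newlines.
    return re.sub(r'\s+', '', text)

def new_time():
    # The snippets read 'crtime_int' and 'crtime'; elsewhere crtime_int is
    # divided by 1,000,000 to obtain seconds, so microseconds are assumed.
    now = time.time()
    return {
        'crtime_int': int(now * 1000000),
        'crtime': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now)),
    }
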
Code Example #2
File: zjbts.py  Project: xxguo/crawler
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('div','contaner_nr')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='contaner']/div[@class='contaner_bt']/text()"    
        xp_putime = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
        xp_author = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'浙江',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'zjbts',
            'publisher': u'浙江质监局',
            'source_type': u'质监局',
        #    'origin_source': u'浙江质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #3
File: weibo.py  Project: xxguo/crawler
 def crawl(self): 
     key = str(self.key)
     data = self.data
     homepage = "http://api.weibo.cn/2/guest/statuses_extend?v_f=2&\
     uid=1001979296366&lfid=230584&checktoken=70b77970ea4b0fb23e95549430204e44&\
     c=android&wm=2468_1001&did=006997cad1bdce0960777445e8b8fed211d91950&\
     luicode=10000228&from=1051295010&lang=zh_CN&lcardid="+key+"&\
     skin=default&i=5c7d1a1&id="+key+"&fromlog=230584&s=9bad809a&\
     gsid=4wkmda923WBtPcv1v5vMS15OcAo5U&ua=HUAWEI-HUAWEI%20T8950__weibo__5.1.2__android__android4.0.4&\
     oldwm=2468_1001&is_recom=-1&uicode=10000002"
     homepage = clear_space(homepage)
     html_stream = _get_url(homepage)    
     json_stream = change_to_json(str(html_stream.text))
     crawl_data = {}
     reposts_count = json_stream.get('reposts_count', 0)
     comments_count = json_stream.get('comments_count', 0)
     attitudes_count = json_stream.get('attitudes_count', 0)
     date = new_time()
     crawl_data = {
         'id': data.get('wid'),
         'reposts': reposts_count,
         'comments': comments_count,
         'likes': attitudes_count,
         'type': data.get('type'),
         'crtime_int': date.get('crtime_int'),
         'expire': data.get('expire')
     }
     model = WeiboHotModel(crawl_data)
     export(model)
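
The weibo snippets also assume a change_to_json helper: it is fed html_stream.text and the result is indexed like a dict, so it is presumably a thin wrapper over json.loads. A minimal sketch under that assumption:

import json

def change_to_json(raw_text):
    # Callers do json_stream.get('reposts_count', 0) and
    # json_stream['data']['article'], so a parsed dict is expected here.
    return json.loads(raw_text)
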
Code Example #4
File: jd.py  Project: xxguo/crawler
    def crawl(self):
        # fid = '1662'
        # priorcategory = ["家居家装","清洁用品","衣物清洁"]
        # presentcategory = ['1','2','3']
        fid = self.key
        category_data = extract_category(self)

        count = 3  # page count, initial value 3
        pages = 1  # start from page 1

        while pages <= count:
            url = self.get_url(fid,pages)
            try:
                jsons = ProcessData.get_json_data(url)
                if pages==1 : count = math.ceil(int(jsons['wareCount'])/100)
                lists = jsons['wareInfo']
            except Exception,e:
                self.logger.error(url)
                self.logger.error(e)
                print 'error ',url
                return
            if lists == []:
                return {}
            for i in range(len(lists)):
                ids = uuid.uuid1() # cassandra primary key
                wareId = lists[i]['wareId']

                try:
                    f = lambda x: int(x[:-1])/100.00
                    ecsumscores = float(f(lists[i]['good'])) # overall product score
                except:
                    ecsumscores = 0

                crawl_data = {
                    # 'id': uuid.uuid1(),
                    'source_id': wareId,
                    'source': self.data.get('source'),
                    'summary': {},
                    'title': lists[i]['wname'],
                    'adword': lists[i]['adword'],
                    'price': float(lists[i]['jdPrice']),
                    'original_price': float(lists[i]['martPrice']),
                    'score': ecsumscores
                }
                crawl_data.update(category_data)
                data = {
                    # 'uuid': ids,
                    'priorcategory': self.data['priorcategory'],
                    'presentcategory': self.data['priorcategory']
#                    'presentcategory': self.data['presentcategory']
                }

                model = EcBasicModel(crawl_data)
                export(model)
                data["uuid"] = model["id"]
                Scheduler.schedule(DetailCrawler.type, key=wareId, data=data)
                Scheduler.schedule(CommentCrawler.type, key=wareId, data=data)


            pages += 1
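
One detail worth flagging in the pagination above: under Python 2, int(jsons['wareCount'])/100 is integer division, so math.ceil receives an already floored value and the trailing partial page (whenever wareCount is not a multiple of 100) is never requested. A sketch of the presumably intended calculation, assuming 100 items per page as the code implies:

import math

def page_count(ware_count, page_size=100):
    # Float division lets math.ceil round the final partial page up.
    return int(math.ceil(ware_count / float(page_size)))

assert page_count(250) == 3   # pages of 100, 100 and 50
assert page_count(200) == 2
assert page_count(99) == 1
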
Code Example #5
File: yhd.py  Project: xxguo/crawler
    def crawler_data(self,tree):
        category_data = extract_category(self)

        XPATH = self.search_list_xpath
        if len(tree.xpath(XPATH('list'))) == 0:
            XPATH = self.product_list_xpath
        dom = tree.xpath(XPATH('list'))
        for item in dom:
            crawl_data = {}
            craw = [
                'title','adword',
                'price','original_price',
                'source_id','score',
            ]

            for value in craw: 
                crawl_data[value] = self.mackining(item.xpath(XPATH(value)))
            crawl_data['price'] = float(crawl_data['price'])
            try:
                f = lambda x: int(x[:-1])/100.00
                crawl_data['score'] = float(f(crawl_data['score']))
            except:
                crawl_data['score'] = 0
            crawl_data.update(category_data)
            crawl_data['source'] = 'yhd'
            model = EcBasicModel(crawl_data)
            export(model)
            data = {
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']           
            }            
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=str(self.key), data=data)
Code Example #6
File: jd.py  Project: xxguo/crawler
    def crawl(self):
        # wareId = '1229271'
        # wareId = '1391817787'
        # priorcategory = ["家居家装","清洁用品","衣物清洁"]
        # presentcategory = ['1','2','3']
        # ids = uuid.uuid1()


        wareId = self.key
        ids =  self.data.get('uuid')
        category_data = extract_category(self)

        url = 'http://m.360buy.com/product/guige/%s.html'%(str(wareId))
        html_stream = ProcessData.get_web_data(url)
        tree = etree.HTML(html_stream.text)
        xpath = "//table[@class='Ptable']/tr/td/text()"
        dom = tree.xpath(xpath)
        specifications = {}
        temporary = ''
        i = 0
        for item in dom:
            item = item.strip()
            if item == '':
                continue
            if i%2 ==0:
                specifications[item] = ''
                temporary = extract_title(item)
            else:
                specifications[temporary] = extract_text(item)

            i += 1

        data = {
            'ecnorms':specifications
        }
        # specifications = json.dumps(specifications, ensure_ascii=False)
        introduce = IntroduceCrawler.crawl(wareId,ids)
        ecbrands = introduce[u'品牌'] if introduce.get(u'品牌') else ''
   #     ecnames = introduce[u'商品名称'].replace('\'',' ') if introduce.get(u'商品名称') else ''
        ecnames = introduce[u'商品名称'] if introduce.get(u'商品名称') else ''
        crawl_data = {
            'id': ids,
            'source': self.data.get('source'),
            'source_id': wareId,
            'summary': specifications,
            'introduce': introduce,
            'name': ecnames,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
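
The specification loop above walks a flat list of td text nodes and alternates between remembering a key and storing its value. A functionally similar pairing (leaving aside the extract_title / extract_text normalisation the project applies) zips the even- and odd-indexed cells:

def pair_cells(cells):
    # cells: flat list shaped like [label, value, label, value, ...]
    cleaned = [c.strip() for c in cells if c.strip()]
    return dict(zip(cleaned[0::2], cleaned[1::2]))

# Illustrative values only:
assert pair_cells(['Brand', ' ExampleBrand ', '', 'Size', '400ml']) == \
       {'Brand': 'ExampleBrand', 'Size': '400ml'}
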
Code Example #7
File: sogou.py  Project: xxguo/crawler
    def crawl(self): 
        homepage = self.key
        data = self.data
        html_stream = _get_url(homepage)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('div',class_=['rich_media_content',\
                                'rich_media_thumb_wrp'])
        xp_title = "//div[@class='rich_media_area_primary']/\
                    h2[@class='rich_media_title']/text()"
        xp_putime = "//div/em[@class='rich_media_meta rich_media_meta_text']\
                    /text()"
        xp_author = "//div/em[@class='rich_media_meta rich_media_meta_text'][2]/text()"
        xp_publisher = "//div/a[@id='post-user']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author)
        publisher = HandleContent.get_author(html_stream, xpath=xp_publisher)
        comment = {}
        # con = lambda x, y: x.text.replace('\n','').replace('\r','') + \
        #                     y.text.replace('\n','').replace('\r','')
        # comment['content'] = reduce(con,content)

        content = clear_label(content, root=homepage)
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        date = new_time()
        crawl_data = {}
        crawl_data = {
            'province': self.data.get('province',''),
            'city': self.data.get('city',''),
            'district': self.data.get('district',''),
            'url': homepage,
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': 'sogou',
            'author': author,
            'publisher': self.data.get('publisher', publisher),
            'origin_source': u'微信公共账号',
            'type': u'微信',
            'comment': comment
        }
        if data.get('key'):
            crawl_data.update(data)
            model = SearchArticleModel(crawl_data)
        else:
            model = WeixinArticleModel(crawl_data)

        export(model)
Code Example #8
File: weibo.py  Project: xxguo/crawler
 def crawl(self):
     key = self.key
     data = self.data
     homepage = "http://card.weibo.com/article/aj/articleshow?cid="+ key
     url = "http://weibo.com/p/"+ key
     html_stream = _get_url(homepage)    
     json_stream = change_to_json(str(html_stream.text))
     html_stream = json_stream['data']['article']
     soup = HandleContent.get_BScontext(html_stream, text=True)
     title = soup.select('.title')[0].text
     pubtime = soup.select('.time')[0].text
     pubtime = HandleContent.strformat(str(pubtime))
     content = soup.select('.WBA_content')[0]
     content = clear_label(list(content))
     comment = {}
     text = HandleContent.get_BScontext(content, text=True).text
     comment['content'] = clear_space(text)
     publishers = soup.select('.S_link2')
     # author = reduce(lambda x, y: x + y, [item.text for item in authors])
     try:
         publisher = publishers[1].text if len(publishers)> 1 else publishers[0].text
     except:
         publisher = ''
     crawl_data = {}
     date = new_time()
     crawl_data = {
         'title': title,
         'pubtime': pubtime,
         'source': 'weibo',
         'publisher': publisher,
         'crtime_int': date.get('crtime_int'),
         'crtime': date.get('crtime'),
         'origin_source': u'微博搜索',
         'url': url,
         'key': data.get('key', ''),
         'type': u'元搜索',
         'source_type': data.get('source_type', ''),
         'content': content,
         'comment': comment,
     }
     model = SearchArticleModel(crawl_data)
     export(model)
Code Example #9
File: yhd.py  Project: xxguo/crawler
    def crawler_data(self,tree):
        ids =  self.data.get('uuid')
        category_data = extract_category(self)
        introduce = tree.xpath(self.ware_xpath('introduce'))
        specifications = tree.xpath(self.ware_xpath('specifications'))
        introd = {}
        ecnorms = {}
        for item in introduce:
            item = item.strip()
            if item == '': continue
            item = item.split(u':',1)
            try:
                introd[item[0]] = item[1]
            except:
                pass
        for item in specifications:
            label = item.xpath(self.ware_xpath('label'))
            names = []
            values = []
            for i in label:
                i = i.strip()
                if i.strip() == '':  continue
                names.append(i)
            dd = item.xpath(self.ware_xpath('item'))
            for i in dd:
                i = i.strip()
                if i.strip() == '':  continue        
                values.append(i)
            ecnorms.update(map(lambda x,y:[x,y],names,values))

        crawl_data = {
            'id': ids,
            'source': self.data.get('source'),
            'source_id': str(self.key),
            'summary': ecnorms,
            'introduce': introd,
            'version': ecnorms.get(u'型号',''),
            'brand': ecnorms.get(u'商品品牌','')
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
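
A small note on ecnorms.update(map(lambda x,y:[x,y],names,values)) above: in Python 2, map over two sequences pads the shorter one with None, while zip truncates to the shorter. The same pairing is more commonly written as follows (the keys are illustrative, matching the ones the snippet reads back; the values are made up):

names = [u'型号', u'商品品牌']
values = [u'ABC-123', u'ExampleBrand']   # illustrative values

ecnorms = {}
ecnorms.update(zip(names, values))       # dict.update accepts (key, value) pairs
assert ecnorms.get(u'型号', '') == u'ABC-123'
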
Code Example #10
File: jd.py  Project: xxguo/crawler
    def crawl(self):
        # wareId = '1229271'
        # priorcategory = ["家居家装","清洁用品","衣物清洁"]
        # presentcategory = ['1','2','3']
        # ecid = '124'
        wareId = self.key
        ecid =  self.data['uuid']
        category_data = extract_category(self)
        pages = 1
        count = True
        while count: 
            number = 0    # duplicate counter
            url = self.get_url(wareId,pages)
            # print '++++++++= ',url
            html_stream = ProcessData.get_web_data(url)
            try:
                tree = etree.HTML(html_stream.text)
            except:
                print 'error: ',url
                break
            xpath = "//div[@id='comments-list']/div[@class='mc']"
            dom = tree.xpath(xpath)
            if dom == []:
                count = False
                continue
            for item in dom:
                datas = self.handle(item)
                comment_data={
                    # 'uuid': uuid.uuid1(),         #primary key
                    'ecid': ecid,        #commodity table foreign key
                    'source_id': wareId,
                    'source': self.data.get('source'),
                    'comment_id': datas['commentid'],  #review id
                    'score': datas['score'],         #commodity score
                    'pubtime': datas['commenttime'],
                    'buytime': datas['buytime'],
                    'user_id': datas['url'],
                    # 'usernickName': groups[i]['usernickName'],
                    'useful': datas['useful'],
                    'reply': datas['reply'],
                    'content': datas['comment'],
                    'province': datas['province']

                }
                comment_data.update(category_data)
                model = EcCommentModel(comment_data)
                is_saved = export(model)
                if not is_saved:
                    number += 1
            if number > 10:
                break
            pages += 1
Code Example #11
File: allnews.py  Project: xxguo/crawler
 def crawl(self): 
     data = self.data
     url = self.key
     html_stream = _get_url(url)
     title = get_titles(html_stream)
     pubtime = local2utc(get_publish_times(html_stream))
     soup = Readability(html_stream.text, url)
     content = soup.content
     # soup = HandleContent.get_BScontext(html_stream)
     comment = {}
     try:
         text = HandleContent.get_BScontext(content, text=True).text
         comment['content'] = clear_space(text)
     except:
         content = ''
         comment['content'] = ''  # keep the key so the final check below cannot raise KeyError
    # comment['key'] = data.get('key','')
     # comment['count'] = data.get('count','')
     crawl_data = {}
     date = new_time()
     crawl_data = {
         'url': url,
         'province': data.get('province'),
         'city': data.get('city', u''),
         'district': data.get('district', u''),
         'title': title,
         'content': content,
         'pubtime': pubtime,
         'crtime_int': date.get('crtime_int'),
         'crtime': date.get('crtime'),
         'source': u'yuqing',
         'publisher': data.get('publisher', u''),
         'source_type': data.get('type'),
       #  'origin_source': u'福建质监局',
       #  'author': author,
         'type': u'文章',
         'comment': comment,
     }
     if comment['content']:
         model = ZjldArticleModel(crawl_data)
         export(model)
Code Example #12
File: amazon.py  Project: xxguo/crawler
    def crawlHtml(self, html):

        ids = self.data['uuid']
        source = "amazon"
        source_id = self.key
        category_data = extract_category(self)
        summary = {}
        ecbrands = ""
        ecnames = ""
        introduce = {}
        # get the productDetailsTable entries
        prodDetails = html.xpath(
            "//table[@id='productDetailsTable']//tr/td[@class='bucket']/div[@class='content']/ul/li")

        for proditem in prodDetails:

            k = proditem.xpath("b/text()")[0].strip()[:-1]

            if k == "用户评分":
                summary[k] = proditem.xpath(
                    "span[@class='crAvgStars']/span/a/span/span/text()")[0].strip()[2:-1]
                # print
            elif k == "亚马逊热销商品排名":
                print "a"
            else:
                summary[k] = proditem.xpath("text()")[0].strip()

        crawl_data = {
            'id': ids,
            'source': source,
            'source_id': source_id,
            'summary': summary,
            'introduce': introduce,
            'name': ecnames,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        # print crawl_data
        model = EcDetailModel(crawl_data)
        export(model)
Code Example #13
File: fsjsjd.py  Project: xxguo/crawler
    def crawl(self):

        url = self.key
        data = self.data
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('div',id='right-text_d')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@id='right-title_d']//text()"    
        xp_putime = "//div[@class='article']/p[@class='info']/span/text()"
      #  xp_author = "//ul/li[@class='show_date']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
      #  author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'广东',
            'city': u'佛山',
            'title': title,
            'content': content,
            'pubtime': data.get('pubtime', pubtime),
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'fsjsjd',
            'publisher': u'广东佛山质监局',
            'source_type': u'质监局',
          #  'origin_source': u'福建质监局',
          #  'author': author,
            'type': u'文章',
            'comment': comment,
        }
        # print title.encode('utf-8'), '---',crawl_data['pubtime']
        # print comment['content'].encode('utf-8')
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #14
File: gzq.py  Project: xxguo/crawler
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('td',id='td_news_content')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td[@class='content-title']/div/text()"    
        xp_putime = "//tr/td[@class='bottom-line-gray']/text()"
      #  xp_author = "//ul/li[@class='show_date']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
      #  author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'广东',
            'city': u'广州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'gzq',
            'publisher': u'广东广州质监局',
            'source_type': u'质监局',
          #  'origin_source': u'福建质监局',
          #  'author': author,
            'type': u'文章',
            'comment': comment,
        }
        # print title.encode('utf-8'), pubtime
        # print comment['content'].encode('utf-8')
        # print content.encode('utf-8')
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #15
File: hbzljd.py  Project: xxguo/crawler
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('div',['article-box','files'])
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='article']/h2/text()|//h3/text()"    
        xp_putime = "//div[@class='article']/p[@class='info']/span/text()"
      #  xp_author = "//ul/li[@class='show_date']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
      #  author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'湖北',
         #   'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'hbzljd',
            'publisher': u'湖北质监局',
            'source_type': u'质监局',
          #  'origin_source': u'福建质监局',
          #  'author': author,
            'type': u'文章',
            'comment': comment,
        }
        # print title.encode('utf-8'), pubtime
        # print comment['content'].encode('utf-8')
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #16
File: newegg.py  Project: jshliu/crawler
 def crawl(self):
     CatID = self.key
     category_data = extract_category(self)
     page = 1
     page_count = 1
     while page <= page_count:
         jsons = self.get_response(CatID, page)
         if page == 1: page_count = self.get_page_count(jsons)
         for goods in jsons['ProductListItems']:
             source_id = goods["Code"]
             task_data = self.has_goods(source_id)
             if task_data:
                 crawl_data = {
                     "id": task_data["uuid"],
                     "title": goods["Title"],
                     "price": goods["Price"]["CurrentPrice"],
                     "source_id": source_id,
                     "source": self.data["source"],
                     "status": task_data["status"],
                     "brand": task_data["brand"],
                     "version": task_data["version"],
                     "series": task_data["series"],
                     "comment": {
                         "is_Bbc": task_data["isBbc"],
                     },
                 }
                 crawl_data.update(category_data)
                 crawl_data.update(get_ctime())
                 model = EcBasicModel(crawl_data)
                 export(model)
             else:
                 detail_data = {
                     "priorcategory": self.data["priorcategory"],
                 }
                 Scheduler.schedule(DetailCrawler.type,
                                    key=source_id,
                                    data=detail_data)
         page += 1
Code Example #17
 def crawl(self):
     ecid = self.data['uuid']
     goodsNo = str(self.key)
     category_data = extract_category(self)
     totalpage = int(self.get_page(goodsNo))
     if totalpage == 0:
         print "get_page fail"
         return {}
     for i in range(totalpage):
         url = self.get_url(goodsNo, i)
         json = ProcessData.get_json_data(url)
         try:
             appraise = json['appraiseArray']
         except Exception, e:
             self.logger.error(url)
             self.logger.error(e)
             print "get appraise fail"
             continue  # skip this page; 'appraise' would otherwise be undefined below
         for item in appraise:
             commentid = item['id']
             summary = item['summary']
             score = item['appraiseGrade']
             userorderid = item['appraiseName']
             commenttime = ProcessData.str_datetime(item['appraiseTime'])
             # print commentid
             # print summary.encode('utf-8')
             comment_data = {
                 'ecid': ecid,  #commodity table foreign key
                 'source_id': goodsNo,
                 'source': self.data.get('source'),
                 'comment_id': item['id'],  #review id
                 'score': item['appraiseGrade'],  #commodity score
                 'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                 'user_id': item['appraiseName'],
                 'content': item['summary']
             }
             comment_data.update(category_data)
             model = EcCommentModel(comment_data)
             export(model)
Code Example #18
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('td', 'conzt')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()"
        xp_putime = "//table[@class='normal']/tbody/tr[3]/td/text()"
        xp_author = "//table[@class='normal']/tbody/tr[3]/td/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream,
                                          xpath=xp_author,
                                          xp_text=u'来源:')
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'山东',
            #   'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'sdqts',
            'publisher': u'山东质监局',
            'source_type': u'质监局',
            # 'origin_source': u'山东质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #19
 def crawl(self):
     data = self.data
     url = self.key
     html_stream = _get_url(url)
     title = get_titles(html_stream)
     pubtime = local2utc(get_publish_times(html_stream))
     soup = Readability(html_stream.text, url)
     content = soup.content
     # soup = HandleContent.get_BScontext(html_stream)
     comment = {}
     try:
         text = HandleContent.get_BScontext(content, text=True).text
         comment['content'] = clear_space(text)
     except:
         content = ''
         comment['content'] = ''  # keep the key so the final check below cannot raise KeyError
    # comment['key'] = data.get('key','')
     # comment['count'] = data.get('count','')
     crawl_data = {}
     date = new_time()
     crawl_data = {
         'url': url,
         'province': data.get('province'),
         'city': data.get('city', u''),
         'district': data.get('district', u''),
         'title': title,
         'content': content,
         'pubtime': pubtime,
         'crtime_int': date.get('crtime_int'),
         'crtime': date.get('crtime'),
         'source': self.data["source"],
         'publisher': data.get('publisher', u''),
         'source_type': data.get('type'),
         'comment': comment,
     }
     if comment['content']:
         model = ZjldArticleModel(crawl_data)
         export(model)
Code Example #20
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('div', ['article-box', 'files'])
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='article']/h2/text()|//h3/text()"
        xp_putime = "//div[@class='article']/p[@class='info']/span/text()"
        #  xp_author = "//ul/li[@class='show_date']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        #  author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'湖北',
            #   'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'hbzljd',
            'publisher': u'湖北质监局',
            'source_type': u'质监局',
            #  'origin_source': u'福建质监局',
            #  'author': author,
            'type': u'文章',
            'comment': comment,
        }
        # print title.encode('utf-8'), pubtime
        # print comment['content'].encode('utf-8')
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #21
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('div', 'Custom_UnionStyle')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@id='cTitle']/text()"
        xp_putime = "//tr/td[@align='center']/text()"
        #  xp_author = "//ul/li[@class='show_date']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        #  author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'广东',
            #   'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'gdqts',
            'publisher': u'广东质监局',
            'source_type': u'质监局',
            #  'origin_source': u'福建质监局',
            #  'author': author,
            'type': u'文章',
            'comment': comment,
        }
        # print title.encode('utf-8'), pubtime
        # print content.encode('utf-8')
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #22
File: aqsiq.py  Project: xxguo/crawler
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('div','TRS_Editor')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td[@align='center']/h1/text()"    
        xp_putime = "//div[@class='xj2']/text()"
      #  xp_author = "//ul/li[@class='show_date']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
      #  author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'全国',
         #   'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'aqsiq',
            'publisher': u'国家质量监督检验检疫总局',
            'source_type': u'国家质量监督检验检疫总局',
          #  'origin_source': u'福建质监局',
          #  'author': author,
            'type': u'文章',
            'comment': comment,
        }
        # print '===',pubtime,title.encode('utf-8')
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #23
 def crawl(self):
     category_data = extract_category(self)
     page_size = self.get_page_size(self.key)
     page = 1
     while page <= page_size:
         json_data = ProcessData.get_json_data(self.get_url(self.key, page))
         reviews = json_data.get("commodityReviews", [])
         if not reviews:
             return
         for review in reviews:
             crawl_data = {
                 "comment_id": self.get_comment_id(review),
                 "content": review["content"],
                 "tags": self.get_tags(review),
                 "show_pic": self.get_show_pic(review),
                 "pubtime": self.get_pubtime(review),
                 "score": float(review["qualityStar"]),
                 "useful": int(review["usefulCnt"]),
                 "reply": 1 if review.get("replyInfo", {}) else 0,
                 "user_name": review.get("userInfo",
                                         {}).get("nickName", ""),
                 "eid": self.data["uuid"],
                 "brand": self.data["brand"],
                 "version": self.data["version"],
                 "series": self.data["series"],
                 "source": self.data["source"],
                 "source_id": self.key,
                 "status": self.data["status"],
                 "comment": {
                     "is_Bbc": self.data["is_Bbc"],
                 },
             }
             crawl_data.update(category_data)
             crawl_data.update(get_ctime())
             model = EcCommentModel(crawl_data)
             export(model)
         page += 1
Code Example #24
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('span', 'ny')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td[@class='dhz']/span/text()"
        xp_putime = "//td/table/tbody/tr/td[@align='center']/span/text()"
        # xp_author = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        # author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'浙江',
            'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'hzqts',
            'publisher': u'浙江杭州质监局',
            'source_type': u'质监局',
            #   'origin_source': u'浙江杭州质监局',
            #    'author': author,
            'type': u'文章',
            'comment': comment,
        }
        #   print content.encode('utf-8')
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #25
File: hzqts.py  Project: xxguo/crawler
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('span','ny')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td[@class='dhz']/span/text()"    
        xp_putime = "//td/table/tbody/tr/td[@align='center']/span/text()"
       # xp_author = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
       # author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'浙江',
            'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'hzqts',
            'publisher': u'浙江杭州质监局',
            'source_type': u'质监局',
         #   'origin_source': u'浙江杭州质监局',
        #    'author': author,
            'type': u'文章',
            'comment': comment,
        }
     #   print content.encode('utf-8')
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #26
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('li','show_con')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//ul/li[@class='show_title']/text()"    
        xp_putime = "//ul/li[@class='show_date']/text()"
        xp_author = "//ul/li[@class='show_date']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'江西',
         #   'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'jxzj',
            'publisher': u'江西质监局',
            'source_type': u'质监局',
        #    'origin_source': u'江西质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #27
File: sdqts.py  Project: xxguo/crawler
    def crawl(self):

        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        content = soup.find_all('td','conzt')
        content = clear_label(content, root=url)
        comment = {}
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()"    
        xp_putime = "//table[@class='normal']/tbody/tr[3]/td/text()"
        xp_author = "//table[@class='normal']/tbody/tr[3]/td/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author, xp_text=u'来源:')
        date = new_time()
        crawl_data = {
            'url': url,
            'province': u'山东',
         #   'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'sdqts',
            'publisher': u'山东质监局',
            'source_type': u'质监局',
           # 'origin_source': u'山东质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Code Example #28
File: amazon.py  Project: xxguo/crawler
    def crawl(self):

        # record id
        ids = self.data['uuid']
        # ids="1dcfa11e-7acf-11e4-b0cc-00e06668ddd1"
        # source_id=""
        # product url
        url = self.key

        print "url:" + url

        source = "amazon"

        category_data = extract_category(self)

        # fetch the response stream for this url
        html_stream = ProcessData.get_web_data(url)

        # parse the returned html
        html = etree.HTML(html_stream.text)

        # get the product detail section
        prodDetails = html.xpath("//div[@id='prodDetails']")

        if len(prodDetails) == 0:
            # fall back to the template that only carries basic info
            detailed = getDetailedGoods(
                type=self.type,
                key=self.key,
                data=self.data
            ).crawlHtml(html)
        else:
            # product style
            style = prodDetails[0].xpath("div[@class='disclaim']/strong")
           # print style[0].text

        # get the detailed product info rows
            goodinfo = prodDetails[0].xpath(
                "div[@class='wrapper CNlocale']//table/tbody/tr")

        # product
            summary = {}
            ecbrands = ""
            ecnames = ""
            introduce = {}

            for info in goodinfo:
                # print
                # info.xpath("td[@class='label']")[0].text,info.xpath("td[@class='value']")[0].text
                if info.xpath("td[@class='label']") != []:
                    if info.xpath("td[@class='label']")[0].text == "用户评分":
                        summary[info.xpath("td[@class='label']")[0].text] = info.xpath("td[@class='value']")[
                            0].xpath("//div[@id='averageCustomerReviewRating']")[0].text.strip()[2:-1]
                    # print
                    # info.xpath("td[@class='label']")[0].text,info.xpath("td[@class='value']")[0].xpath("//div[@id='averageCustomerReviewRating']")[0].text.strip()[2:-1]
                    elif info.xpath("td[@class='label']")[0].text.strip() == "品牌":
                        ecbrands = info.xpath(
                            "td[@class='value']")[0].text.strip()
                    else:
                        summary[info.xpath("td[@class='label']")[0].text] = info.xpath(
                            "td[@class='value']")[0].text.strip()
                    # print
                    # info.xpath("td[@class='label']")[0].text,info.xpath("td[@class='value']")[0].text.strip()

                    # store into cassandra
            crawl_data = {
                'id': ids,
                'source': source,
                'source_id': url,
                'summary': summary,
                'introduce': introduce,
                'name': ecnames,
                'brand': ecbrands
            }

            crawl_data.update(category_data)
            # print crawl_data
            model = EcDetailModel(crawl_data)
            export(model)
Code Example #29
File: amazon.py  Project: xxguo/crawler
    def crawl(self):
        # get the key info
        # keyid="/%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91/b?ie=UTF8&node=106200071"
        keyid = self.key
        source = "amazon"
        score = 0  # rating
        # get the original category data
        category_data = extract_category(self)
        # priorcategory
        priorcategory = self.data["priorcategory"]
        presentcategory = self.data["presentcategory"]

        count = getPageSize(self.get_url(keyid, 1))  # total number of pages
        page = 1  # start from page 1

        content = "//div[@id='mainResults']/div"

        while page <= count:
            # get the url info
            url = self.get_url(keyid, page)

            # print url
            # fetch the response stream for this url
            html_stream = ProcessData.get_web_data(url)

            # self.logger.info("执行页面:"+url)
            # parse the html of the product list
            html = etree.HTML(html_stream.text)

            # get each product node from the listing; returns a list
            itempath = html.xpath(content)

            if itempath != None and itempath != []:
                # print itempath
                for item in itempath:
                    title = item.xpath("h3[@class='newaps']/a")
                # crawl_data=[]  #存储数据
                # jg=item.xpath("")
                    # price
                    pric = item.xpath(
                        "ul[@class='rsltGridList grey']/li[@class='newp']/div")

                    if pric == None:

                        pric = item.xpath("ul/li[@class='newp']/div")

                    # product rating
                    socreitmem = item.xpath(
                        "ul[@class='rsltGridList grey']/li[@class='rvw']/span/span/a")

                    if socreitmem != []:
                        scoreinfo = socreitmem[0].get('alt')
                        if scoreinfo != None:
                            score = float(scoreinfo[2:-1])

                    for t in title:
                        # get the product title and url
                        original_price = u"¥0.00"

                        if pric == None or pric == []:
                            price = u"¥0.00"
                        else:
                            try:
                                price = pric[0].xpath("a/span")[0].text
                            except:
                                print url
                                print "出错价格", pric  # pric is a list; '+' would raise TypeError

                        if pric != None and pric != [] and pric[0].xpath("a/del") != []:
                            # an original price is present
                            original_price = pric[0].xpath("a/del")[0].text
                        else:
                            # no original price, so fall back to the current price
                            original_price = price

                # i+=1
                    # store the info in mongodb
                        data = {
                            'priorcategory': priorcategory,
                            'presentcategory': presentcategory
                        }

                        if price != None and price.strip() != '' and pric != [] and pric[0] != '':

                            # self.logger.info("价格:"+price)
                            # store the info in cassandra
                            try:
                                float(price.strip()[1:].replace(",", ""))
                                # float(original_price.strip()[1:].replace(",","")
                            except:
                                self.logger.error("错误price:" + price)
                                self.logger.error("错误price:" + original_price)

                            crawl_data = {
                                # 'id': uuid.uuid1(),
                                'source_id': t.get("href"),
                                'source': source,
                                'summary': {},
                                'title': t.xpath("span")[0].text,
                                'adword': '',
                                'price': float(price.strip()[1:].replace(",", "")),
                                'original_price': float(original_price.strip()[1:].replace(",", "")),
                                'score': 0
                            }

                            crawl_data.update(category_data)
                # save to the cassandra database together with category_data
                            model = EcBasicModel(crawl_data)
                            export(model)
                            data["uuid"] = model["id"]

                            # print "执行存储cassandra...."
                            Scheduler.schedule(
                                DetailCrawler.type, key=t.get("href"), data=data)
                            Scheduler.schedule(
                                CommentCrawler.type, key=t.get("href"), data=data)
                    # print repr(json.dumps(crawl_data))
            page += 1
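
The price handling above turns strings such as u"¥1,299.00" into floats by dropping the leading currency sign and the thousands separators. A minimal restatement of that conversion (the sample values are illustrative):

def parse_price(text):
    # Mirrors price.strip()[1:].replace(",", "") from the snippet above.
    return float(text.strip()[1:].replace(u",", u""))

assert parse_price(u"¥1,299.00") == 1299.0
assert parse_price(u"¥0.00") == 0.0
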
Code Example #30
    def crawl(self):

        # product id, taken from the task data
        goodid = self.data['uuid']
        # goodid="7ebd0a6a-7b5c-11e4-85d7-00e06668ddd1"
        source = "amazon"

        url = self.key
        source_id = url
        category_data = extract_category(self)

        count = getCommSize(self.get_url(url, 1))  # total number of comment pages
        page = 1  # start from page 1

        while page <= count:
            newurl = self.get_url(url, page)
            print newurl
            # productReviews

            # fetch the response stream for this url
            html_stream = ProcessData.get_web_data(newurl)

            # parse the html of the page
            html = etree.HTML(html_stream.text)
            # locate the review section
            comment = html.xpath("//table[@id='productReviews']//tr/td/div")

            for comitem in comment:
                # None

                # review text
                item = comitem.xpath("div[@class='reviewText']//text()")

                # rating
                scoreitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/span/span/span")
                # publish time
                pubtimeitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/span[@style='vertical-align:middle;']/nobr"
                )

                # link to the reviewer's profile
                user_iditem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/div/div[@style='float:left;']/a"
                )

                # helpfulness info
                usefulitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']")

                oninfo = ""
                for i in item:
                    oninfo += i

                # helpful / unhelpful counts
                if usefulitem != None and usefulitem != []:
                    tmpuseful = usefulitem[0].text.strip()
                else:
                    tmpuseful = "0"

                if tmpuseful == "":
                    tmpuseful = "0"
                elif tmpuseful != "0":
                    tmpuseful = tmpuseful[0:tmpuseful.index("/")]

                # date
                pubtim = datetime.strptime("1971-01-1", '%Y-%m-%d')
                if pubtimeitem != None and pubtimeitem != []:
                    pubtim = datetime.strptime(
                        pubtimeitem[0].text.replace("年", "-").replace(
                            "月", "-").replace("日", ""), '%Y-%m-%d')

                # convert the date string into a datetime object

                sorce = "0.0"

                if scoreitem != None and scoreitem != []:
                    sorce = scoreitem[0].text[2:-1].strip()
                    # print "评分:"+sorce

            #  print user_iditem
                userid = ''
                if user_iditem != None and user_iditem != []:
                    userid = str(user_iditem[0].get("href"))

                comment_data = {
                    "ecid": goodid,
                    "source_id": source_id,
                    "source": source,
                    "comment_id": "",
                    "pubtime": pubtim,
                    "buytime": pubtim,
                    "score": float(sorce),
                    "user_id": userid,
                    "useful": int(tmpuseful),
                    'reply': 0,
                    "content": oninfo.strip()
                }
                #                print comment_data
                # store both the original and current categories in the database
                comment_data.update(category_data)

                model = EcCommentModel(comment_data)
                export(model)
            page += 1
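
The review dates above arrive in the Chinese form 2014年12月5日; the snippet rewrites 年/月/日 to dashes before calling strptime. The same conversion isolated, with an illustrative input value:

from datetime import datetime

def parse_cn_date(text):
    # u"2014年12月5日" -> datetime(2014, 12, 5), as in the snippet above.
    normalised = text.replace(u"年", u"-").replace(u"月", u"-").replace(u"日", u"")
    return datetime.strptime(normalised, "%Y-%m-%d")

assert parse_cn_date(u"2014年12月5日") == datetime(2014, 12, 5)
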
Code Example #31
    def crawl(self):
        key = str(self.key)
        data = self.data
        homepage = "http://api.weibo.cn/2/cardlist?\
                    gsid=_2A254IZdKDeTxGeRM7lUR8CnKyT2IHXVZdq2CrDV6PUJbrdAKLUf7kWptw4_No8F1OjQMCarBH4hZxZcrwA..&\
                    wm=3333_2001&i=27bd163&b=1&from=1051293010&c=iphone&v_p=18&skin=default&\
                    v_f=1&s=d2672a12&lang=zh_CN&ua=iPhone7,2__weibo__5.1.2__iphone__os8.1.3&\
                    uicode=10000198&featurecode=10000085&luicode=10000003&count=20&\
                    extparam=100103type=1&cuid=2257007621&sid=t_wap_ios&category=1&\
                    pos=1_-1&wm=3333_2001&containerid=" + key + "_-_WEIBO_SECOND_PROFILE_WEIBO&\
                    fid=" + key + "_-_WEIBO_SECOND_PROFILE_WEIBO&lfid=100103type%3D1&\
                    sourcetype=page&lcardid=user&page=1"

        # homepage = "http://api.weibo.cn/2/guest/cardlist?gsid=4wMJ47123kZuG0fKGxlRC15McKa50&uid=1001503246310&\
        #             wm=3333_2001&i=27bd163&b=0&from=1052093010&checktoken=c54259b09129d101b9669b5d93a04c0e&c=iphone&\
        #             v_p=18&skin=default&v_f=1&s=8a12fc6c&did=38d63734cc7427ebb2cb77612c1948cf&lang=zh_CN&ua=iPhone7,\
        #             2__weibo__5.2.0__iphone__os8.2&uid=1001503246310&extparam=100103\
        #             type%3D1%26q%3D%E5%8C%97%E4%BA%AC%E5%AE%89%E7%9B%91%26t%3D0%26sid%3Dt_wap_ios%26category%3D1%26pos%3D1_-1%26wm%3D3333_2001&\
        #             count=20&luicode=10000003&containerid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&featurecode=10000085&\
        #             uicode=10000198&fid="+key+"_-_WEIBO_SECOND_PROFILE_WEIBO&checktoken=\
        #             c54259b09129d101b9669b5d93a04c0e&did=38d63734cc7427ebb2cb77612c1948cf&page=1"
        homepage = clear_space(homepage)
        html_stream = _get_url(homepage)
        json_stream = change_to_json(str(html_stream.text))
        cards = json_stream['cards']
        for item in cards:
            scheme = re.search(r'=(.+?)$', item.get('scheme', ''))
            scheme = scheme.group(1) if scheme else ''
            url = "http://weibo.com/%s/%s?type=comment" % (data.get(
                'id', ''), scheme)
            item = item.get('mblog', {})
            item = item.get('retweeted_status', item)
            text = item.get('text', '')
            title = re.search(ur'【(.+?)】', text)
            title = title.group(1) if title else ''
            if not title:
                title = re.search(ur'#(.+?)#', text)
                title = title.group(1) if title else text[0:20] + '...'
            subtitle = re.search(ur'#(.+?)#', text)
            subtitle = subtitle.group(1) if subtitle else ''
            pubtime = item.get('created_at', '')
            pubtime = HandleContent.strformat(str(pubtime))
            reposts_count = item.get('reposts_count', '')
            comments_count = item.get('comments_count', '')
            attitudes_count = item.get('attitudes_count', '')
            thumbnail_pic = item.get('thumbnail_pic', '')
            bmiddle_pic = item.get('bmiddle_pic', '')
            original_pic = item.get('original_pic', '')
            mid = item.get('mid', '')
            author = item.get('user', {}).get('name', '')
            comment = {
                'reposts_count': str(reposts_count),
                'attitudes_count': str(attitudes_count),
                'comments_count': str(comments_count)
            }
            subtitles = [subtitle]
            date = new_time()
            crawl_data = {
                'province': self.data.get('province', ''),
                'city': self.data.get('city', ''),
                'district': self.data.get('district', ''),
                'url': url,
                'title': title,
                'subtitle': subtitles,
                'content': text,
                'pubtime': pubtime,
                'crtime_int': date.get('crtime_int'),
                'crtime': date.get('crtime'),
                'source': 'weibo',
                'publisher': self.data.get('publisher', ''),
                'author': author,
                'origin_source': u'新浪微博',
                'type': u'微博',
                'comment': comment
            }
            model = WeiboArticleModel(crawl_data)
            if export(model):
                againt_data = {
                    'wid': model['id'],
                    'type': u'微博',
                    'expire': date.get('crtime_int') / 1000000 + 604800,
                }
                Scheduler.schedule(AgainCrawler.type,
                                   key=mid,
                                   data=againt_data,
                                   reset=True,
                                   interval=21600)
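The cardlist URL above only survives the backslash-joined literal because clear_space() later strips the embedded indentation. As a gentler alternative (a sketch only, for Python 2 like the rest of these snippets), the query can be kept as a dict and serialized with urllib.urlencode; the gsid value here is a placeholder copied from the snippet, not a working credential, and a real request would still need the full parameter set shown above.

import urllib

def build_cardlist_url(key):
    # Sketch: assemble the weibo cardlist query with urlencode instead of a
    # continuation-joined literal plus clear_space(). Values are placeholders.
    params = {
        'c': 'iphone',
        'from': '1051293010',
        'lang': 'zh_CN',
        'count': 20,
        'page': 1,
        'gsid': '_2A254IZdK...',  # placeholder session token
        'containerid': key + '_-_WEIBO_SECOND_PROFILE_WEIBO',
        'fid': key + '_-_WEIBO_SECOND_PROFILE_WEIBO',
    }
    return 'http://api.weibo.cn/2/cardlist?' + urllib.urlencode(params)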
コード例 #32
0
    def crawl(self):
        # get the key info
        # keyid="/%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91/b?ie=UTF8&node=106200071"
        keyid = self.key
        source = "amazon"
        score = 0  # rating score
        # extract the original category data
        category_data = extract_category(self)
        # priorcategory
        priorcategory = self.data["priorcategory"]
        presentcategory = self.data["presentcategory"]

        count = getPageSize(self.get_url(keyid, 1))  # total number of result pages
        page = 1  # start from the first page

        content = "//div[@id='mainResults']/div"

        while page <= count:
            # build the URL for this results page
            url = self.get_url(keyid, page)

            # print url
            # fetch the page at this URL
            html_stream = ProcessData.get_web_data(url)

            # self.logger.info("执行页面:"+url)
            # parse the product-list HTML
            html = etree.HTML(html_stream.text)

            # select the individual product blocks (returns a list)
            itempath = html.xpath(content)

            if itempath != None and itempath != []:
                # print itempath
                for item in itempath:
                    title = item.xpath("h3[@class='newaps']/a")
                    # crawl_data=[]  # holds the scraped data
                    # jg=item.xpath("")
                    # price
                    pric = item.xpath(
                        "ul[@class='rsltGridList grey']/li[@class='newp']/div")

                    if pric == None:

                        pric = item.xpath("ul/li[@class='newp']/div")

                    # product rating
                    socreitmem = item.xpath(
                        "ul[@class='rsltGridList grey']/li[@class='rvw']/span/span/a"
                    )

                    if socreitmem != []:
                        scoreinfo = socreitmem[0].get('alt')
                        if scoreinfo != None:
                            score = float(scoreinfo[2:-1])

                    for t in title:
                        # extract the product title and url
                        original_price = u"¥0.00"

                        if pric == None or pric == []:
                            price = u"¥0.00"
                        else:
                            try:
                                price = pric[0].xpath("a/span")[0].text
                            except:
                                print url
                                print "failed to parse price element:", pric
                                price = u"¥0.00"

                        if pric != None and pric != [] and pric[0].xpath(
                                "a/del") != []:
                            # a strike-through original price exists
                            original_price = pric[0].xpath("a/del")[0].text
                        else:
                            # no original price, so fall back to the current price
                            original_price = price

                        # i+=1
                        # data passed on to the detail/comment crawlers below
                        data = {
                            'priorcategory': priorcategory,
                            'presentcategory': presentcategory
                        }

                        if (price != None and price.strip() != ''
                                and pric != [] and pric[0] != ''):

                            # self.logger.info("价格:"+price)
                            # store the record in cassandra
                            try:
                                float(price.strip()[1:].replace(",", ""))
                                # float(original_price.strip()[1:].replace(",","")
                            except:
                                self.logger.error("invalid price: " + price)
                                self.logger.error("invalid original_price: " + original_price)
                                continue

                            crawl_data = {
                                # 'id': uuid.uuid1(),
                                'source_id': t.get("href"),
                                'source': source,
                                'summary': {},
                                'title': t.xpath("span")[0].text,
                                'adword': '',
                                'price': float(price.strip()[1:].replace(",", "")),
                                'original_price': float(original_price.strip()[1:].replace(",", "")),
                                'score': 0
                            }

                            crawl_data.update(category_data)
                            # save to the cassandra database together with category_data
                            model = EcBasicModel(crawl_data)
                            export(model)
                            data["uuid"] = model["id"]

                            # print "执行存储cassandra...."
                            Scheduler.schedule(DetailCrawler.type,
                                               key=t.get("href"),
                                               data=data)
                            Scheduler.schedule(CommentCrawler.type,
                                               key=t.get("href"),
                                               data=data)
                    # print repr(json.dumps(crawl_data))
            page += 1
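The Amazon list crawler above converts scraped price strings such as u'¥1,234.00' inline with float(price.strip()[1:].replace(",", "")). A small helper along the following lines (an illustrative sketch, not part of the project) makes the currency-sign stripping and the zero fallback explicit.

# -*- coding: utf-8 -*-
import re

def parse_price(text):
    # Sketch: turn a scraped price string like u'¥1,234.00' into a float,
    # falling back to 0.0 when the text cannot be parsed.
    if not text:
        return 0.0
    cleaned = re.sub(ur'[^\d.,]', u'', text).replace(u',', u'')
    try:
        return float(cleaned)
    except ValueError:
        return 0.0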
コード例 #34
0
ファイル: amazon.py プロジェクト: xxguo/crawler
    def crawl(self):

        # product id, passed in from the scheduler data
        goodid = self.data['uuid']
        # goodid="7ebd0a6a-7b5c-11e4-85d7-00e06668ddd1"
        source = "amazon"

        url = self.key
        source_id = url
        category_data = extract_category(self)

        count = getCommSize(self.get_url(url, 1))  # total number of review pages
        page = 1  # start from the first page

        while page <= count:
            newurl = self.get_url(url, page)
            print newurl
            # productReviews

            # fetch the page at this URL
            html_stream = ProcessData.get_web_data(newurl)

            # parse the review-list HTML
            html = etree.HTML(html_stream.text)
            # select the review blocks
            comment = html.xpath("//table[@id='productReviews']//tr/td/div")

            for comitem in comment:
                # review content
                item = comitem.xpath("div[@class='reviewText']//text()")

                # rating
                scoreitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/span/span/span")
                # publication date
                pubtimeitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/span[@style='vertical-align:middle;']/nobr")

                # link to the reviewer's profile
                user_iditem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/div/div[@style='float:left;']/a")

                # helpfulness votes
                usefulitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']")

                oninfo = "".join(item)

                # helpful / not-helpful counts
                if usefulitem != None and usefulitem != []:
                    tmpuseful = usefulitem[0].text.strip()
                else:
                    tmpuseful = "0"

                if tmpuseful == "":
                    tmpuseful = "0"
                elif tmpuseful != "0":
                    tmpuseful = tmpuseful.split("/", 1)[0].strip()

                # review date (defaults to 1971-01-01 when missing)
                pubtim = datetime.strptime("1971-01-1", '%Y-%m-%d')
                if pubtimeitem != None and pubtimeitem != []:
                    pubtim = datetime.strptime(pubtimeitem[0].text.replace(
                        "年", "-").replace("月", "-").replace("日", ""), '%Y-%m-%d')

                # convert the date string into a datetime object

                sorce = "0.0"

                if scoreitem != None and scoreitem != []:
                    sorce = scoreitem[0].text[2:-1].strip()
                    # print "评分:"+sorce

                # print user_iditem
                userid = ''
                if user_iditem != None and user_iditem != []:
                    userid = str(user_iditem[0].get("href"))

                comment_data = {
                    "ecid": goodid,
                    "source_id": source_id,
                    "source": source,
                    "comment_id": "",
                    "pubtime": pubtim,
                    "buytime": pubtim,
                    "score": float(sorce),
                    "user_id": userid,
                    "useful": int(tmpuseful),
                    'reply': 0,
                    "content": oninfo.strip()
                }
                # print comment_data
                # store the original and current category data along with the comment
                comment_data.update(category_data)

                model = EcCommentModel(comment_data)
                export(model)
            page += 1
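Two of the inline conversions in the review crawler above are worth isolating: the Chinese review date (e.g. u'2015年1月5日') is turned into a datetime via character replacement, and the helpfulness text (of the form 'm/n ...') is cut down to its leading count. A self-contained sketch of both, under the same assumptions as the snippet:

# -*- coding: utf-8 -*-
from datetime import datetime

def parse_review_date(text, default='1971-01-01'):
    # Sketch: u'2015年1月5日' -> datetime(2015, 1, 5); fall back to the default
    # date when the text is missing or malformed.
    text = (text or u'').replace(u'年', u'-').replace(u'月', u'-').replace(u'日', u'')
    try:
        return datetime.strptime(text.strip(), '%Y-%m-%d')
    except ValueError:
        return datetime.strptime(default, '%Y-%m-%d')

def parse_useful_count(text):
    # Sketch: keep only the leading vote count from strings shaped like 'm/n ...'.
    text = (text or u'').strip()
    if u'/' in text:
        text = text.split(u'/', 1)[0].strip()
    return int(text) if text.isdigit() else 0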
コード例 #35
0
    def crawl(self):

        # record id
        ids = self.data['uuid']
        # ids="1dcfa11e-7acf-11e4-b0cc-00e06668ddd1"
        # source_id=""
        # product url
        url = self.key

        print "url:" + url

        source = "amazon"

        category_data = extract_category(self)

        # fetch the page at this URL
        html_stream = ProcessData.get_web_data(url)

        # parse the product detail-page HTML
        html = etree.HTML(html_stream.text)

        # locate the product-details section
        prodDetails = html.xpath("//div[@id='prodDetails']")

        if len(prodDetails) == 0:
            # fall back to the template page that also carries the basic info
            detailed = getDetailedGoods(type=self.type,
                                        key=self.key,
                                        data=self.data).crawlHtml(html)
        else:
            # product style / variant label
            style = prodDetails[0].xpath("div[@class='disclaim']/strong")
            # print style[0].text

            # extract the detailed product info rows
            goodinfo = prodDetails[0].xpath(
                "div[@class='wrapper CNlocale']//table/tbody/tr")

            # product fields
            summary = {}
            ecbrands = ""
            ecnames = ""
            introduce = {}

            for info in goodinfo:
                label = info.xpath("td[@class='label']")
                if label == []:
                    continue
                value = info.xpath("td[@class='value']")
                label_text = label[0].text
                if label_text == "用户评分":
                    summary[label_text] = value[0].xpath(
                        "//div[@id='averageCustomerReviewRating']"
                    )[0].text.strip()[2:-1]
                elif label_text.strip() == "品牌":
                    ecbrands = value[0].text.strip()
                else:
                    summary[label_text] = value[0].text.strip()

            # store the result in cassandra
            crawl_data = {
                'id': ids,
                'source': source,
                'source_id': url,
                'summary': summary,
                'introduce': introduce,
                'name': ecnames,
                'brand': ecbrands
            }

            crawl_data.update(category_data)
            # print crawl_data
            model = EcDetailModel(crawl_data)
            export(model)
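Reading the three Amazon crawlers together, the piece that ties them is the data dict the list crawler schedules with: it copies the categories forward and records the EcBasicModel id, which the detail crawler stores as 'id' and the comment crawler stores as 'ecid'. A minimal sketch of that contract, inferred from the snippets rather than taken from the project:

def build_followup_data(priorcategory, presentcategory, basic_model_id):
    # Sketch of the dict passed to DetailCrawler/CommentCrawler above:
    # 'uuid' carries the EcBasicModel id that links the three records.
    return {
        'priorcategory': priorcategory,
        'presentcategory': presentcategory,
        'uuid': basic_model_id,
    }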
コード例 #36
0
    def crawl(self):
        skulist = []
        goodsNo = str(self.key)
        category_data = extract_category(self)
        url = self.get_detail_url(goodsNo)
        html = ProcessData.get_web_data(url)
        tree = etree.HTML(html.text)
        xpath = {
            "introduce": "//div[@class='guigecanshu']/text()",
            "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
            # "number": "//span[@class='fr ccc']/text()"
        }

        summary = self.parse_summary(tree, xpath["summary"])
        introduce = self.parse_intr(tree, xpath["introduce"])
        # number =  self.parse_number(tree, xpath["number"])

        version = get_version(summary, introduce)
        series = get_series(summary, introduce)
        brand = get_brand(summary, introduce)

        json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
        isBbc_str = json["isBbc"]
        isBbc = "Y" if isBbc_str == "Y" or isBbc_str == "y" else "N"
        status_str = json["onSale"]
        status = 0 if status_str == "N" or status_str == "n" else 1

        skulist = json['skuList']
        for sku in skulist:
            ecname = sku['skuName']
            ecimglist = sku['skuSourceImgUrl']

        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': ecname,
            'images': ecimglist,
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())
        model = EcDetailModel(detail_data)
        export(model)
        comment_data = {
            'uuid': model["id"],
            'brand': brand,
            'version': version,
            'series': series,
            'is_Bbc': isBbc,
            'status': status,
            'priorcategory': self.data['priorcategory'],
            'skuID': self.data['skuID'],
        }
        Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=comment_data)
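One caveat in the snippet above: the skuList loop keeps only the last SKU's name and image list, and ecname/ecimglist would be unbound if skuList ever came back empty. A guarded equivalent of that behaviour (a sketch assuming the same JSON keys, not project code):

def pick_sku_fields(sku_json):
    # Sketch: mirror the original loop (last SKU wins) but guard against an
    # empty or missing skuList.
    skulist = sku_json.get('skuList') or []
    if not skulist:
        return '', ''
    last = skulist[-1]
    return last.get('skuName', ''), last.get('skuSourceImgUrl', '')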