Ejemplo n.º 1
0
    def crawl(self):
        """Fetch the article page at ``self.key`` (cqn.com), extract its
        title, body and publish time, and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <div class="Index_ShowDetail_Content">.
        content = soup.find_all('div', 'Index_ShowDetail_Content')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='Index_ShowDetail_Title']/h1/text()"
        xp_putime = "//div[@class='Index_ShowDetail_Time']//text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'全国',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'publisher': u'中国质量新闻网',
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': 'cqn',
            'source_type': u'中国质量新闻网',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 2
0
    def crawl(self):
        """Fetch the article page at ``self.key`` (Zhejiang quality-supervision
        bureau site), extract title/body/publish time/author, and export it
        as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <div class="contaner_nr"> (sic, site typo).
        content = soup.find_all('div', 'contaner_nr')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='contaner']/div[@class='contaner_bt']/text()"
        # Publish time and author share the same source line on this site.
        xp_putime = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
        xp_author = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'浙江',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'zjbts',
            'publisher': u'浙江质监局',
            'source_type': u'质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 3
0
Archivo: cqn.py Proyecto: xxguo/crawler
    def crawl(self):
        """Fetch the article page at ``self.key`` (cqn.com), extract its
        title, body and publish time, and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <div class="Index_ShowDetail_Content">.
        content = soup.find_all('div', 'Index_ShowDetail_Content')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='Index_ShowDetail_Title']/h1/text()"
        xp_putime = "//div[@class='Index_ShowDetail_Time']//text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'全国',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'publisher': u'中国质量新闻网',
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': 'cqn',
            'source_type': u'中国质量新闻网',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 4
0
    def crawl(self):
        """Fetch the article page at ``self.key`` (Zhejiang quality-supervision
        bureau site), extract title/body/publish time/author, and export it
        as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <div class="contaner_nr"> (sic, site typo).
        content = soup.find_all('div', 'contaner_nr')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='contaner']/div[@class='contaner_bt']/text()"
        # Publish time and author share the same source line on this site.
        xp_putime = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
        xp_author = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'浙江',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'zjbts',
            'publisher': u'浙江质监局',
            'source_type': u'质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 5
0
    def crawl(self):
        """Fetch a WeChat article via its Sogou landing page and export it.

        Exports a SearchArticleModel when ``self.data`` carries a search
        ``key``, otherwise a WeixinArticleModel.
        """
        homepage = self.key
        data = self.data
        html_stream = _get_url(homepage)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body plus the lead-image wrapper.
        content = soup.find_all('div', class_=['rich_media_content',
                                'rich_media_thumb_wrp'])
        xp_title = "//div[@class='rich_media_area_primary']/\
                    h2[@class='rich_media_title']/text()"

        xp_putime = "//div/em[@class='rich_media_meta rich_media_meta_text']\
                    /text()"

        xp_author = "//div/em[@class='rich_media_meta rich_media_meta_text'][2]/text()"
        xp_publisher = "//div/a[@id='post-user']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author)
        # The account name is scraped with the same text-extraction helper.
        publisher = HandleContent.get_author(html_stream, xpath=xp_publisher)
        comment = {}
        content = clear_label(content, root=homepage)
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'province': self.data.get('province', ''),
            'city': self.data.get('city', ''),
            'district': self.data.get('district', ''),
            'url': homepage,
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': 'sogou',
            'author': author,
            'publisher': self.data.get('publisher', publisher),
            'origin_source': u'微信公共账号',
            'type': u'微信',
            'comment': comment
        }
        if data.get('key'):
            # Search-driven crawl: fold the search metadata into the record.
            crawl_data.update(data)
            model = SearchArticleModel(crawl_data)
        else:
            model = WeixinArticleModel(crawl_data)

        export(model)
Ejemplo n.º 6
0
    def crawl(self):
        """Fetch a WeChat article via its Sogou landing page and export it.

        Exports a SearchArticleModel when ``self.data`` carries a search
        ``key``, otherwise a WeixinArticleModel.
        """
        homepage = self.key
        data = self.data
        html_stream = _get_url(homepage)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body plus the lead-image wrapper.
        content = soup.find_all('div', class_=['rich_media_content',
                                'rich_media_thumb_wrp'])
        xp_title = "//div[@class='rich_media_area_primary']/\
                    h2[@class='rich_media_title']/text()"
        xp_putime = "//div/em[@class='rich_media_meta rich_media_meta_text']\
                    /text()"
        xp_author = "//div/em[@class='rich_media_meta rich_media_meta_text'][2]/text()"
        xp_publisher = "//div/a[@id='post-user']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author)
        # The account name is scraped with the same text-extraction helper.
        publisher = HandleContent.get_author(html_stream, xpath=xp_publisher)
        comment = {}
        content = clear_label(content, root=homepage)
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'province': self.data.get('province', ''),
            'city': self.data.get('city', ''),
            'district': self.data.get('district', ''),
            'url': homepage,
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': 'sogou',
            'author': author,
            'publisher': self.data.get('publisher', publisher),
            'origin_source': u'微信公共账号',
            'type': u'微信',
            'comment': comment
        }
        if data.get('key'):
            # Search-driven crawl: fold the search metadata into the record.
            crawl_data.update(data)
            model = SearchArticleModel(crawl_data)
        else:
            model = WeixinArticleModel(crawl_data)

        export(model)
Ejemplo n.º 7
0
Archivo: gzq.py Proyecto: xxguo/crawler
    def crawl(self):
        """Fetch an article from the Guangzhou quality-supervision bureau
        site at ``self.key`` and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <td id="td_news_content">.
        content = soup.find_all('td', id='td_news_content')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td[@class='content-title']/div/text()"
        xp_putime = "//tr/td[@class='bottom-line-gray']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'广东',
            'city': u'广州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'gzq',
            'publisher': u'广东广州质监局',
            'source_type': u'质监局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 8
0
    def crawl(self):
        """Fetch an article from the Foshan quality-supervision bureau site
        at ``self.key`` and export it as a ZjldArticleModel.

        A ``pubtime`` supplied by the list page (``self.data``) takes
        precedence over the one parsed from the article page.
        """
        url = self.key
        data = self.data
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <div id="right-text_d">.
        content = soup.find_all('div', id='right-text_d')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@id='right-title_d']//text()"
        xp_putime = "//div[@class='article']/p[@class='info']/span/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'广东',
            'city': u'佛山',
            'title': title,
            'content': content,
            'pubtime': data.get('pubtime', pubtime),
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'fsjsjd',
            'publisher': u'广东佛山质监局',
            'source_type': u'质监局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 9
0
    def crawl(self):
        """Fetch an article from the Foshan quality-supervision bureau site
        at ``self.key`` and export it as a ZjldArticleModel.

        A ``pubtime`` supplied by the list page (``self.data``) takes
        precedence over the one parsed from the article page.
        """
        url = self.key
        data = self.data
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <div id="right-text_d">.
        content = soup.find_all('div', id='right-text_d')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@id='right-title_d']//text()"
        xp_putime = "//div[@class='article']/p[@class='info']/span/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'广东',
            'city': u'佛山',
            'title': title,
            'content': content,
            'pubtime': data.get('pubtime', pubtime),
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'fsjsjd',
            'publisher': u'广东佛山质监局',
            'source_type': u'质监局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 10
0
    def crawl(self):
        """Fetch an article from the Guangzhou quality-supervision bureau
        site at ``self.key`` and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <td id="td_news_content">.
        content = soup.find_all('td', id='td_news_content')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td[@class='content-title']/div/text()"
        xp_putime = "//tr/td[@class='bottom-line-gray']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'广东',
            'city': u'广州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'gzq',
            'publisher': u'广东广州质监局',
            'source_type': u'质监局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 11
0
    def crawl(self):
        """Fetch an article from the Hubei quality-supervision bureau site
        at ``self.key`` and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body plus attachment list.
        content = soup.find_all('div', ['article-box', 'files'])
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='article']/h2/text()|//h3/text()"
        xp_putime = "//div[@class='article']/p[@class='info']/span/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'湖北',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'hbzljd',
            'publisher': u'湖北质监局',
            'source_type': u'质监局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 12
0
    def crawl(self):
        """Fetch an article from the Shandong quality-supervision bureau
        site at ``self.key`` and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <td class="conzt">.
        content = soup.find_all('td', 'conzt')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()"
        # Publish time and author share the same table cell on this site;
        # the author is extracted from the text after the source marker.
        xp_putime = "//table[@class='normal']/tbody/tr[3]/td/text()"
        xp_author = "//table[@class='normal']/tbody/tr[3]/td/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream,
                                          xpath=xp_author,
                                          xp_text=u'来源:')
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'山东',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'sdqts',
            'publisher': u'山东质监局',
            'source_type': u'质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 13
0
    def crawl(self):
        """Fetch an article from the Hubei quality-supervision bureau site
        at ``self.key`` and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body plus attachment list.
        content = soup.find_all('div', ['article-box', 'files'])
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='article']/h2/text()|//h3/text()"
        xp_putime = "//div[@class='article']/p[@class='info']/span/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'湖北',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'hbzljd',
            'publisher': u'湖北质监局',
            'source_type': u'质监局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 14
0
    def crawl(self):
        """Fetch an article from the AQSIQ (national quality-inspection
        administration) site at ``self.key`` and export it as a
        ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in the TRS editor container.
        content = soup.find_all('div', 'TRS_Editor')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td[@align='center']/h1/text()"
        xp_putime = "//div[@class='xj2']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'全国',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'aqsiq',
            'publisher': u'国家质量监督检验检疫总局',
            'source_type': u'国家质量监督检验检疫总局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 15
0
    def crawl(self):
        """Fetch an article from the Hangzhou quality-supervision bureau
        site at ``self.key`` and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <span class="ny">.
        content = soup.find_all('span', 'ny')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td[@class='dhz']/span/text()"
        xp_putime = "//td/table/tbody/tr/td[@align='center']/span/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'浙江',
            'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'hzqts',
            'publisher': u'浙江杭州质监局',
            'source_type': u'质监局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 16
0
    def crawl(self):
        """Fetch an article from the Hangzhou quality-supervision bureau
        site at ``self.key`` and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <span class="ny">.
        content = soup.find_all('span', 'ny')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td[@class='dhz']/span/text()"
        xp_putime = "//td/table/tbody/tr/td[@align='center']/span/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'浙江',
            'city': u'杭州',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'hzqts',
            'publisher': u'浙江杭州质监局',
            'source_type': u'质监局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 17
0
    def crawl(self):
        """Fetch an article from the AQSIQ (national quality-inspection
        administration) site at ``self.key`` and export it as a
        ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in the TRS editor container.
        content = soup.find_all('div', 'TRS_Editor')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td[@align='center']/h1/text()"
        xp_putime = "//div[@class='xj2']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'全国',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'aqsiq',
            'publisher': u'国家质量监督检验检疫总局',
            'source_type': u'国家质量监督检验检疫总局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 18
0
    def crawl(self):
        """Fetch an article from the Shandong quality-supervision bureau
        site at ``self.key`` and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <td class="conzt">.
        content = soup.find_all('td', 'conzt')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()"
        # Publish time and author share the same table cell on this site;
        # the author is extracted from the text after the source marker.
        xp_putime = "//table[@class='normal']/tbody/tr[3]/td/text()"
        xp_author = "//table[@class='normal']/tbody/tr[3]/td/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        author = HandleContent.get_author(html_stream, xpath=xp_author, xp_text=u'来源:')
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'山东',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'sdqts',
            'publisher': u'山东质监局',
            'source_type': u'质监局',
            'author': author,
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)
Ejemplo n.º 19
0
    def crawl(self):
        """Fetch an article from the Fujian quality-supervision bureau site
        at ``self.key`` and export it as a ZjldArticleModel.
        """
        url = self.key
        html_stream = _get_url(url)
        soup = HandleContent.get_BScontext(html_stream)
        # Article body lives in <li class="show_con">.
        content = soup.find_all('li', 'show_con')
        content = clear_label(content, root=url)
        comment = {}
        # Plain-text rendering of the cleaned body, with whitespace collapsed.
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
        xp_title = "//div[@class='xl_content']/h1/text()"
        xp_putime = "//div[@class='xl_content']/div[@class='time']/text()"
        title = HandleContent.get_title(html_stream, xpath=xp_title)
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        date = new_time()  # crawl timestamp (crtime / crtime_int)
        crawl_data = {
            'url': url,
            'province': u'福建',
            'title': title,
            'content': content,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': u'fjqi',
            'publisher': u'福建质监局',
            'source_type': u'质监局',
            'type': u'文章',
            'comment': comment,
        }
        model = ZjldArticleModel(crawl_data)
        export(model)