Code Example #1
 def get_data(self, response, key):
     db_agent = DatabaseAgent()
     s = IndustrialItem()
     s['url'] = response.url
     s['title'] = response.xpath('//title/text()').extract()[0]
     if key == "工业App" and ("工业" not in s['title'] or
                            ("App" not in s['title']
                             and "APP" not in s['title']
                             and "app" not in s['title'])):
         pass
     else:
         s['time'] = response.xpath(
             '//meta[@name="PubDate"]/@content').extract()[0]
         date = datetime.datetime.strptime(s['time'], "%Y-%m-%d %H:%M")
         s['time'] = time.mktime(date.timetuple())
         s['nature'] = "None"
         s['area'] = self.area
         s['origin'] = self.origin
         s['keyword'] = key
         try:
             db_agent.add(kwargs=dict(s), orm_model=Industrial)
             logging.info("-----------add success------------")
             # TODO
             # data = "".join(list(map(clear,response.xpath('//div[@id="zoom"]//text()').extract())))
             # with open('./export/anhui/{filename}.html'.format(filename=s['title']), 'w', encoding=("utf8")) as f:
             #     f.write(str(data))
         except Exception as e:
             logging.info(e)
             logging.info("-----------add error------------")
     yield s
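
The same keyword filter, for key "工业App" requiring "工业" in the title together with some casing of "App", recurs in Examples #4, #6, #7, #10, and #14 below. A minimal sketch that factors it into a helper (the name title_matches_keyword is hypothetical, not part of the project):

    def title_matches_keyword(title, key):
        """Return False only for key "工业App" when the title lacks "工业"
        or every casing of "App"; any other key always passes."""
        if key != "工业App":
            return True
        return "工业" in title and ("App" in title or "APP" in title or "app" in title)

The three casing checks could also collapse to "app" in title.lower(), at the cost of additionally matching mixed casings the original misses. A helper like this would also make it easier to avoid the trailing yield s, which in Example #1 emits a mostly empty item whenever the filter rejects the title.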
Code Example #2
 def get_data(self, response, key):
     s = IndustrialItem()
     db_agent = DatabaseAgent()
     for item in range(1, 13):
         s['title'] = response.xpath(
             '//div[@class="search-tab-content-list"]/div[@class="search-tab-content-item flex"][{x}]/a/img/@title'
             .format(x=item)).extract()[0]
         s['url'] = response.xpath(
             '//div[@class="search-tab-content-list"]/div[@class="search-tab-content-item flex"][{x}]/a[1]/@href'
             .format(x=item)).extract()[0]
         s['url'] = 'http://www.huodongxing.com' + s['url']
         s['area'] = self.area
         s['keyword'] = "工业互联网活动"
         s['nature'] = "活动"
         s['origin'] = self.origin
         s['time'] = response.xpath(
             '//div[@class="search-tab-content-list"]/div[@class="search-tab-content-item flex"][{x}]/div[@class="search-tab-content-item-right"]/p[@class="item-data flex"]/text()'
             .format(x=item)).extract()[0].split('-')[0]
         date = datetime.datetime.strptime(s['time'], "%Y.%m.%d")
         s['time'] = time.mktime(date.timetuple())
         try:
             db_agent.add(kwargs=dict(s), orm_model=Industrial)
             logging.info("-----------add success------------")
         except Exception as e:
             logging.info(e)
             logging.info("-----------add error------------")
     yield s
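
Every example converts a scraped date string to a Unix timestamp through the same two-step dance: datetime.strptime followed by time.mktime on the timetuple. A small helper capturing that repeated step (to_timestamp is a hypothetical name; the format string varies per site: "%Y-%m-%d %H:%M", "%Y.%m.%d", "%Y-%m-%d", and so on):

    import datetime
    import time

    def to_timestamp(text, fmt):
        """Parse a date string in the given format and return the
        corresponding Unix timestamp (local time, matching the
        examples' use of time.mktime)."""
        date = datetime.datetime.strptime(text, fmt)
        return int(time.mktime(date.timetuple()))

With it, the conversion above would reduce to s['time'] = to_timestamp(text, "%Y.%m.%d").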
Code Example #3
    def get_data(self, response, key):

        db_agent = DatabaseAgent()
        s = IndustrialItem()
        for index in range(1, 11):
            s['url'] = response.xpath(
                '//div[@class="Main"]/dl[{index}]/dt/a/@href'.format(
                    index=index)).extract()
            s['title'] = response.xpath(
                '//div[@class="Main"]/dl[{index}]/dt/a//text()'.format(
                    index=index)).extract()
            s['title'] = ''.join(s['title'])
            if not is_exits(s["title"], s["url"]):
                continue
            s['area'] = self.area
            s['origin'] = self.origin
            s['nature'] = ''
            s['keyword'] = key
            s['time'] = response.xpath(
                '//div[@class="Main"]/dl[{index}]/dd[last()]/span[@style="color:#666"]/text()'
                .format(index=index)).extract()[0]
            s["time"] = datetime.datetime.strptime(s["time"],
                                                   "%Y.%m.%d %H:%M:%S")
            s['time'] = int(time.mktime(s["time"].timetuple()))
            try:
                db_agent.add(kwargs=dict(s), orm_model=Industrial)
                logging.info("-----------add success------------")
            except Exception as e:
                logging.info(e)
                logging.info("-----------add error------------")
        yield s
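
is_exits is imported from elsewhere in the project and never shown in these examples. Judging from its call sites (here and in Example #8) it takes the extracted title and url and returns a falsy value when either is empty, so the loop can skip blank slots in the listing. A rough sketch of such a guard (an assumption, not the project's actual implementation):

    def is_exits(title, url):
        """Hypothetical reconstruction: a record is usable only if both
        title and url are non-empty. The url here is still the raw list
        returned by .extract(), so an empty list also fails the check."""
        return bool(title) and bool(url)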
Code Example #4
 def parse(self, response, key):
     db_agent = DatabaseAgent()
     s = IndustrialItem()
     s['url'] = response.url
     s['title'] = response.xpath('//title/text()').extract()[0]
     s['title'] = "《上海市工业互联网创新发展专项支持实施细则》的通知"
     if key == "工业App" and ("工业" not in s['title'] or
                            ("App" not in s['title']
                             and "APP" not in s['title']
                             and "app" not in s['title'])):
         pass
     else:
         s['time'] = response.xpath(
             '//meta[@name="PubDate"]/@content').extract()[0]
         date = datetime.datetime.strptime(s['time'], "%Y-%m-%d %H:%M")
         s['time'] = time.mktime(date.timetuple())
         s['nature'] = response.xpath(
             '//meta[@name="ColumnKeywords"]/@content').extract()[0]
         s['area'] = self.area
         s['origin'] = self.origin
         s['keyword'] = key
         try:
             db_agent.add(kwargs=dict(s), orm_model=Industrial)
             logging.info("-----------add success------------")
         except Exception as e:
             logging.info(e)
             logging.info("-----------add error------------")
     yield s
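
All of the examples populate the same seven fields on IndustrialItem. The item class is defined elsewhere in the project; a minimal Scrapy item consistent with these call sites would look like the following sketch (not the project's actual definition):

    import scrapy

    class IndustrialItem(scrapy.Item):
        url = scrapy.Field()
        title = scrapy.Field()
        time = scrapy.Field()     # Unix timestamp after conversion
        nature = scrapy.Field()   # category label, e.g. "新闻" or "活动"
        area = scrapy.Field()
        origin = scrapy.Field()
        keyword = scrapy.Field()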
Code Example #5
 def get_data(self, response):
     s = IndustrialItem()
     db_agent = DatabaseAgent()
     res = json.loads(response.text)
     if res['data'] is not None:
         for data in res['data']['list']:
             s["title"] = data["title"]
             # if "工业互联网" not in s['title']:
             #     continue
             s["area"] = self.area
             s["nature"] = "新闻"
             s["origin"] = "zaoqizhineng"
             s["time"] = datetime.datetime.strptime(data['ptime'], "%Y-%m-%d %H:%M:%S")
             s["time"] = int(time.mktime(s["time"].timetuple()))
             s["url"] = 'http://dy.163.com/v2/article/detail/{docid}.html'.format(docid=data['docid'])
             s['keyword'] = "工业互联网活动"
             try:
                 db_agent.add(
                     kwargs=dict(s),
                     orm_model=Industrial
                 )
                 logging.info("-----------add success------------")
             except Exception as e:
                 logging.info(e)
                 logging.info("-----------add error------------")
         yield s
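
Like Examples #2 and #3, this method reuses a single IndustrialItem across iterations and yields it once after the loop, so Scrapy's item pipeline only ever sees the last record; the per-record persistence happens eagerly through db_agent.add. If every record should flow through the pipeline as well, the usual pattern is a fresh item yielded inside the loop, sketched here for this example's fields:

    def get_data(self, response):
        res = json.loads(response.text)
        if res['data'] is not None:
            for data in res['data']['list']:
                s = IndustrialItem()  # fresh item per record
                s['title'] = data['title']
                s['url'] = 'http://dy.163.com/v2/article/detail/{docid}.html'.format(docid=data['docid'])
                yield s  # each record reaches the pipeline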
Code Example #6
 def save_data(self, response, key, data):
     # article = response.xpath()
     db_agent = DatabaseAgent()
     s = IndustrialItem()
     s['title'] = data['NAME']
     if key == "工业App" and ("工业" not in s['title'] or
                            ("App" not in s['title']
                             and "APP" not in s['title']
                             and "app" not in s['title'])):
         pass
     else:
         pattern = re.compile(r'发布时间:(.*?) ')
         date = pattern.search(response.body.decode("utf8")).group(1)
         date = datetime.datetime.strptime(date, "%Y-%m-%d")
         s['time'] = int(time.mktime(date.timetuple()))
         s['area'] = self.area
         s['origin'] = self.origin
         s['keyword'] = key
         s['url'] = data['PUBURL']
         s['nature'] = data['CHANNELNAME']
         if is_exits(s["title"], s["url"]):
             try:
                 db_agent.add(kwargs=dict(s), orm_model=Industrial)
                 logging.info("-----------add success------------")
             except Exception as e:
                 logging.info(e)
                 logging.info("-----------add error------------")
     yield s
Code Example #7
File: anhui.py Project: donggangcj/industrial_scrapy
 def get_data(self, response, key):
     db_agent = DatabaseAgent()
     s = IndustrialItem()
     s['url'] = response.url
     s['title'] = response.xpath('//title/text()').extract()[0]
     s['time'] = response.xpath('//h5/text()').extract()[0]
     pattern = re.compile(r'.*(\d\d\d\d-\d\d-\d\d)')
     s['time'] = pattern.match(s['time']).group(1)
     date = datetime.datetime.strptime(s['time'], "%Y-%m-%d")
     s['time'] = time.mktime(date.timetuple())
     s['nature'] = response.xpath('//span[@class="where"]/a[last()]/text()').extract()[0]
     s['area'] = self.area
     s['origin'] = self.origin
     s['keyword'] = key
     s['title'] = s['title'].replace("安徽省经济和信息化委员会 ","")
     try:
         if key=="工业App" and ("工业" not in s['title'] or ("App" not in s['title'] and "APP" not in s['title'] and "app" not in s[
             'title'])):
             pass
         else:
             db_agent.add(
                 kwargs=dict(s),
                 orm_model=Industrial
             )
             logging.info("-----------add success------------")
             # data = "".join(list(map(clear,response.xpath('//div[@id="zoom"]//text()').extract())))
             # with open('./export/anhui/{filename}.html'.format(filename=s['title']), 'w', encoding=("utf8")) as f:
             #     f.write(str(data))
     except Exception as e:
         logging.info(e)
         logging.info("-----------add error------------")
     yield s
Code Example #8
 def get_data(self, response, key):
     # a = response.body.decode('utf8')
     # print(a)
     # print('-------------')
     db_agent = DatabaseAgent()
     s = IndustrialItem()
     for index in range(1, 21):
         s['url'] = response.xpath('//ul[last()]/li[{index}]/dl[@class="result_text"]/dt/a/@href'.format(index=index)).extract()
         s['title'] = response.xpath('//ul[last()]/li[{index}]/dl[@class="result_text"]/dt/a/i//text()'.format(index=index)).extract()
         s['title'] = ''.join(s['title'])
         if not is_exits(s["title"], s["url"]):
             continue
         s["time"] = clear(response.xpath('//ul[last()]/li[{index}]/dl[@class="result_text"]/dt/p/text()'.format(index=index)).extract()[0])
         s["time"] = datetime.datetime.strptime(s["time"], "%Y-%m-%d")
         s['time'] = int(time.mktime(s["time"].timetuple()))
         s['area'] = self.area
         s['origin'] = self.origin
         s['nature'] = ''
         s['keyword'] = key
         try:
             db_agent.add(
                 kwargs=dict(s),
                 orm_model=Industrial
             )
             logging.info("-----------add success------------")
         except Exception as e:
             logging.info(e)
             logging.info("-----------add error------------")
     yield s
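
clear, applied to the date text above and inside several of the commented-out export blocks, is another helper imported from elsewhere in the project. From its usage it appears to strip whitespace and layout characters out of scraped text nodes; a plausible sketch (an assumption, not the project's code):

    def clear(text):
        """Hypothetical reconstruction: drop the newlines, tabs, and
        padding spaces that surround text nodes extracted from HTML."""
        return text.replace('\r', '').replace('\n', '').replace('\t', '').strip()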
Code Example #9
 def get_url(self, response, key):
     db_agent = DatabaseAgent()
     urls = response.xpath(
         '//div[@class="jsearch-result-url"]/a/text()').extract()
     for url in urls:
         url_exits = db_agent.get(orm_model=Industrial,
                                  filter_kwargs={"url": url})
         if url_exits:
             logging.info("-----------already exits------------")
             continue
         yield scrapy.Request(
             url=url,
             headers=self.header,
             callback=lambda response, key=key: self.get_data(response, key))
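
DatabaseAgent is the project's persistence wrapper, and the examples exercise exactly two methods: add(kwargs=..., orm_model=...) and get(orm_model=..., filter_kwargs=...). Assuming a SQLAlchemy session underneath (the real class is constructed with no arguments, so it presumably builds its own; this sketch takes one explicitly for brevity), a minimal interface matching those call sites could be:

    from sqlalchemy.orm import Session

    class DatabaseAgent:
        """Hypothetical reconstruction of the persistence wrapper."""

        def __init__(self, session: Session):
            self.session = session  # assumed: a ready-to-use SQLAlchemy session

        def add(self, kwargs, orm_model):
            # Insert one row built from the item's field dict.
            self.session.add(orm_model(**kwargs))
            self.session.commit()

        def get(self, orm_model, filter_kwargs):
            # Return the first row matching filter_kwargs, or None.
            return self.session.query(orm_model).filter_by(**filter_kwargs).first()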
Code Example #10
 def get_url(self, response, key):
     s = IndustrialItem()
     db_agent = DatabaseAgent()
     urls = response.xpath('//ul[@class="download_list"]/li/a/@href').extract()
     for url in urls:
         url_exits = db_agent.get(
             orm_model=Industrial,
             filter_kwargs={"url": url}
         )
         if url_exits:
             logging.info("-----------already exits------------")
             continue
         date = response.xpath('//a[@href="{url}"]/div/text()'.format(url=url)).extract()[0]
         date = date.replace(' ', '')
         date = datetime.datetime.strptime(date, "%Y.%m.%d")
         s['title'] = response.xpath('//a[@href="{url}"]/h2/text()'.format(url=url)).extract()[0]
         if key == "工业App" and ("工业" not in s['title'] or ("App" not in s['title'] and "APP" not in s[
             'title'] and "app" not in s['title'])):
             add = False
         else:
             s['time'] = time.mktime(date.timetuple())
             s['nature'] = "None"
             s['area'] = self.area
             s['origin'] = self.origin
             s['url'] = url
             s['keyword'] = key
             try:
                 db_agent.add(
                     kwargs=dict(s),
                     orm_model=Industrial
                 )
                 logging.info("-----------add success------------")
                 add = True
             except Exception as e:
                 logging.info(e)
                 logging.info("-----------add error------------")
                 add = False
         # if add:
         #     res = requests.get(
         #         url=url,
         #         headers=self.header,
         #     )
         #     res = res.content
         #     selector = etree.HTML(res)
         #     data = "".join(list(map(clear,selector.xpath('//div[@class="inside_content_text"]//text()'))))
         #     with open('./export/chanyelianmeng/{filename}.html'.format(filename=s['title']), 'w',
         #               encoding=("utf8")) as f:
         #         f.write(str(data))
         yield s
Code Example #11
 def get_url(self, response, key):
     db_agent = DatabaseAgent()
     urls = response.xpath(
         '//ul[@class="list clearfix"]/div[@class="li"]/p/a/@href').extract()
     for url in urls:
         url_exits = db_agent.get(orm_model=Industrial,
                                  filter_kwargs={"url": url})
         if url_exits:
             logging.info("-----------already exits------------")
             continue
         yield scrapy.Request(
             url=url,
             headers=self.header,
             callback=lambda response, key=key: self.parse(response, key))
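
The callback=lambda response, key=key: ... idiom binds key as a default argument, a common guard against late binding when requests are created in a loop; here key is constant for the whole call, so the guard is mostly defensive. On Scrapy 1.7 or newer, cb_kwargs expresses the same thing without a lambda:

    yield scrapy.Request(url=url,
                         headers=self.header,
                         callback=self.parse,
                         cb_kwargs={'key': key})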
Code Example #12
File: anhui.py Project: donggangcj/industrial_scrapy
 def get_url(self, response, key):
     db_agent = DatabaseAgent()
     urls = response.xpath('//div[@class="nest"]/p/a/@href').extract()
     for url in urls:
         url = 'http://www.aheic.gov.cn/' + url
         url_exits = db_agent.get(
             orm_model=Industrial,
             filter_kwargs={"url": url}
         )
         if url_exits:
             logging.info("-----------already exits------------")
             continue
         yield scrapy.Request(
             url=url,
             headers=self.header,
             callback=lambda response, key=key: self.get_data(response, key)
         )
Code Example #13
 def get_data(self, response, key):
     data = json.loads(response.body)
     s = IndustrialItem()
     db_agent = DatabaseAgent()
     for item in data['events']:
         s['title'] = item['event_name']
         s['url'] = 'https://www.huodongjia.com' + item['event_url']
         s['area'] = self.area
         s['keyword'] = "工业互联网活动"
         s['nature'] = "活动"
         s['origin'] = self.origin
         date = datetime.datetime.strptime(item['event_begin_time'],
                                           "%Y-%m-%d")
         s['time'] = int(time.mktime(date.timetuple()))
         try:
             db_agent.add(kwargs=dict(s), orm_model=Industrial)
             logging.info("-----------add success------------")
         except Exception as e:
             logging.info(e)
             logging.info("-----------add error------------")
     yield s
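
json.loads(response.body) works because json.loads accepts bytes on Python 3.6 and later. If the project runs Scrapy 2.2 or newer, the built-in shortcut is equivalent:

    data = response.json()  # Scrapy 2.2+: parses the response body as JSON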
Code Example #14
    def get_url(self, response, key):
        data = json.loads(response.body)
        data = data["array"]
        db_agent = DatabaseAgent()
        s = IndustrialItem()
        for item in data:
            url_exits = db_agent.get(orm_model=Industrial,
                                     filter_kwargs={"url": item["url"]})
            title = item["name"].replace("<font color='red'>", "").replace(
                "</font>", "").replace("<nobr>", "").replace("</nobr>", "")
            if key == "工业App" and ("工业" not in title or
                                   ("App" not in title and "APP" not in title
                                    and "app" not in title)):
                logging.info("-----------工业App not in article------------")
                continue
            title_exits = db_agent.get(orm_model=Industrial,
                                       filter_kwargs={"title": title})
            if title_exits:
                logging.info("-----------already exits------------")
                continue
            s["url"] = item["url"]
            s['title'] = title
            s['area'] = self.area
            date = datetime.datetime.strptime(item["showTime"], "%Y-%m-%d")
            s['time'] = time.mktime(date.timetuple())
            s['origin'] = self.origin
            s['nature'] = "None"
            s['keyword'] = key
            try:
                db_agent.add(kwargs=dict(s), orm_model=Industrial)
                logging.info("-----------add success------------")
                add = True
            except Exception as e:
                logging.info(e)
                logging.info("-----------add error------------")
                add = False

            # if add:
            #     res = requests.get(
            #         url=item["url"],
            #         headers=self.header,
            #     )
            #     res = res.content
            #     selector = etree.HTML(res)
            #     data = selector.xpath('//div[@class="content"]//text()')
            #     if len(data) == 0:
            #         data = selector.xpath('//div[@id="con_con"]//text()')
            #     data = "".join(list(map(clear,data)))
            #     with open('./export/gongxinbu/{filename}.html'.format(filename=s['title']), 'w',
            #               encoding=("utf8")) as f:
            #         f.write(str(data))
            yield s
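
The chained .replace() calls above strip exactly the highlight markup this search API injects into matched titles. A more general alternative, at the cost of also removing any other tag that might appear, is a small regex helper (hypothetical, not part of the project):

    import re

    def strip_tags(text):
        """Remove every HTML tag from a search-result title."""
        return re.sub(r'<[^>]+>', '', text)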