def get_data(self, response, key):
    db_agent = DatabaseAgent()
    s = IndustrialItem()
    s['url'] = response.url
    s['title'] = response.xpath('//title/text()').extract()[0]
    # When crawling for the "工业App" keyword, skip pages whose title does not
    # mention both "工业" and some spelling of "App".
    if key == "工业App" and ("工业" not in s['title'] or
                           ("App" not in s['title'] and
                            "APP" not in s['title'] and
                            "app" not in s['title'])):
        pass
    else:
        s['time'] = response.xpath('//meta[@name="PubDate"]/@content').extract()[0]
        date = datetime.datetime.strptime(s['time'], "%Y-%m-%d %H:%M")
        s['time'] = time.mktime(date.timetuple())
        s['nature'] = "None"
        s['area'] = self.area
        s['origin'] = self.origin
        s['keyword'] = key
        try:
            db_agent.add(kwargs=dict(s), orm_model=Industrial)
            logging.info("-----------add success------------")
            # TODO: export the article body once the layout is confirmed.
            # data = "".join(list(map(clear, response.xpath('//div[@id="zoom"]//text()').extract())))
            # with open('./export/anhui/{filename}.html'.format(filename=s['title']), 'w', encoding="utf8") as f:
            #     f.write(str(data))
        except Exception as e:
            logging.info(e)
            logging.info("-----------add error------------")
    yield s
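# The "工业App" title check above recurs in several of these callbacks. A
# minimal sketch of a shared helper that could replace it -- the name
# `title_matches_key` is hypothetical, not part of the existing codebase:
def title_matches_key(title, key):
    """Return False for the "工业App" keyword when the title lacks
    "工业" or any spelling of "App"; True otherwise."""
    if key != "工业App":
        return True
    return "工业" in title and any(v in title for v in ("App", "APP", "app"))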
def get_data(self, response, key):
    s = IndustrialItem()
    db_agent = DatabaseAgent()
    # The search page lists twelve events per page.
    for item in range(1, 13):
        s['title'] = response.xpath(
            '//div[@class="search-tab-content-list"]/div[@class="search-tab-content-item flex"][{x}]/a/img/@title'
            .format(x=item)).extract()[0]
        s['url'] = response.xpath(
            '//div[@class="search-tab-content-list"]/div[@class="search-tab-content-item flex"][{x}]/a[1]/@href'
            .format(x=item)).extract()[0]
        s['url'] = 'http://www.huodongxing.com' + s['url']
        s['area'] = self.area
        s['keyword'] = "工业互联网活动"
        s['nature'] = "活动"
        s['origin'] = self.origin
        # Only the start date is kept from the "start - end" date range.
        s['time'] = response.xpath(
            '//div[@class="search-tab-content-list"]/div[@class="search-tab-content-item flex"][{x}]/div[@class="search-tab-content-item-right"]/p[@class="item-data flex"]/text()'
            .format(x=item)).extract()[0].split('-')[0]
        date = datetime.datetime.strptime(s['time'], "%Y.%m.%d")
        s['time'] = time.mktime(date.timetuple())
        try:
            db_agent.add(kwargs=dict(s), orm_model=Industrial)
            logging.info("-----------add success------------")
        except Exception as e:
            logging.info(e)
            logging.info("-----------add error------------")
        yield s
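# The fixed range(1, 13) indexing above raises IndexError whenever a page has
# fewer than twelve items. A minimal sketch of the same loop iterating over
# the matched nodes with relative XPaths instead -- untested against the live
# page, and it also allocates a fresh item per row rather than reusing one:
def get_data_by_nodes(self, response, key):
    rows = response.xpath(
        '//div[@class="search-tab-content-list"]/div[@class="search-tab-content-item flex"]')
    for row in rows:
        s = IndustrialItem()
        s['title'] = row.xpath('./a/img/@title').extract_first()
        s['url'] = 'http://www.huodongxing.com' + row.xpath('./a[1]/@href').extract_first()
        # ... remaining fields as in get_data above
        yield s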
def get_data(self, response, key):
    db_agent = DatabaseAgent()
    s = IndustrialItem()
    for index in range(1, 11):
        s['url'] = response.xpath(
            '//div[@class="Main"]/dl[{index}]/dt/a/@href'.format(index=index)).extract()
        s['title'] = response.xpath(
            '//div[@class="Main"]/dl[{index}]/dt/a//text()'.format(index=index)).extract()
        s['title'] = ''.join(s['title'])
        if not is_exits(s["title"], s["url"]):
            continue
        s['area'] = self.area
        s['origin'] = self.origin
        s['nature'] = ''
        s['keyword'] = key
        s['time'] = response.xpath(
            '//div[@class="Main"]/dl[{index}]/dd[last()]/span[@style="color:#666"]/text()'
            .format(index=index)).extract()[0]
        s["time"] = datetime.datetime.strptime(s["time"], "%Y.%m.%d %H:%M:%S")
        s['time'] = int(time.mktime(s["time"].timetuple()))
        try:
            db_agent.add(kwargs=dict(s), orm_model=Industrial)
            logging.info("-----------add success------------")
        except Exception as e:
            logging.info(e)
            logging.info("-----------add error------------")
        yield s
def parse(self, response, key):
    db_agent = DatabaseAgent()
    s = IndustrialItem()
    s['url'] = response.url
    s['title'] = response.xpath('//title/text()').extract()[0]
    # NOTE: this hardcoded title overwrites the scraped one and looks like a
    # debugging leftover; every item from this callback gets the same title.
    s['title'] = "《上海市工业互联网创新发展专项支持实施细则》的通知"
    if key == "工业App" and ("工业" not in s['title'] or
                           ("App" not in s['title'] and
                            "APP" not in s['title'] and
                            "app" not in s['title'])):
        pass
    else:
        s['time'] = response.xpath('//meta[@name="PubDate"]/@content').extract()[0]
        date = datetime.datetime.strptime(s['time'], "%Y-%m-%d %H:%M")
        s['time'] = time.mktime(date.timetuple())
        s['nature'] = response.xpath('//meta[@name="ColumnKeywords"]/@content').extract()[0]
        s['area'] = self.area
        s['origin'] = self.origin
        s['keyword'] = key
        try:
            db_agent.add(kwargs=dict(s), orm_model=Industrial)
            logging.info("-----------add success------------")
        except Exception as e:
            logging.info(e)
            logging.info("-----------add error------------")
    yield s
def get_data(self, response):
    s = IndustrialItem()
    db_agent = DatabaseAgent()
    res = json.loads(response.text)
    if res['data'] is not None:
        for data in res['data']['list']:
            s["title"] = data["title"]
            # if "工业互联网" not in s['title']:
            #     continue
            s["area"] = self.area
            s["nature"] = "新闻"
            s["origin"] = "zaoqizhineng"
            s["time"] = datetime.datetime.strptime(data['ptime'], "%Y-%m-%d %H:%M:%S")
            s["time"] = int(time.mktime(s["time"].timetuple()))
            s["url"] = 'http://dy.163.com/v2/article/detail/{docid}.html'.format(docid=data['docid'])
            s['keyword'] = "工业互联网活动"
            try:
                db_agent.add(kwargs=dict(s), orm_model=Industrial)
                logging.info("-----------add success------------")
            except Exception as e:
                logging.info(e)
                logging.info("-----------add error------------")
            yield s
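# Nearly every callback here repeats the same strptime + mktime conversion to
# a Unix timestamp. A minimal sketch of a shared helper -- `to_unix_ts` is a
# hypothetical name, not an existing function in this project (datetime and
# time are presumably already imported at module top):
import datetime
import time

def to_unix_ts(text, fmt):
    """Parse `text` with the given strptime format; return an int Unix timestamp."""
    return int(time.mktime(datetime.datetime.strptime(text, fmt).timetuple()))

# Example: s['time'] = to_unix_ts(data['ptime'], "%Y-%m-%d %H:%M:%S")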
def save_data(self, response, key, data):
    db_agent = DatabaseAgent()
    s = IndustrialItem()
    s['title'] = data['NAME']
    if key == "工业App" and ("工业" not in s['title'] or
                           ("App" not in s['title'] and
                            "APP" not in s['title'] and
                            "app" not in s['title'])):
        pass
    else:
        # "发布时间" is the "publish time" label in the page body.
        pattern = re.compile(r'发布时间:(.*?) ')
        date = pattern.search(response.body.decode("utf8")).group(1)
        date = datetime.datetime.strptime(date, "%Y-%m-%d")
        s['time'] = int(time.mktime(date.timetuple()))
        s['area'] = self.area
        s['origin'] = self.origin
        s['keyword'] = key
        s['url'] = data['PUBURL']
        s['nature'] = data['CHANNELNAME']
        if is_exits(s["title"], s["url"]):
            try:
                db_agent.add(kwargs=dict(s), orm_model=Industrial)
                logging.info("-----------add success------------")
            except Exception as e:
                logging.info(e)
                logging.info("-----------add error------------")
    yield s
def get_data(self, response, key):
    db_agent = DatabaseAgent()
    s = IndustrialItem()
    s['url'] = response.url
    s['title'] = response.xpath('//title/text()').extract()[0]
    s['time'] = response.xpath('//h5/text()').extract()[0]
    pattern = re.compile(r'.*(\d\d\d\d-\d\d-\d\d)')
    s['time'] = pattern.match(s['time']).group(1)
    date = datetime.datetime.strptime(s['time'], "%Y-%m-%d")
    s['time'] = time.mktime(date.timetuple())
    s['nature'] = response.xpath('//span[@class="where"]/a[last()]/text()').extract()[0]
    s['area'] = self.area
    s['origin'] = self.origin
    s['keyword'] = key
    # Strip the site-name prefix ("Anhui Economic and Information Commission").
    s['title'] = s['title'].replace("安徽省经济和信息化委员会 ", "")
    try:
        if key == "工业App" and ("工业" not in s['title'] or
                               ("App" not in s['title'] and
                                "APP" not in s['title'] and
                                "app" not in s['title'])):
            pass
        else:
            db_agent.add(kwargs=dict(s), orm_model=Industrial)
            logging.info("-----------add success------------")
            # data = "".join(list(map(clear, response.xpath('//div[@id="zoom"]//text()').extract())))
            # with open('./export/anhui/{filename}.html'.format(filename=s['title']), 'w', encoding="utf8") as f:
            #     f.write(str(data))
    except Exception as e:
        logging.info(e)
        logging.info("-----------add error------------")
    yield s
def get_data(self, response, key):
    db_agent = DatabaseAgent()
    s = IndustrialItem()
    # The result page lists twenty entries per page.
    for index in range(1, 21):
        s['url'] = response.xpath(
            '//ul[last()]/li[{index}]/dl[@class="result_text"]/dt/a/@href'.format(index=index)).extract()
        s['title'] = response.xpath(
            '//ul[last()]/li[{index}]/dl[@class="result_text"]/dt/a/i//text()'.format(index=index)).extract()
        s['title'] = ''.join(s['title'])
        if not is_exits(s["title"], s["url"]):
            continue
        s["time"] = clear(response.xpath(
            '//ul[last()]/li[{index}]/dl[@class="result_text"]/dt/p/text()'.format(index=index)).extract()[0])
        s["time"] = datetime.datetime.strptime(s["time"], "%Y-%m-%d")
        s['time'] = int(time.mktime(s["time"].timetuple()))
        s['area'] = self.area
        s['origin'] = self.origin
        s['nature'] = ''
        s['keyword'] = key
        try:
            db_agent.add(kwargs=dict(s), orm_model=Industrial)
            logging.info("-----------add success------------")
        except Exception as e:
            logging.info(e)
            logging.info("-----------add error------------")
        yield s
def get_url(self, response, key):
    db_agent = DatabaseAgent()
    urls = response.xpath('//div[@class="jsearch-result-url"]/a/text()').extract()
    for url in urls:
        url_exits = db_agent.get(orm_model=Industrial, filter_kwargs={"url": url})
        if url_exits:
            logging.info("-----------already exits------------")
            continue
        # The default-argument lambda binds the current `key` for the callback.
        yield scrapy.Request(
            url=url,
            headers=self.header,
            callback=lambda response, key=key: self.get_data(response, key))
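# Scrapy 1.7+ supports passing extra callback arguments via `cb_kwargs`, which
# avoids the lambda above and survives request serialization. A minimal sketch
# of the same loop body using it -- a hypothetical rewrite, not the method the
# spider currently uses:
def get_url_cb_kwargs(self, response, key):
    for url in response.xpath('//div[@class="jsearch-result-url"]/a/text()').extract():
        yield scrapy.Request(
            url=url,
            headers=self.header,
            callback=self.get_data,
            cb_kwargs={'key': key},  # delivered as get_data(response, key=key)
        )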
def get_url(self, response, key):
    s = IndustrialItem()
    db_agent = DatabaseAgent()
    urls = response.xpath('//ul[@class="download_list"]/li/a/@href').extract()
    for url in urls:
        url_exits = db_agent.get(orm_model=Industrial, filter_kwargs={"url": url})
        if url_exits:
            logging.info("-----------already exits------------")
            continue
        date = response.xpath('//a[@href="{url}"]/div/text()'.format(url=url)).extract()[0]
        date = date.replace(' ', '')
        date = datetime.datetime.strptime(date, "%Y.%m.%d")
        s['title'] = response.xpath('//a[@href="{url}"]/h2/text()'.format(url=url)).extract()[0]
        if key == "工业App" and ("工业" not in s['title'] or
                               ("App" not in s['title'] and
                                "APP" not in s['title'] and
                                "app" not in s['title'])):
            add = False
        else:
            s['time'] = time.mktime(date.timetuple())
            s['nature'] = "None"
            s['area'] = self.area
            s['origin'] = self.origin
            s['url'] = url
            s['keyword'] = key
            try:
                db_agent.add(kwargs=dict(s), orm_model=Industrial)
                logging.info("-----------add success------------")
                add = True
            except Exception as e:
                logging.info(e)
                logging.info("-----------add error------------")
                add = False
        # `add` gates the (currently disabled) article export below.
        # if add:
        #     res = requests.get(url=url, headers=self.header)
        #     selector = etree.HTML(res.content)
        #     data = "".join(list(map(clear, selector.xpath('//div[@class="inside_content_text"]//text()'))))
        #     with open('./export/chanyelianmeng/{filename}.html'.format(filename=s['title']), 'w',
        #               encoding="utf8") as f:
        #         f.write(str(data))
        yield s
def get_url(self, response, key):
    db_agent = DatabaseAgent()
    urls = response.xpath('//ul[@class="list clearfix"]/div[@class="li"]/p/a/@href').extract()
    for url in urls:
        url_exits = db_agent.get(orm_model=Industrial, filter_kwargs={"url": url})
        if url_exits:
            logging.info("-----------already exits------------")
            continue
        yield scrapy.Request(
            url=url,
            headers=self.header,
            callback=lambda response, key=key: self.parse(response, key))
def get_url(self, response, key):
    db_agent = DatabaseAgent()
    urls = response.xpath('//div[@class="nest"]/p/a/@href').extract()
    for url in urls:
        url = 'http://www.aheic.gov.cn/' + url
        url_exits = db_agent.get(orm_model=Industrial, filter_kwargs={"url": url})
        if url_exits:
            logging.info("-----------already exits------------")
            continue
        yield scrapy.Request(
            url=url,
            headers=self.header,
            callback=lambda response, key=key: self.get_data(response, key))
def get_data(self, response, key):
    data = json.loads(response.body)
    s = IndustrialItem()
    db_agent = DatabaseAgent()
    for item in data['events']:
        s['title'] = item['event_name']
        s['url'] = 'https://www.huodongjia.com' + item['event_url']
        s['area'] = self.area
        s['keyword'] = "工业互联网活动"
        s['nature'] = "活动"
        s['origin'] = self.origin
        date = datetime.datetime.strptime(item['event_begin_time'], "%Y-%m-%d")
        s['time'] = int(time.mktime(date.timetuple()))
        try:
            db_agent.add(kwargs=dict(s), orm_model=Industrial)
            logging.info("-----------add success------------")
        except Exception as e:
            logging.info(e)
            logging.info("-----------add error------------")
        yield s
def get_url(self, response, key):
    data = json.loads(response.body)
    data = data["array"]
    db_agent = DatabaseAgent()
    s = IndustrialItem()
    for item in data:
        # NOTE: `url_exits` is computed but never checked; only the title
        # lookup below actually gates duplicates.
        url_exits = db_agent.get(orm_model=Industrial, filter_kwargs={"url": item["url"]})
        # Strip the search-result highlight markup from the title.
        title = item["name"].replace("<font color='red'>", "").replace(
            "</font>", "").replace("<nobr>", "").replace("</nobr>", "")
        if key == "工业App" and ("工业" not in title or
                               ("App" not in title and
                                "APP" not in title and
                                "app" not in title)):
            logging.info("-----------工业App not in article------------")
            continue
        title_exits = db_agent.get(orm_model=Industrial, filter_kwargs={"title": title})
        if title_exits:
            logging.info("-----------already exits------------")
            continue
        s["url"] = item["url"]
        s['title'] = title
        s['area'] = self.area
        date = datetime.datetime.strptime(item["showTime"], "%Y-%m-%d")
        s['time'] = time.mktime(date.timetuple())
        s['origin'] = self.origin
        s['nature'] = "None"
        s['keyword'] = key
        try:
            db_agent.add(kwargs=dict(s), orm_model=Industrial)
            logging.info("-----------add success------------")
            add = True
        except Exception as e:
            logging.info(e)
            logging.info("-----------add error------------")
            add = False
        # `add` gates the (currently disabled) article export below.
        # if add:
        #     res = requests.get(url=item["url"], headers=self.header)
        #     selector = etree.HTML(res.content)
        #     data = selector.xpath('//div[@class="content"]//text()')
        #     if len(data) == 0:
        #         data = selector.xpath('//div[@id="con_con"]//text()')
        #     data = "".join(list(map(clear, data)))
        #     with open('./export/gongxinbu/{filename}.html'.format(filename=s['title']), 'w',
        #               encoding="utf8") as f:
        #         f.write(str(data))
        yield s
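# The chained .replace() calls above only strip the exact highlight tags the
# API currently emits. A minimal, more general sketch using a regex -- the
# helper name `strip_highlight_tags` is hypothetical, not part of this project:
import re

def strip_highlight_tags(name):
    """Remove <font ...> and <nobr> highlight markup from a search-result title."""
    return re.sub(r"</?(?:font|nobr)[^>]*>", "", name)

# Example: strip_highlight_tags("<font color='red'>工业</font>互联网") -> "工业互联网"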