def parse_info(self, response): item = LadItem() item["newsType"] = '警事要闻' item["title"] = response.xpath('/html/body/div/div/div[2]/div[3]/div/div/div[2]/text()').extract_first() item["time"] = response.xpath('/html/body/div/div/div[2]/div[3]/div/div/div[3]/div[2]/text()').extract_first().split('|')[1].strip() text_list = response.xpath('/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/div/div/p/font') if len(text_list) == 0: text_list = response.xpath('/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/div/p') if len(text_list) == 0: text_list = response.xpath('/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/div/span') if len(text_list) == 0: text_list = response.xpath('/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/div') if len(text_list) == 0: text_list = response.xpath('/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/p/span') if len(text_list) == 0: text_list = response.xpath('/html/body/div/div/div[2]/div[3]/div/div/div[4]/div[1]/div/span') if len(text_list) >= 2: for str_slt in text_list: if str_slt.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + str_slt.xpath('text()').extract_first() else: if text_list.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + text_list.xpath('text()').extract_first() item["text"] = self.text self.text = "" yield item
def parse_info(self, response): item = LadItem() item["city"] = "湖南" item["newsType"] = "警事要闻" item["title"] = response.xpath( '/html/body/div/div[1]/div[4]/div[1]/h4/text()').extract( )[0].encode('utf-8') item["time"] = response.xpath( '/html/body/div/div[1]/div[4]/div[1]/div/p[2]/text()' ).extract_first().encode('utf-8').split(':')[1] #rows = list(array) text_list = response.xpath('//*[@id="txtContent"]/div/div/div/p') if len(text_list) == 0: text_list = response.xpath('//*[@id="txtContent"]/div/p/span') if len(text_list) == 0: text_list = response.xpath('//*[@id="content"]/div/div/p') if len(text_list) == 0: text_list = response.xpath('//*[@id="txtContent"]/div/p') if len(text_list) == 0: text_list = response.xpath('//*[@id="artibody"]/p') if len(text_list) == 0: text_list = response.xpath('//*[@id="txtContent"]/div/div/p') if len(text_list) == 0: text_list = response.xpath('//*[@id="txtContent"]/p') for str_slt in text_list: if str_slt.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + str_slt.xpath('text()').extract_first() item["text"] = self.text self.text = "" yield item
def parse_info(self, response): item = LadItem() item["city"] = "海南" item["newsType"] = "警事要闻" item["title"] = response.xpath('//*[@id="artibody"]/table/tr[1]/td/font/text()').extract_first() item["time"] = '2015-9-24' text_list = response.xpath('//*[@id="artibody"]/table/tr[4]/td') if len(text_list) >=2: for str_slt in text_list: if str_slt.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + str_slt.xpath('text()').extract_first() else: if text_list.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + text_list.xpath('text()').extract_first() item["text"] = self.text self.text = "" yield item
def parse_info(self, response): item = LadItem() item["city"] = "西藏" item['newsType'] = '警事要闻' item["title"] = response.xpath( '//*[@id="container"]/div[2]/div/div/div[2]/div[1]/h1/text()' ).extract_first() if response.xpath( '//*[@id="container"]/div[2]/div/div/div[2]/div[2]/span[2]/text()' ) is None: item["time"] = response.xpath( '//*[@id="container"]/div[2]/div/div/div/div[2]/div[2]/span[2]/text()' ).extract_first().split(' ')[0][5:15] else: item["time"] = response.xpath( '//*[@id="container"]/div[2]/div/div/div[2]/div[2]/span[2]/text()' ).extract_first().split(' ')[0][5:15] text_list = response.xpath( '//*[@id="container"]/div[2]/div/div/div[3]/p/span') for str_slt in text_list: if str_slt.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + str_slt.xpath('text()').extract_first() item["text"] = self.text self.text = "" yield item
def parse_info(self, response): item = LadItem() item["city"] = "新疆" item['newsType'] = '警事要闻' item["title"] = response.xpath( '/html/body/div[1]/div[2]/div[2]/div/div/div[1]/text()' ).extract_first() c = response.xpath( '/html/body/div[1]/div[2]/div[2]/div/div/div[2]/text()' ).extract_first().strip().split(' ')[0] c = re.sub("\D", "", c) item["time"] = c[0:4] + '-' + c[4:6] + '-' + c[6:8] if len( response.xpath( '/html/body/div[1]/div[2]/div[2]/div/div/div[2]/text()'). extract_first().strip().split(' ')[0]) == 0: item["time"] = response.xpath( '//*[@id="right"]/div[1]/div[1]/div[1]/text()').extract_first( ).strip().split(' ')[0] item["title"] = response.xpath( '//*[@id="right"]/div[1]/div[1]/h3/text()').extract_first() text_list = response.xpath( '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/p/span') if len(text_list) <= 1: text_list = response.xpath( '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/p/font') if len(text_list) <= 1: text_list = response.xpath( '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/p') if len(text_list) <= 1: text_list = response.xpath('//*[@id="right"]/div[1]/div[2]/p') if len(text_list) <= 1: text_list = response.xpath( '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/table/tbody/tr/td/p/span' ) if len(text_list) <= 1: text_list = response.xpath( '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/p/span') if len(text_list) <= 1: text_list = response.xpath( '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/span') if len(text_list) > 1: self.text = processText(text_list) else: self.text = response.xpath( '/html/body/div[1]/div[2]/div[2]/div/div/div[5]/span/text()' ).extract_first() item["text"] = self.text self.text = "" yield item
def parse(self, response):
    should_deep = True
    times = response.xpath(
        '/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/table/tr/td/div/text()'
    ).extract()
    urls = response.xpath(
        '/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/table/tr/td/div/a/@href'
    ).extract()
    valid_child_urls = list()
    for time, url in zip(times, urls):
        try:
            time_now = datetime.strptime(time, '%Y-%m-%d')
            self.update_last_time(time_now)
        except ValueError:
            break
        if self.last_time is not None and self.last_time >= time_now:
            # Everything from here on was seen in a previous crawl.
            should_deep = False
            break
        valid_child_urls.append("http://www.njga.gov.cn/www/njga/2010/" + url)
    next_requests = list()
    if should_deep:
        # New articles found; queue the next listing page.
        if len(response.url) == 45:
            next_url = 'http://www.njga.gov.cn/www/njga/2010/zabb_p1.htm'
        else:
            # Single-digit page counter parsed out of the URL.
            num = int(response.url.split('/')[6][6])
            next_url = ('http://www.njga.gov.cn/www/njga/2010/zabb_p'
                        + str(num + 1) + ".htm")
        next_requests.append(
            scrapy.Request(url=next_url, callback=self.parse))
    for index, temp_url in enumerate(valid_child_urls):
        req = scrapy.Request(url=temp_url, callback=self.parse_info)
        hit_time = times[index]
        m_item = LadItem()
        m_item['time'] = hit_time
        # Hand the pre-filled item to parse_info via request meta.
        req.meta['item'] = m_item
        next_requests.append(req)
    for req in next_requests:
        yield req
def parse(self, response): should_deep = True times = response.xpath('//*[@width="200px"]/text()').extract() urls = response.xpath( '/html/body/div[3]/div/div/div/div[3]/div/table/tbody/tr/td/div/a/@href' ).extract() valid_child_urls = list() for time, url in zip(times, urls): try: time_now = datetime.strptime(time[1:11], '%Y-%m-%d') self.update_last_time(time_now) except: break if self.last_time is not None and self.last_time >= time_now: should_deep = False break valid_child_urls.append("http://www.jsga.gov.cn" + url) next_requests = list() if should_deep: # 表示有新的url # 翻页 if len(response.url) == 43: next_url = "http://www.jsga.gov.cn/jwzx/aqff/index_2.html" else: num = int(response.url.split('index')[1][1]) next_url_part = "index_" + str(num + 1) + ".html" next_url = "http://www.jsga.gov.cn/jwzx/aqff/" + next_url_part yield scrapy.Request(url=next_url, callback=self.parse) next_requests.append( scrapy.Request(url=next_url, callback=self.parse)) for index, temp_url in enumerate(valid_child_urls): req = scrapy.Request(url=temp_url, callback=self.parse_info) hit_time = times[index] m_item = LadItem() m_item['time'] = hit_time # 相当于在request中加入了item这个元素 req.meta['item'] = m_item next_requests.append(req) for req in next_requests: yield req
def parse(self, response): should_deep = True times = response.xpath( '//*[@class="lists"]/ul/li/span/text()').extract() urls = response.xpath('//*[@class="lists"]/ul/li/a/@href').extract() valid_child_urls = list() for time, url in zip(times, urls): try: time_now = datetime.strptime(time[1:11], '%Y-%m-%d') self.update_last_time(time_now) except: break if self.last_time is not None and self.last_time >= time_now: should_deep = False break valid_child_urls.append('http://gaj.km.gov.cn' + url) next_requests = list() if should_deep: # 表示有新的url # 翻页 if len(response.url) == 31: next_url = "http://gaj.km.gov.cn/zxdt/jwdt/index_2.shtml" else: part_str = response.url.split('/')[5] num = int(part_str[6]) next_url_part = "index_" + str(num + 1) + ".shtml" next_url = response.url.split('index')[0] + next_url_part next_requests.append( scrapy.Request(url=next_url, callback=self.parse)) for index, temp_url in enumerate(valid_child_urls): req = scrapy.Request(url=temp_url, callback=self.parse_info) hit_time = times[index] m_item = LadItem() m_item['time'] = hit_time # 相当于在request中加入了item这个元素 req.meta['item'] = m_item next_requests.append(req) for req in next_requests: yield req
def parse(self, response): should_deep = True times = response.xpath('//*[@id="yun1"]/tr/td/text()[2]').extract()[1:] urls = response.xpath('//*[@id="yun1"]/tr/td/a/@href').extract() valid_child_urls = list() for time, url in zip(times, urls): try: time_now = datetime.strptime(time[3:13], '%Y-%m-%d') self.update_last_time(time_now) except: break if self.last_time is not None and self.last_time >= time_now: should_deep = False break valid_child_urls.append("http://www.bjgaj.gov.cn" + url) next_requests = list() if should_deep: # 表示有新的url # 翻页 if len(response.url) <= 42: next_url = 'http://www.bjgaj.gov.cn/web/listPage_allJfts_col1167_30_2.html' else: num = int(response.url[56]) next_url = response.url[0:56] + str(num + 1) + ".html" next_requests.append( scrapy.Request(url=next_url, callback=self.parse)) for index, temp_url in enumerate(valid_child_urls): req = scrapy.Request(url=temp_url, callback=self.parse_info) hit_time = times[index] m_item = LadItem() m_item['time'] = hit_time # 相当于在request中加入了item这个元素 req.meta['item'] = m_item next_requests.append(req) for req in next_requests: yield req
def parse_info(self, response): item = LadItem() item["city"] = "重庆" item["newsType"] = "警事要闻" item["title"] = response.xpath( '/html/body/table[4]/tr/td/table[2]/tr/td/text()').extract_first( ).strip() item["time"] = response.xpath( '/html/body/table[4]/tr/td/table[4]/tr/td/text()[1]' ).extract_first().strip()[10:21] text_list = response.xpath( '//*[@id="Zoom"]/articlepagebegin/div/div/p/span') if len(text_list) == 0: text_list = response.xpath( '//*[@id="Zoom"]/articlepagebegin/div/div/p') if len(text_list) == 0: text_list = response.xpath( '//*[@id="Zoom"]/articlepagebegin/p/span') if len(text_list) == 0: text_list = response.xpath( '//*[@id="Zoom"]/articlepagebegin/p/font/text()') self.lag = 1 if len(text_list) == 0: text_list = response.xpath('//*[@id="Zoom"]/articlepagebegin/p') if self.flag == 1: for str_slt in text_list: if str_slt.extract() is None: self.text = self.text else: self.text = self.text + str_slt.extract() self.flag = 0 else: for str_slt in text_list: if str_slt.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + str_slt.xpath( 'text()').extract_first() item["text"] = self.text self.text = "" yield item
def parse_info(self, response): item = LadItem() item["city"] = "南京" item["newsType"] = "治安播报" item["title"] = response.xpath('/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/center/table[2]/tr[1]/td/div/strong/text()').extract_first().strip() time_leng = len(response.xpath('/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/center/table[2]/tr[2]/td/div/text()[1]').extract_first().strip().split(']')[0].strip()) item["time"] = response.xpath('/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/center/table[2]/tr[2]/td/div/text()[1]').extract_first().strip().split(']')[0].strip()[time_leng - 10 : time_leng] text_list = response.xpath('/html/body/div/table[1]/tr/td[3]/table[5]/tr/td/table/tr/td/table/tr[2]/td/center/table[3]/tr/td/div/div/div/span/text()') for p_slt in text_list: if p_slt.extract() is None: self.text = self.text else: self.text = self.text + p_slt.extract() item["text"] = self.text self.text = "" yield item
def parse_info(self, response): item = LadItem() item["city"] = "辽宁" item["newsType"] = "警事要闻" item["title"] = response.xpath( '//*[@id="activity-name"]/text()').extract_first().strip() item["time"] = response.xpath( '//*[@id="post-date"]/text()').extract_first() text_list = response.xpath('//*[@id="js_content"]/p/span/text()') for p_slt in text_list: if p_slt.extract() is None: self.text = self.text else: self.text = self.text + p_slt.extract() item["text"] = self.text self.text = "" yield item
def parse_info(self, response): item = LadItem() item["city"] = "浙江" item["newsType"] = "警事要闻" item["title"] = response.xpath( '/html/body/table[6]/tr/td/table[2]/tr/td/table[2]/tr/td/text()' ).extract_first() time_leng = len( response.xpath( '/html/body/table[6]/tr/td/table[2]/tr/td/table[3]/tr/td/table[2]/tr/td/text()' ).extract_first().strip()) item["time"] = response.xpath( '/html/body/table[6]/tr/td/table[2]/tr/td/table[3]/tr/td/table[2]/tr/td/text()' ).extract_first().strip()[time_leng - 10:time_leng] text_list = response.xpath( '/html/body/table[6]/tr/td/table[2]/tr/td/table[3]/tr/td/table[6]/tr/td[1]/div/p/font' ) if len(text_list) == 0: text_list = response.xpath( '/html/body/table[6]/tr/td/table[2]/tr/td/table[3]/tr/td/table[6]/tr/td[1]/div/p' ) if len(text_list) == 1: if text_list.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + text_list.xpath( 'text()').extract_first() for str_slt in text_list: if str_slt.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + str_slt.xpath('text()').extract_first() item["text"] = self.text self.text = "" yield item
def parse(self, response): should_deep = True times = response.xpath( '//*[@class="article_list"]/li/span/text()').extract() urls = response.xpath( '//*[@class="article_list"]/li/a/@href').extract() valid_child_urls = list() for time, url in zip(times, urls): try: time_now = datetime.strptime(time, '%Y-%m-%d') self.update_last_time(time_now) except: break if self.last_time is not None and self.last_time >= time_now: should_deep = False break valid_child_urls.append("http://www.qhga.gov.cn" + url) next_requests = list() for index, temp_url in enumerate(valid_child_urls): req = scrapy.Request(url=temp_url, callback=self.parse_info) hit_time = times[index] m_item = LadItem() m_item['time'] = hit_time # 相当于在request中加入了item这个元素 req.meta['item'] = m_item next_requests.append(req) for req in next_requests: yield req
def parse_info(self, response): item = LadItem() item["city"] = "北京" item["newsType"] = '警事要闻' item["title"] = response.xpath( '/html/body/table[3]/tr/td/table[2]/tr/td[3]/table/tr/td/table/tr[2]/td/table/tr[1]/td/font/b/text()' ).extract_first() item["time"] = response.xpath( '/html/body/table[3]/tr/td/table[2]/tr/td[3]/table/tr/td/table/tr[2]/td/table/tr[2]/td/text()' ).extract_first().split('www.bjgaj.gov.cn')[1].strip() text_list = response.xpath('//*[@id="articleContent"]/p') for p_slt in text_list: if p_slt.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + p_slt.xpath('text()').extract_first() item["text"] = self.text self.text = "" yield item
def parse_info(self, response): item = LadItem() item["newsType"] = '警事要闻' item["title"] = response.xpath( '/html/body/table[4]/tr/td/table/tr[2]/td/table/tr/td/table/tr[1]/td/div/span/text()' ).extract_first() item["time"] = response.xpath( '/html/body/table[4]/tr/td/table/tr[2]/td/table/tr/td/table/tr[4]/td/div/text()[2]' ).extract_first().strip() text_list = response.xpath( '/html/body/table[4]/tr/td/table/tr[2]/td/table/tr/td/table/tr[4]/td/table[1]/tr[2]/td/span/div/div/div/p' ) if text_list.xpath('text()').extract_first() is None: self.text = self.text else: self.text = self.text + text_list.xpath('text()').extract_first() item["text"] = self.text self.text = "" yield item