def parse_item(self, response):
    """Build a ``NewsItem`` from an article page and yield it.

    The item carries the page URL (``link``), the text of the ``<title>``
    element (``title``), a ``date`` assembled from three URL path segments,
    the ``province``/``city`` pair returned by ``get_province_city`` for the
    title, and the joined article text (``content``, or the string
    ``"null"`` when no text was found).

    Ordinary failures (for example a page without a ``<title>``) are logged
    and the page yields nothing; the crawl is not interrupted.
    """
    import logging

    # Content selectors, tried in order; the first one that matches wins.
    content_xpaths = (
        "//div[@id='articlecontent']/p/text()",
        "//div[@id='articlecontent']/P/text()",
        "//div[@id='articlecontent']/FONT/text()",
        "//div[@id='articlecontent']/text()",
        "//font[@id='zoom']/text()",
        "//div[@id='zoom']/text()",
    )

    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select("//head/title/text()").extract()[0]

        # The date is taken from the three URL segments that precede the
        # final one.
        segments = item['link'].split('/')
        item['date'] = "-".join((segments[-4], segments[-3], segments[-2]))

        province, city = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city

        content_list = []
        for xpath in content_xpaths:
            content_list = hxs.select(xpath).extract()
            if content_list:
                break

        if content_list:
            item['content'] = ''.join(content_list)
        else:
            item['content'] = ''
            print('None')
        if item['content'] == '':
            item['content'] = "null"

        yield item
        # NOTE(review): time.sleep blocks the calling thread. If this
        # callback runs inside Scrapy's reactor, the DOWNLOAD_DELAY setting
        # is the non-blocking way to throttle -- confirm before changing.
        time.sleep(1)
    except Exception:
        # Keep the best-effort behaviour (skip the page) but leave a trace,
        # and let GeneratorExit / KeyboardInterrupt propagate.
        logging.getLogger(__name__).exception(
            "parse_item failed for %s", getattr(response, 'url', None))
def parse_item(self, response):
    """Build a ``NewsItem`` from an article page and yield it.

    The item carries the page URL (``link``), the text of the ``<title>``
    element (``title``), a ``date`` taken from the first whitespace-separated
    token of the first ``div.about > span`` text node, the
    ``province``/``city`` pair returned by ``get_province_city`` for the
    title, and the joined article text (``content``, or the string
    ``"null"`` when no text was found).

    Ordinary failures (for example a page without a ``<title>``) are logged
    and the page yields nothing; the crawl is not interrupted.
    """
    import logging

    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select("//head/title/text()").extract()[0]

        about_text = hxs.select(
            "//div[@class='about']/span/text()").extract()[0]
        item['date'] = about_text.split()[0]

        province, city = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city

        # Disabled fallback selector: //div[@id='endtext']//FONT/text()
        content_list = hxs.select(
            "//dl[@id='content']/dd//p/text()").extract()
        if content_list:
            item['content'] = ''.join(content_list)
        else:
            item['content'] = ''
            print('None')
        if item['content'] == '':
            item['content'] = "null"

        yield item
        # NOTE(review): time.sleep blocks the calling thread. If this
        # callback runs inside Scrapy's reactor, the DOWNLOAD_DELAY setting
        # is the non-blocking way to throttle -- confirm before changing.
        time.sleep(random())
    except Exception:
        # Keep the best-effort behaviour (skip the page) but leave a trace,
        # and let GeneratorExit / KeyboardInterrupt propagate.
        logging.getLogger(__name__).exception(
            "parse_item failed for %s", getattr(response, 'url', None))
def parse_item(self, response):
    """Build a ``NewsItem`` from an article page and yield it.

    The item carries the page URL (``link``), the text of the ``<title>``
    element (``title``), a ``date`` equal to the second-to-last URL path
    segment, the ``province``/``city`` pair returned by
    ``get_province_city`` for the title, and the joined article text
    (``content``, or the string ``"null"`` when no text was found).

    Ordinary failures (for example a page without a ``<title>``) are logged
    and the page yields nothing; the crawl is not interrupted.
    """
    import logging

    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select("//head/title/text()").extract()[0]

        # The URL segment just before the final one is used as the date.
        item['date'] = item['link'].split('/')[-2]

        province, city = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city

        # Disabled fallback selector:
        # //div[@class='yjl_fx168_article_zhengwen']/div[@class='TRS_Editor']//p/text()
        content_list = hxs.select("//div[@id='content']/p/text()").extract()
        if content_list:
            item['content'] = ''.join(content_list)
        else:
            item['content'] = ''
            print('None')
        if item['content'] == '':
            item['content'] = "null"

        yield item
        # NOTE(review): time.sleep blocks the calling thread. If this
        # callback runs inside Scrapy's reactor, the DOWNLOAD_DELAY setting
        # is the non-blocking way to throttle -- confirm before changing.
        time.sleep(0.5)
    except Exception:
        # Keep the best-effort behaviour (skip the page) but leave a trace,
        # and let GeneratorExit / KeyboardInterrupt propagate.
        logging.getLogger(__name__).exception(
            "parse_item failed for %s", getattr(response, 'url', None))
def parse_item(self, response):
    """Build a ``NewsItem`` from an article page and yield it.

    The item carries the page URL (``link``), the text of the ``<title>``
    element (``title``), a ``date`` cut out of the second
    whitespace-separated token of a small grey ``<font>`` element, the
    ``province``/``city`` pair returned by ``get_province_city`` for the
    title, and the text of every ``<p>`` on the page joined together
    (``content``, or the string ``"null"`` when no text was found).

    Ordinary failures (for example a page without a ``<title>``) are logged
    and the page yields nothing; the crawl is not interrupted.
    """
    import logging

    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select("//head/title/text()").extract()[0]

        tokens = hxs.select(
            "//font[@style='FONT-SIZE: 9pt; COLOR: #666666']/text()"
        ).extract()[0].split()
        stamp = tokens[1]
        # Characters 0-3, 5-6 and 8..(last-1) are kept; positions 4, 7 and
        # the final character are skipped.
        # NOTE(review): presumably a YYYY?MM?DD? layout with one-character
        # separators -- confirm against a live page.
        item['date'] = stamp[:4] + "-" + stamp[5:7] + "-" + stamp[8:-1]

        province, city = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city

        # Disabled fallback selector: //div[@id='endtext']//FONT/text()
        content_list = hxs.select("//p/text()").extract()
        if content_list:
            item['content'] = ''.join(content_list)
        else:
            item['content'] = ''
            print('None')
        if item['content'] == '':
            item['content'] = "null"

        yield item
        # NOTE(review): time.sleep blocks the calling thread. If this
        # callback runs inside Scrapy's reactor, the DOWNLOAD_DELAY setting
        # is the non-blocking way to throttle -- confirm before changing.
        time.sleep(random())
    except Exception:
        # Keep the best-effort behaviour (skip the page) but leave a trace,
        # and let GeneratorExit / KeyboardInterrupt propagate.
        logging.getLogger(__name__).exception(
            "parse_item failed for %s", getattr(response, 'url', None))
def parse_item(self, response):
    """Build a ``NewsItem`` from an article page and yield it.

    The item carries the page URL (``link``), the text of the ``<title>``
    element (``title``), a ``date`` assembled from two URL path segments,
    the ``province``/``city`` pair returned by ``get_province_city`` for the
    title, and the text of every ``<p>`` on the page joined together
    (``content``, or the string ``"null"`` when no text was found).

    Ordinary failures (for example a page without a ``<title>``) are logged
    and the page yields nothing; the crawl is not interrupted.
    """
    import logging

    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select("//head/title/text()").extract()[0]

        # The date is taken from the two URL segments that precede the
        # final one.
        segments = item['link'].split('/')
        item['date'] = "-".join((segments[-3], segments[-2]))

        province, city = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city

        # Disabled fallback selector:
        # //div[@class='yjl_fx168_article_zhengwen']/div[@class='TRS_Editor']//p/text()
        content_list = hxs.select("//p/text()").extract()
        if content_list:
            item['content'] = ''.join(content_list)
        else:
            item['content'] = ''
            print('None')
        if item['content'] == '':
            item['content'] = "null"

        yield item
        # NOTE(review): time.sleep blocks the calling thread. If this
        # callback runs inside Scrapy's reactor, the DOWNLOAD_DELAY setting
        # is the non-blocking way to throttle -- confirm before changing.
        time.sleep(0.5)
    except Exception:
        # Keep the best-effort behaviour (skip the page) but leave a trace,
        # and let GeneratorExit / KeyboardInterrupt propagate.
        logging.getLogger(__name__).exception(
            "parse_item failed for %s", getattr(response, 'url', None))
def parse_item(self, response):
    """Build a ``NewsItem`` from an article page and yield it.

    Fields set on the item:

    * ``link`` -- the response URL.
    * ``title`` -- text of the ``<title>`` element.
    * ``date`` -- cut out of the second whitespace-separated token of a
      small grey ``<font>`` element.
    * ``province`` / ``city`` -- whatever ``get_province_city`` returns for
      the title.
    * ``content`` -- text of every ``<p>`` on the page joined together, or
      the string ``"null"`` when no text was found.

    Ordinary failures (for example a page without a ``<title>``) are logged
    and the page yields nothing; the crawl is not interrupted.
    """
    import logging

    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select("//head/title/text()").extract()[0]

        font_text = hxs.select(
            "//font[@style='FONT-SIZE: 9pt; COLOR: #666666']/text()"
        ).extract()[0]
        stamp = font_text.split()[1]
        # Characters 0-3, 5-6 and 8..(last-1) are kept; positions 4, 7 and
        # the final character are skipped.
        # NOTE(review): presumably a YYYY?MM?DD? layout with one-character
        # separators -- confirm against a live page.
        item['date'] = stamp[:4] + "-" + stamp[5:7] + "-" + stamp[8:-1]

        province, city = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city

        # Disabled fallback selector: //div[@id='endtext']//FONT/text()
        content_list = hxs.select("//p/text()").extract()
        if content_list:
            item['content'] = ''.join(content_list)
        else:
            item['content'] = ''
            print('None')
        if item['content'] == '':
            item['content'] = "null"

        yield item
        # NOTE(review): time.sleep blocks the calling thread. If this
        # callback runs inside Scrapy's reactor, the DOWNLOAD_DELAY setting
        # is the non-blocking way to throttle -- confirm before changing.
        time.sleep(random())
    except Exception:
        # Keep the best-effort behaviour (skip the page) but leave a trace,
        # and let GeneratorExit / KeyboardInterrupt propagate.
        logging.getLogger(__name__).exception(
            "parse_item failed for %s", getattr(response, 'url', None))
def parse_item(self, response):
    """Build a ``NewsItem`` from an article page and yield it.

    The item carries the page URL (``link``), the article heading
    (``title``, the ``<h3>`` inside the ``NewsContent`` block), a ``date``
    derived from the second-to-last URL segment, the ``province``/``city``
    pair returned by ``get_province_city`` for the title, and the joined
    article text (``content``, or the string ``"null"`` when no text was
    found).

    The URL segment is split into date parts by length:

    * 6 characters -- 4-character year, 1-character month, 1-character day.
    * 8 characters -- 4-character year, 2-character month, 2-character day.
    * 7 characters -- 2-character month and 1-character day when the fifth
      character is at most ``'2'`` and the sixth is ``'0'``; otherwise
      1-character month and 2-character day.

    For any other length ``date`` is left unset.

    Ordinary failures (for example a page without the heading) are logged
    and the page yields nothing; the crawl is not interrupted.
    """
    import logging

    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select(
            "//div[@class='contentbody']/div[@class='contentleft']"
            "/div[@class='NewsContent']/h3/text()").extract()[0]

        stamp = item['link'].split('/')[-2]
        year = stamp[:4]
        if len(stamp) == 6:
            item['date'] = year + '-' + stamp[-2] + '-' + stamp[-1]
        elif len(stamp) == 8:
            item['date'] = year + '-' + stamp[-4:-2] + '-' + stamp[-2:]
        elif len(stamp) == 7:
            # The characters are compared as strings: comparing them with
            # the ints 2 and 0 is always true/false on Python 2 and raises
            # TypeError on Python 3.
            if stamp[4] <= '2' and stamp[5] == '0':
                # Two-character month sits at positions 4-5 of a
                # 7-character stamp (a [-4:-2] slice would start inside
                # the year).
                item['date'] = year + '-' + stamp[4:6] + '-' + stamp[-1]
            else:
                item['date'] = year + '-' + stamp[-3] + '-' + stamp[-2:]

        province, city = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city

        # Disabled fallback selector:
        # //div[@class='yjl_fx168_article_zhengwen']/div[@class='TRS_Editor']//p/text()
        content_list = hxs.select(
            "//div[@id='content']/div[@class='nr']/p/text()").extract()
        if content_list:
            item['content'] = ''.join(content_list)
        else:
            item['content'] = ''
            print('None')
        if item['content'] == '':
            item['content'] = "null"

        yield item
        # NOTE(review): time.sleep blocks the calling thread. If this
        # callback runs inside Scrapy's reactor, the DOWNLOAD_DELAY setting
        # is the non-blocking way to throttle -- confirm before changing.
        time.sleep(0.5)
    except Exception:
        # Keep the best-effort behaviour (skip the page) but leave a trace,
        # and let GeneratorExit / KeyboardInterrupt propagate.
        logging.getLogger(__name__).exception(
            "parse_item failed for %s", getattr(response, 'url', None))
def parse_item(self, response):
    """Build a ``NewsItem`` from an article page and yield it.

    Fields set on the item:

    * ``link`` -- the response URL.
    * ``title`` -- text of the ``<title>`` element.
    * ``date`` -- three URL path segments joined with ``-``.
    * ``province`` / ``city`` -- whatever ``get_province_city`` returns for
      the title.
    * ``content`` -- text nodes from the first content selector that
      matches, joined together, or the string ``"null"`` when none match.

    Ordinary failures (for example a page without a ``<title>``) are logged
    and the page yields nothing; the crawl is not interrupted.
    """
    import logging

    # Content selectors, tried in order; the first one that matches wins.
    content_xpaths = (
        "//div[@id='articlecontent']/p/text()",
        "//div[@id='articlecontent']/P/text()",
        "//div[@id='articlecontent']/FONT/text()",
        "//div[@id='articlecontent']/text()",
        "//font[@id='zoom']/text()",
        "//div[@id='zoom']/text()",
    )

    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select("//head/title/text()").extract()[0]

        # The date is taken from the three URL segments that precede the
        # final one.
        url_parts = item['link'].split('/')
        item['date'] = "-".join(
            (url_parts[-4], url_parts[-3], url_parts[-2]))

        province, city = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city

        content_list = []
        for xpath in content_xpaths:
            content_list = hxs.select(xpath).extract()
            if content_list:
                break

        if content_list:
            item['content'] = ''.join(content_list)
        else:
            item['content'] = ''
            print('None')
        if item['content'] == '':
            item['content'] = "null"

        yield item
        # NOTE(review): time.sleep blocks the calling thread. If this
        # callback runs inside Scrapy's reactor, the DOWNLOAD_DELAY setting
        # is the non-blocking way to throttle -- confirm before changing.
        time.sleep(1)
    except Exception:
        # Keep the best-effort behaviour (skip the page) but leave a trace,
        # and let GeneratorExit / KeyboardInterrupt propagate.
        logging.getLogger(__name__).exception(
            "parse_item failed for %s", getattr(response, 'url', None))
def parse_item(self, response):
    """Build a ``NewsItem`` from an article page and yield it.

    Fields set on the item:

    * ``link`` -- the response URL.
    * ``title`` -- text of the ``<h3>`` inside the ``NewsContent`` block.
    * ``date`` -- derived from the second-to-last URL segment (see below).
    * ``province`` / ``city`` -- whatever ``get_province_city`` returns for
      the title.
    * ``content`` -- joined paragraph text from ``div#content > div.nr``,
      or the string ``"null"`` when no text was found.

    The URL segment is split into date parts by length:

    * 6 characters -- 4-character year, 1-character month, 1-character day.
    * 8 characters -- 4-character year, 2-character month, 2-character day.
    * 7 characters -- 2-character month and 1-character day when the fifth
      character is at most ``'2'`` and the sixth is ``'0'``; otherwise
      1-character month and 2-character day.

    For any other length ``date`` is left unset.

    Ordinary failures (for example a page without the heading) are logged
    and the page yields nothing; the crawl is not interrupted.
    """
    import logging

    try:
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['link'] = response.url
        item['title'] = hxs.select(
            "//div[@class='contentbody']/div[@class='contentleft']"
            "/div[@class='NewsContent']/h3/text()").extract()[0]

        compact = item['link'].split('/')[-2]
        year = compact[:4]
        if len(compact) == 6:
            item['date'] = year + '-' + compact[-2] + '-' + compact[-1]
        elif len(compact) == 8:
            item['date'] = year + '-' + compact[-4:-2] + '-' + compact[-2:]
        elif len(compact) == 7:
            # The characters are compared as strings: comparing them with
            # the ints 2 and 0 is always true/false on Python 2 and raises
            # TypeError on Python 3.
            two_digit_month = compact[4] <= '2' and compact[5] == '0'
            if two_digit_month:
                # Two-character month sits at positions 4-5 of a
                # 7-character stamp (a [-4:-2] slice would start inside
                # the year).
                item['date'] = year + '-' + compact[4:6] + '-' + compact[-1]
            else:
                item['date'] = year + '-' + compact[-3] + '-' + compact[-2:]

        province, city = get_province_city(item['title'])
        item["province"] = province
        item["city"] = city

        # Disabled fallback selector:
        # //div[@class='yjl_fx168_article_zhengwen']/div[@class='TRS_Editor']//p/text()
        content_list = hxs.select(
            "//div[@id='content']/div[@class='nr']/p/text()").extract()
        if content_list:
            item['content'] = ''.join(content_list)
        else:
            item['content'] = ''
            print('None')
        if item['content'] == '':
            item['content'] = "null"

        yield item
        # NOTE(review): time.sleep blocks the calling thread. If this
        # callback runs inside Scrapy's reactor, the DOWNLOAD_DELAY setting
        # is the non-blocking way to throttle -- confirm before changing.
        time.sleep(0.5)
    except Exception:
        # Keep the best-effort behaviour (skip the page) but leave a trace,
        # and let GeneratorExit / KeyboardInterrupt propagate.
        logging.getLogger(__name__).exception(
            "parse_item failed for %s", getattr(response, 'url', None))