def parse(self, response):
    """Paginate through the notice table and yield one item per data row.

    NOTE(review): this issues blocking ``requests.post`` calls from inside a
    Scrapy callback, which stalls the reactor; consider yielding
    ``scrapy.FormRequest`` objects instead — TODO confirm with the team.
    """
    article_tmp_url = 'https://b2b.10086.cn/b2b/main/viewNoticeContent.html?noticeBean.id={0}'
    for page in range(1, self.endPageNum):
        form_data = {
            "page.currentPage": str(page),
            "page.perPageSize": "20",
            "noticeBean.companyName": "",
            "noticeBean.title": "",
            "noticeBean.startDate": "",
            "noticeBean.endDate": "",
        }
        # FIX: keep the Scrapy `response` argument intact — the original
        # rebound it to the requests.Response on every iteration.
        page_resp = requests.post(self.tmpl_url, headers=self.headers,
                                  data=form_data)
        rows = scrapy.Selector(text=page_resp.text).xpath(
            '//table[@class="jtgs_table"]//tr')
        for row in rows[1:]:  # first <tr> is the header row
            item = BiddinginfospiderItem()
            anchor = row.xpath(".//a")
            # onclick embeds the notice id; slice off the JS call wrapper
            # (renamed from `id`, which shadowed the builtin).
            notice_id = row.xpath('@onclick').get()[14:-2]
            item.update(
                title=anchor.xpath('.//text()').get(),
                href=article_tmp_url.format(notice_id),
            )
            yield item
def parse_start_url(self, response):
    """Parse the news listing and yield one item per anchor.

    Extracts code/title/href and a normalized creation time from each entry.
    """
    print('request_url= ', response.request.url)
    anchors = response.xpath('//ul[@class="newslist"]//a')
    for a in anchors:
        item = BiddinginfospiderItem()
        title = a.xpath("..//h1").xpath('normalize-space(string(.))').get()
        href = a.xpath('.//@href').get()
        code = a.xpath(
            './/ul[@class="newsinfo"]//li[1]//span//text()').get()
        raw_date = a.xpath('.//div[@class="newsDate"]').xpath(
            'normalize-space(string(.))').get()
        # FIX: the original left `ctime` unbound when the date was missing
        # (NameError on the first such row, stale value afterwards).
        ctime = None
        if raw_date:
            raw_date = raw_date.replace(" ", "").replace("/", "-")
            # Re-insert a dash at index 3/4 — presumably normalizes the
            # site's date format; TODO confirm against a live page.
            ctime = raw_date[:3] + "-" + raw_date[4:]
        item.update(
            code=code,
            industry=self.industry,
            category=self.category,
            title=title,
            ctime=ctime,
            href=href,
        )
        yield item
def parse_page(self, response):
    """Parse the JSON listing response and yield one item per record."""
    if not response:
        return BiddinginfospiderItem()
    print('request_url= ', response.request.url)
    body = json.loads(str(response.body, "utf-8"))
    records = body.get("data")
    print("Num :", len(records))
    for rec in records:
        item = BiddinginfospiderItem()
        sheng = rec.get('districtShow')
        shi = self.getSHI(rec.get('platformName'))
        # FIX: the original had a stray trailing comma after
        # `l.get("url"),` which wrapped the href in a 1-tuple and then
        # needed an isinstance() check to unwrap it.
        href = rec.get("url")
        print("href is,", href)
        item.update(
            city=sheng + "-" + shi if shi else sheng,
            title=rec.get("title"),
            ctime=rec.get("timeShow"),
            category=rec.get("classifyShow"),
            href=href,
            industry=rec.get("tradeShow"),
        )
        print("ITEM IS")
        yield item
def parse_start_url(self, response):
    """Yield one item per project record in the JSON "obj" payload."""
    print('request_url= ', response.request.url)
    payload = json.loads(str(response.body, "utf-8"))
    records = payload.get("obj")
    print(len(records))
    for rec in records:
        item = BiddinginfospiderItem()
        # Build the detail URL from the record's own URL plus its id.
        detail_url = rec.get("URL", "") + "&id="
        record_id = rec.get("ID", "")
        href = response.urljoin("?getNoticeDetail&url=" + detail_url + record_id)
        print(href)
        item.update(
            category=self.category_dict[rec.get("TABLENAME")],
            title=rec.get("PROJECTNAME"),
            ctime=rec.get("RECEIVETIME"),
            href=href,
            code=rec.get("PROJECTCODE"),
        )
        yield item
def parse_page(self, response):
    """Yield one item per <li> entry in the filter-content listing."""
    for entry in response.xpath('//div[@class="filter-content"]/ul/li'):
        anchor = entry.xpath('./a')
        item = BiddinginfospiderItem()
        item.update(
            title=anchor.xpath('.//@title').extract_first(),
            ctime=self.get_ctime(anchor.xpath('.//span[@class="time"]//text()')),
            href=response.urljoin(anchor.xpath('.//@href').extract_first()),
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per listing row on the start page."""
    print('request_url= ', response.request.url)
    for row in response.xpath('//li[@class="now-hd-items clearfix"]'):
        anchor = row.xpath('./a')
        item = BiddinginfospiderItem()
        item.update(
            title=anchor.xpath('.//@title').extract_first(),
            ctime=self.get_ctime(row.xpath('.//span//text()')),
            href=response.urljoin(anchor.xpath('.//@href').extract_first()),
        )
        yield item
def parse(self, response):
    """Yield one item per anchor in the right-column list."""
    for anchor in response.xpath('//div[@class="W750 Right"]//li//a'):
        item = BiddinginfospiderItem()
        href = response.urljoin(anchor.xpath('.//@href').extract_first())
        # FIX: extract_first() returns None for a text-less anchor; the
        # original then crashed with AttributeError on .strip().
        title = (anchor.xpath(".//text()").extract_first() or "").strip()
        ctime = anchor.xpath('..//..//span//text()').extract_first()
        item.update(
            href=href,
            title=title,
            ctime=ctime,
            city='南方电网',
        )
        yield item
def parse_page(self, response):
    """Yield one item per title cell in the results table."""
    sel = scrapy.Selector(response)
    for cell in sel.xpath('//div[@class="titlecss"]'):
        anchor = cell.xpath(".//a")
        item = BiddinginfospiderItem()
        item.update(
            title=anchor.xpath('.//@title').get(),
            href=response.urljoin(anchor.xpath('.//@href').get()),
            # Date lives in the next <td> sibling of the cell's parent.
            ctime=self.get_ctime(
                cell.xpath('../following-sibling::td[1]//text()')),
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per table row of the service list."""
    print('request_url= ', response.request.url)
    for row in response.xpath('//div[@class="list_service"]//tr'):
        anchor = row.xpath('.//a')
        item = BiddinginfospiderItem()
        item.update(
            category=self.category,
            title=anchor.xpath('.//@title').extract_first(),
            ctime=self.get_ctime(row.xpath(".//td[2]//text()")),
            href=response.urljoin(anchor.xpath('.//@href').extract_first()),
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per anchor under the named list items."""
    print('request_url= ', response.request.url)
    for anchor in response.xpath('//li[@name="li_name"]//a'):
        item = BiddinginfospiderItem()
        item.update(
            ctime=self.get_ctime(anchor.xpath('.//em[1]//text()')),
            industry=self.industry,
            category=self.category,
            title=anchor.xpath('@title').get(),
            href=response.urljoin(anchor.xpath('.//@href').get()),
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per news entry; city is hard-wired to 湖北."""
    print('request_url= ', response.request.url)
    for entry in response.xpath('//ul[@class="ewb-news-items"]//li'):
        anchor = entry.xpath('.//a')
        item = BiddinginfospiderItem()
        item.update(
            city="湖北",
            title=anchor.xpath('.//@title').extract_first(),
            ctime=self.get_ctime(anchor.xpath('.//span//text()')),
            href=response.urljoin(anchor.xpath('.//@href').extract_first()),
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per list-item entry on the start page."""
    print('request_url= ', response.request.url)
    for entry in response.xpath('//li[@class="list-item"]'):
        anchor = entry.xpath('./a')
        item = BiddinginfospiderItem()
        item.update(
            industry=self.industry,
            category=self.category,
            title=anchor.xpath('.//@title').extract_first(),
            ctime=self.get_ctime(anchor.xpath('.//span//text()')),
            href=response.urljoin(anchor.xpath('.//@href').extract_first()),
        )
        yield item
def parse_page(self, response):
    """Yield one item per anchor in the wsbs table; city fixed to 广东."""
    print(response.request.url)
    for anchor in response.xpath('//table[@class="wsbs-table"]//a'):
        item = BiddinginfospiderItem()
        item.update(
            category=self.category,
            industry=self.industry,
            title=anchor.xpath('.//text()').extract_first(),
            # Date text lives in the sibling <td> cells two levels up.
            ctime=self.get_ctime(anchor.xpath('../../td//text()')),
            href=response.urljoin(anchor.xpath('.//@href').extract_first()),
            city="广东",
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per entry in the lb-link list.

    NOTE: href is taken as-is from the page (not urljoin'd).
    """
    for entry in response.xpath('//div[@class="lb-link"]/ul//li'):
        anchor = entry.xpath('.//a')
        item = BiddinginfospiderItem()
        item.update(
            industry=self.industry,
            category=self.category,
            title=anchor.xpath("@title").get(),
            ctime=self.get_ctime(
                entry.xpath('.//span[@class="bidDate"]//text()')),
            href=anchor.xpath("@href").get(),
        )
        yield item
def parse_page(self, response):
    """Yield one item per gridview row of the results table."""
    sel = Selector(response)
    for row in sel.xpath('//tr[@class="gridview1_RowStyle"]'):
        anchor = row.xpath(".//a")
        item = BiddinginfospiderItem()
        # The last RowTD cell holds the date text.
        date_cell = row.xpath('.//td[@class="gridview_RowTD"][last()]')
        item.update(
            category=self.category,
            industry=self.industry,
            title=anchor.xpath('.//text()').extract_first(),
            ctime=self.get_ctime(date_cell),
            href=response.urljoin(anchor.xpath('.//@href').extract_first()),
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per row in the JSON "rows" payload."""
    print('request_url= ', response.request.url)
    payload = json.loads(str(response.body, "utf-8"))
    detail_tmpl = ("http://epp.ctg.com.cn/infoview/"
                   "?fileId={0}&openFor=ZBGG&typeFor=undefined")
    for row in payload.get("rows"):
        item = BiddinginfospiderItem()
        article_id = row.get('ARTICLE_ID')
        item.update(
            industry=self.industry,
            category=self.category,
            title=row.get('TITLE'),
            ctime=row.get('CREATED_TIME'),
            href=detail_tmpl.format(article_id),
        )
        yield item
def parse_page(self, response):
    """Yield one item per abstract box on the listing page."""
    boxes = response.xpath(
        '//div[@class="abstract-box mg-t25 ebnew-border-bottom mg-r15"]')
    for box in boxes:
        anchor = box.xpath('.//a')
        item = BiddinginfospiderItem()
        # City sits in the second info column's second <p>/<span>.
        city = box.xpath(
            './/div[@class="abstract-content-items fl pd-l15 pd-t20 pd-b20 width-50"][2]//p[2]//span[2]//text()'
        ).extract_first()
        item.update(
            title=anchor.xpath('.//@title').extract_first(),
            ctime=self.get_ctime(box.xpath('.//i[2]//text()')),
            href=response.urljoin(anchor.xpath('.//@href').extract_first()),
            city=city,
        )
        yield item
def parse_page(self, response):
    """Yield one item per anchor in the listCss cells.

    The detail id is spliced out of a JS-style href after stripping
    literal ``\\r``/``\\n`` escape sequences and whitespace.
    """
    sel = scrapy.Selector(response)
    detail_tmpl = 'http://ec.ccccltd.cn/PMS/gysCggg.shtml?id={0}'
    for anchor in sel.xpath('//td[@class="listCss"]//a'):
        item = BiddinginfospiderItem()
        raw_href = anchor.xpath('.//@href').get()
        cleaned = "".join(
            raw_href.replace("\\r", "").replace("\\n", "").split())
        doc_id = cleaned[23:-3]
        item.update(
            title=anchor.xpath('normalize-space(string(.))').get(),
            href=detail_tmpl.format(doc_id),
            ctime=self.get_ctime(
                anchor.xpath('../following-sibling::td[1]//text()')),
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per data row of the table (first row is the header)."""
    print('request_url= ', response.request.url)
    rows = response.xpath('//table[@class="table_text"]//tr')
    for row in rows[1:]:  # skip the header row
        anchor = row.xpath('.//a')
        item = BiddinginfospiderItem()
        item.update(
            industry=row.xpath(".//td[2]//span//text()").extract_first(),
            title=anchor.xpath('.//@title').extract_first(),
            ctime=self.get_ctime(anchor.xpath('.//td[5]//span//text()')),
            href=response.urljoin(anchor.xpath('.//@href').extract_first()),
            city=row.xpath(".//td[3]//span//@title").extract_first(),
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per gccon_title anchor on the start page."""
    print('request_url= ', response.request.url)
    anchors = response.xpath('//a[@class="gccon_title"]')
    print(anchors.getall())
    for anchor in anchors:
        item = BiddinginfospiderItem()
        # Date lives in a sibling span next to the anchor.
        date_sel = anchor.xpath('../span[@class="gc_date"]').xpath(
            'normalize-space(string(.))')
        item.update(
            industry=self.industry,
            category=self.category,
            title=anchor.xpath('normalize-space(string(.))').get(),
            ctime=self.get_ctime(date_sel),
            href=anchor.xpath('.//@href').get(),
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per article-list entry, with city/category parsed
    out of the "key:value" metadata divs."""
    print('request_url= ', response.request.url)
    for entry in response.xpath('//ul[@class="article-list2"]//li'):
        anchor = entry.xpath('.//a')
        item = BiddinginfospiderItem()
        meta = entry.xpath('.//div[@class="list-t2"]').xpath(
            'normalize-space(string(.))').extract()
        # meta[0] carries the city, meta[2] the category ("label:value").
        item.update(
            city=meta[0].split(":")[1],
            category=meta[2].split(":")[1],
            title=anchor.xpath('normalize-space(string(.))').extract_first(),
            ctime=self.get_ctime(
                entry.xpath('.//div[@class="list-times"]//text()')),
            href=response.urljoin(anchor.xpath('.//@href').extract_first()),
        )
        yield item
def parse_start_url(self, response):
    """Yield one item per data row of the results table.

    The first <tr> is a header and is skipped; the detail href is built
    from two numeric parameters embedded in the anchor's onclick handler.
    """
    print('request_url= ', response.request.url)
    rows = response.xpath("//tr[@align='left']")
    for row in rows[1:]:  # skip the header row
        item = BiddinginfospiderItem()
        cells = row.xpath(".//td")
        link = cells[2].xpath('.//a')
        code = cells[1].xpath('normalize-space(string(.))').extract_first()
        title = link.xpath('@title').extract_first()
        ctime = cells[3].xpath('normalize-space(string(.))').extract_first()
        onclick = link.xpath('@onclick').extract_first()
        # FIX: raw string — the original "\'(\d+)\'" used the invalid
        # escape \d (SyntaxWarning on modern Python); pattern value is
        # unchanged.
        param_lst = self.get_re(r"'(\d+)'", onclick)
        href = self.article_tmp.format(param_lst[0], param_lst[1])
        item.update(
            code=code,
            title=title,
            ctime=ctime,
            href=href,
        )
        yield item