# Shared imports assumed by all of the parse methods below (each method
# originally lives in its own spider class; the GGFundNoticeItem import
# path is project-specific and may need adjusting).
import json
import re
from datetime import datetime
from urllib.parse import urljoin

from scrapy.utils.response import get_base_url

from ..items import GGFundNoticeItem


def parse_item(self, response):
    if 'json' in response.url:
        # JSON API variant of the notice list.
        rows = json.loads(response.text)['data']
        for row in rows:
            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.url_entry
            url = row['weburl']
            item['url'] = 'http://www.ixzzcgl.com' + url
            item['title'] = row['short_title']
            publish_time = row['notice_date']
            item['publish_time'] = datetime.strptime(
                publish_time, '%Y-%m-%d') if publish_time else None
            yield item
    else:
        # Plain HTML variant of the notice list.
        rows = response.xpath('//div[@class="dync_state dashed_bd"][2]/a')
        for row in rows:
            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.url_entry
            url = row.xpath('./@href').extract_first()
            item['url'] = 'http://www.ixzzcgl.com' + url
            item['title'] = row.xpath('./b/text()').extract_first()
            publish_time = row.xpath('./em/text()').extract_first()
            item['publish_time'] = datetime.strptime(
                publish_time, '%Y-%m-%d') if publish_time else None
            yield item

def parse_item(self, response): rows = response.xpath('//div[@class="list"]') for row in rows: title = row.xpath('./div[@class="l"]/a/@title').extract_first() url = row.xpath('./div[@class="l"]/a/@href').extract_first() url = urljoin(get_base_url(response), url) publish_time = row.xpath('./div[@class="r"]/text()').re_first( '\d+-\d+-\d+') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['title'] = title item['url'] = url item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d') yield item next_url = response.xpath( '//div[@class="page"]/a[contains(text(),"下一页")]/@href' ).extract_first() if next_url: self.ips.append({ 'url': urljoin(get_base_url(response), next_url), 'ref': response.url })
def parse_item(self, response): rows = response.xpath( '//*[@id="discover"]//div[@class="newslist"]/ul/li') next_url = response.xpath( '//*[@id="discover"]/div[1]//a[text()="下一页"]/@href').extract_first( ) last_url = response.xpath( '//*[@id="discover"]/div[1]//a[text()="末页"]/@href').extract_first( ) for row in rows: url = row.xpath('./div[2]/h3/a/@href').extract_first() url = urljoin(get_base_url(response), url) title = row.xpath('./div[2]/h3/a/text()').extract_first() publish_year = row.xpath('./div[1]/span[2]/text()').extract_first() publish_day = row.xpath('./div[1]/span[1]/text()').extract_first() publish_time = str(publish_year) + '-' + str(publish_day) publish_time = datetime.strptime(publish_time, '%Y-%m-%d') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = publish_time yield item if next_url and next_url != last_url: next_url = urljoin(get_base_url(response), next_url) self.ips.append({'url': next_url, 'ref': response.url})
def parse_item(self, response): rows = response.xpath('//table[@class="views-view-grid cols-1"]//tr') for row in rows: url = row.xpath('./td/span/span/a/@href').extract_first() url = urljoin(get_base_url(response), url) title = row.xpath('./td/span/span/a/text()').extract_first().replace('\t', '').replace('\r', '').replace('\n', '') publish_time = row.xpath('./td/div/div/span/@content').extract_first() publish_time = publish_time[0:10] publish_time = datetime.strptime(publish_time, '%Y-%m-%d') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = publish_time yield item next_url = response.xpath('//ul[@class="pager"]/li/a[contains(text(),"下一页")]/@href').extract_first() if next_url: next_url = urljoin(get_base_url(response), next_url) self.ips.append({ 'url': next_url, 'ref': response.url })
def parse_item(self, response):
    ext = response.meta['ext']
    page = int(ext['page'])
    rows = json.loads(response.text)
    total_pages = int(rows['totalPages'])
    rows = rows['recordList']
    for row in rows:
        url = row['href']
        url = urljoin(get_base_url(response), url)
        title = row['subject']
        publish_time = row['createTime']
        publish_time = datetime.strptime(publish_time, '%Y-%m-%d %H:%M:%S')
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = url
        item['title'] = title
        item['publish_time'] = publish_time
        yield item
    if page < total_pages:
        url = ('http://www.nanhua.net/jSearch/queryNewsListByTypeForJson.shtm'
               '?site=newnanhua&type=201201&limit=16&start=')
        url = url + str(page + 1)
        self.ips.append({
            'url': url,
            'ref': response.url,
            'ext': {'page': str(page + 1)}
        })

def parse_item(self, response): rows = response.xpath('//div[@class="simu-site-list"]/ul/a') for row in rows: url = row.xpath('./@href').extract_first() url = urljoin(get_base_url(response), url) title = row.xpath( './li/div/div[1]/text()').extract_first().replace( '\t', '').replace('\r', '').replace('\n', '') publish_time = row.xpath( './li/div/div[3]/div[2]/text()').extract_first() publish_time = datetime.strptime(publish_time, '%Y-%m-%d') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = publish_time yield item next_url = response.xpath( '//div[@class="simu-site-pagination"]/ul/li/a[contains(text(),"下一页")]/@href' ).extract_first() if next_url != 'javascript:;': next_url = urljoin(get_base_url(response), next_url) self.ips.append({'url': next_url, 'ref': response.url}) yield self.request_next()
def parse_list(self, response):
    notices = response.xpath('/html/body/div[4]//div[@class="trends"]/dl')
    for notice in notices:
        url = notice.xpath('./dd/a/@href').extract_first().strip()
        url = urljoin(get_base_url(response), url)
        title = notice.xpath('./dd/a/h3/text()').extract_first()
        # The date is split between the <dt> text and its <b> child; rejoin it.
        publish_time_year = notice.xpath('./dt/text()').extract_first()
        publish_time_day = notice.xpath('./dt/b/text()').extract_first()
        publish_time = publish_time_year + '-' + publish_time_day
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = url
        item['title'] = title
        item['publish_time'] = datetime.strptime(publish_time.strip(),
                                                 '%Y-%m-%d')
        yield item
    next_page = response.xpath(
        '/html/body/div[4]/div[2]/div[@class="paging"]//a[text()="下一页"]/@href'
    ).extract_first()
    if next_page is not None and next_page != 'javascript:':
        next_url = urljoin(get_base_url(response), next_page)
        self.lps.append({
            'url': next_url,
            'ref': response.url,
        })

def parse_item(self, response): rows = response.xpath('//ul[@class="cmnList"]/li') for row in rows: url = row.xpath('./a/@href').extract_first() if url == 'javascript:void(0)': # <a href="javascript:void(0)" onclick="getPdf('/home','/projectText/201510201386/1010/101002/144643001676888.pdf')"> # r",\'(\S+)\'" 意思是: 以逗号开始的截取后面的单引号里面的多个非空字符,单引号里面的东西用()定位 on_click = row.xpath('./a/@onclick').re_first(r",\'(\S+)\'") on_click = str('/uploads/fore') + on_click url = on_click url = urljoin(get_base_url(response), url) title = row.xpath('./a/text()').extract_first().replace( '\t', '').replace('\r', '').replace('\n', '') publish_time = row.xpath('./span/text()').extract_first() publish_time = datetime.strptime(publish_time, '%Y/%m/%d') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = publish_time yield item next_url = response.xpath( '//div[@class="pdtPaging"]/a[text()="下一页"]/@href').extract_first() if next_url: next_url = urljoin(get_base_url(response), next_url) self.ips.append({'url': next_url, 'ref': response.url})
def parse_item(self, response): rows = response.xpath('//ul[@class="jList fadeUp"]/li') for row in rows: url = row.xpath('./a[1]/@href').extract_first() url = urljoin(get_base_url(response), url) title = row.xpath('./a[1]/text()').extract_first() publish_time = row.xpath('./a[2]/text()').extract_first() if publish_time == str( '0201-12-24'): # 特殊情况存在: #网页时间格式有错误: 0201-12-24 publish_time = str('2010-12-24') publish_time = datetime.strptime(publish_time, '%Y-%m-%d') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = publish_time yield item next_url = response.xpath( '//div[@class="pages fadeUp"]/a[text()=">"]/@href').extract_first( ) if next_url: next_url = urljoin(get_base_url(response), next_url) self.ips.append({'url': next_url, 'ref': response.url}) yield self.request_next()
def parse_item(self, response): rows = response.xpath('//div[@class="news_list"]/ul/li') for row in rows: url = row.xpath('./a/@href').extract_first() url = urljoin(get_base_url(response), url) title = row.xpath('./a/text()').extract_first().strip().replace( '\t', '').replace('\r', '').replace('\n', '') publish_time = row.xpath( './span/text()').extract_first().strip().replace( '\t', '').replace('\r', '').replace('\n', '') publish_time = datetime.strptime(publish_time, '%Y-%m-%d') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = publish_time yield item next_url = response.xpath( '//div[@class="page"]/a[text()="下一页"]/@href').extract_first() if next_url: next_url = urljoin(get_base_url(response), next_url) if response.url != next_url: self.ips.append({ 'url': next_url, })
def parse_item(self, response): rows = response.xpath('//div[@class="news_list team_box"]/ul/li') for row in rows: url = row.xpath('./a/@href').extract_first() url = urljoin(get_base_url(response), url) title = row.xpath('./a/@title').extract_first().replace('\t', '').replace('\r', '').replace('\n', '') publish_time = row.xpath('./a/em/text()').extract_first()#2018-05-28 publish_time = datetime.strptime(publish_time, '%Y-%m-%d') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = publish_time yield item next_url = response.xpath('//div[@class="page_list f_r"]/div/a[contains(text(),"下一页")]/@href').extract_first() if next_url: next_url = urljoin(get_base_url(response), next_url) self.ips.append({ 'url': next_url, 'ref': response.url })
def parse_list(self, response):
    noticeList = response.xpath(
        '/html/body/div/div[2]/div[2]/div[3]//div[@class="zp11_1 mg2 xi14"]')
    next_page = response.xpath(
        '/html/body/div/div[2]/div[2]/div[3]/div[@class="fy cen xi14"]/a[text()="下一页"]/@href'
    ).extract_first()
    cur_page = int(response.xpath(
        '/html/body/div/div[2]/div[2]/div[3]/div[@class="fy cen xi14"]/span/text()'
    ).extract_first())
    for notice in noticeList:
        noticeLink = notice.xpath('./a/@href').extract_first().strip()
        title = notice.xpath('./a/text()').extract_first()
        publish_time = notice.xpath('./span/text()').extract_first()
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = urljoin(get_base_url(response), noticeLink)
        item['title'] = title
        item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
        yield item
    if next_page:
        # Only parse the page index out of the link once we know the link
        # exists (the original ran re.search before this check and would
        # crash on the last page). On the last page the "下一页" link points
        # back at the current page, so compare the two indices.
        next_page_index = int(re.search(r'page=(\d+)', next_page).group(1))
        if cur_page != next_page_index:
            next_url = urljoin(get_base_url(response), next_page)
            self.lps.append({
                'url': next_url,
                'ref': response.url,
            })

def parse_item(self, response): datas = response.xpath('//div[@class="lh-160p"]/div[@class="m-b-10"]') for notice in datas[1:]: href = notice.xpath('./div[1]/a/@href').extract_first() url = urljoin(get_base_url(response), href) title = notice.xpath('./div[1]/a/text()').extract_first() publish_time = notice.xpath('./div[2]/text()').extract_first() item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d') yield item next_url = response.xpath( '//div[@class="pager"]/table/tr/td[3]/image/@onclick').re_first( '\d+') if next_url is not None: self.ips.append({ 'url': 'http://www.gwxstrust.com/cn/page.jsp?id=23&pageIndex=' + next_url, 'ref': response.url })
def parse_item(self, response):
    datas = json.loads(response.text)
    rows = datas['collection']
    for row in rows:
        title = row['title']
        url = '/news/detail/' + str(row['articleId'])
        url = urljoin(get_base_url(response), url)
        publish_time = row['publishTime']
        publish_time = datetime.strptime(publish_time, '%Y-%m-%d')
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = url
        item['title'] = title
        item['publish_time'] = publish_time
        yield item
    tp = datas['property']['pages']
    cp = response.meta['pg']
    if cp < tp:
        cp = cp + 1
        self.ips.append({
            'url': 'http://www.szhvc.com/api/pc/message/news/paging'
                   '?tagl2=REPORTS&pageSize=3&pageNum=' + str(cp) +
                   '&_=1527477855432',
            'ref': response.url,
            'pg': cp
        })

def parse_item(self, response): datas = response.xpath('//div[contains(@class, "newsList")]') for notice in datas: href = notice.xpath( './div[@class="newsName"]/a/@href').extract_first() url = urljoin(get_base_url(response), href) title = notice.xpath( './div[@class="newsName"]/a/text()').extract_first() publish_time = notice.xpath( './div[@class="newsDate"]/text()').extract_first() item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d') yield item next_url = response.xpath( '//a[contains(text(), "下一页")]/@href').extract_first() if next_url is not None and next_url != '': url = self.entry + next_url self.ips.append({'url': url, 'ref': response.url})
def parse_list(self, response): funds = response.xpath( '//td[@class="TDbgcolor5"]/table/tbody/tr[4]/td/table/tbody/tr/td[2]/a' ) for fund in funds: url = fund.xpath('./@href').extract_first() title = fund.xpath('./text()').extract_first() publish_time = re.findall('[P0|t]+(\d{8})', url)[0] if 'pdf' not in url: self.ips.append({ 'url': urljoin(get_base_url(response), url), 'ref': response.url, }) else: item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = datetime.strptime( publish_time, '%Y%m%d') yield item tp = response.css('script').re_first(r'[1-9]\d+') pg = response.meta['pg'] next_pg = int(pg) + 1 if next_pg < int(tp): next_url = urljoin(get_base_url(response), 'index_' + str(next_pg) + '.htm') self.lps.append({ 'url': next_url, 'ref': response.url, 'pg': next_pg, })
def parse_item(self, response): rows = response.xpath('//ul[@class="xcpl"]/li[not(@class)]') for row in rows: url = row.xpath('./span[@class="xmcn_ry"]/a/@href').extract_first() if 'javascript:void(0);' != url: url = urljoin(get_base_url(response), url) else: # 加密公告取当前所在页面地址 url = response.url title = row.xpath( './span[@class="xmcn_ry"]/a/text()').extract_first() publish_time = row.xpath( './span[@class="xmcn_njy"]/text()').re_first('\d+-\d+-\d+') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d') yield item next_url = response.xpath( '//div[@class="pages"]/a[contains(text(),"下一页")]/@href' ).extract_first() if 'javascript:void(0);' != next_url: self.ips.append({ 'url': urljoin(get_base_url(response), next_url), 'ref': response.url })
def parse_item(self, response): rows = response.xpath('//div[@class="cont_rig"]/ul/li') for row in rows: url = row.xpath('./a/@href').extract_first() url = urljoin(get_base_url(response), url) title = row.xpath( './a/span[@class="title"]/text()').extract_first() publish_time = row.xpath( './a/span[@class="time"]/text()').re_first('\d+-\d+-\d+') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d') yield item tp = response.xpath('//script/text()').re_first( 'var PAGE_COUNT = (\d+)') if tp: pg = response.meta['pg'] + 1 if pg <= int(tp): self.ips.append({ 'url': 'http://www.ajxt.com.cn/ajxt/gywm/cpgg/index_{0}.shtml?page={1}' .format(pg - 1, pg), 'ref': response.url, 'pg': pg })
def parse_item(self, response):
    datas = response.xpath(
        '/html/body/section[2]/div//ul[@class="bulletin_main clearfix"]/li')
    next_url = response.xpath(
        '/html/body/section[2]/div//div[@class="page"]/div/a[last()]/@href'
    ).extract_first()
    for notice in datas:
        href = notice.xpath('./a/@href').extract_first()
        url = urljoin(get_base_url(response), href)
        title = notice.xpath('./a/span/text()').extract_first().strip()
        publish_time = notice.xpath('./a/time/text()').extract_first()
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = url
        item['title'] = title
        item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
        yield item
    if next_url is not None and next_url != '':
        url = self.entry + next_url
        self.ips.append({'url': url, 'ref': response.url})
    yield self.request_next()

def parse_item(self, response):
    # The endpoint returns a JS fragment containing escaped HTML, so fields
    # are pulled out with regexes instead of XPath.
    line = response.text
    titles = re.findall(r'\\"\\">(\S+\s?\S+)\s*<\\/span>', line)
    dates = re.findall(r' text-right\\">\\n\s+(\d+年\d+月\d+日)\\n\s+<', line)
    urls = re.findall(r'data-remote=\\"true\\" href=\\"(\S+)\\">', line)
    next_url = re.search(r'href=\\"(/announcements\?\S+)\\">下一页', line)
    if next_url:
        next_url = next_url.group(1)
    for title in titles:
        url = urls.pop(0)
        url = urljoin(get_base_url(response), url)
        publish_time = dates.pop(0)
        publish_time = datetime.strptime(publish_time, '%Y年%m月%d日')
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = url
        item['title'] = title
        item['publish_time'] = publish_time
        yield item
    if next_url:
        next_url = urljoin(get_base_url(response), next_url)
        # The next page must be requested as XHR with the CSRF token taken
        # from the page head.
        csrf_token = response.xpath(
            '/html/head/meta[@name="csrf-token"]/@content').extract_first()
        self.ips.append({
            'url': next_url,
            'ref': response.url,
            'headers': {
                'X-CSRF-Token': csrf_token,
                'X-Requested-With': 'XMLHttpRequest',
                'Accept': 'text/javascript, application/javascript, '
                          'application/ecmascript, application/x-ecmascript, '
                          '*/*; q=0.01'
            }
        })

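# A minimal sketch (an assumption; the base spider that consumes self.ips is
# not part of this listing) of how the queued entry above could be replayed as
# an XHR request in Scrapy. The helper name _xhr_request is hypothetical.
import scrapy

def _xhr_request(self, entry):
    # entry is one dict popped from self.ips; 'headers' carries the CSRF token.
    headers = dict(entry.get('headers', {}))
    if entry.get('ref'):
        # Assumption: 'ref' is meant to become the Referer header.
        headers.setdefault('Referer', entry['ref'])
    return scrapy.Request(entry['url'], headers=headers,
                          callback=self.parse_item)
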
def parse_item(self, response): rows = response.xpath('//div[@class="snowview_bot"]/ul/li') for row in rows: title = row.xpath('./a/text()').extract_first() url = row.xpath('./a/@href').extract_first() url = urljoin(get_base_url(response), url) publish_time = row.xpath('./span/text()').extract_first() item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['title'] = title item['url'] = url item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d') yield item tp = response.xpath( '//div[@class="left tx_fye"]/a[contains(text(),"末页")]').re_first( '\d+') pg = response.meta['pg'] if pg < int(tp): pg = pg + 1 next_url = 'http://www.snowlightcapital.cn/moreNews?pageNum=' + tp + '&pageNo=' + str( pg) self.ips.append({'url': next_url, 'ref': response.url, 'pg': pg})
def parse_item(self, response): rows = response.xpath('//div[@class="alist2"]/div[not(@class)]') for row in rows: url = row.xpath('./div[1]/a/@href').extract_first() url = urljoin(get_base_url(response), url) title = row.xpath('./div[1]/a/text()').extract_first() publish_time = row.xpath('./div[2]/text()').extract_first() publish_time = datetime.strptime(publish_time, '%Y-%m-%d') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = publish_time yield item next_page = response.xpath( '//table[@class="w-all h-30"]/tr/td[3]/image[@class]/@onclick' ).re_first('\d+') next_url = 'http://www.zhongtaitrust.com/cn/fortune/products/info.jsp?pageIndex=' + str( next_page) if next_page: self.ips.append({ 'url': next_url, })
def parse_item(self, response): datas = response.xpath('//ul[@id="ggul"]/li') for notice in datas: href = notice.xpath('./a/@href').extract_first() url = urljoin(get_base_url(response), href) title = notice.xpath('normalize-space(./a/text())').extract_first() publish_time = notice.xpath('./span/text()').extract_first() item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = datetime.strptime(publish_time, '%Y/%m/%d') yield item tp = response.xpath( '//div[@class="page"]/table/tr/td/script').extract_first() tp = re.findall('var countPage = (\d+)', tp)[0] cp = response.meta['pg'] if (cp + 1) < int(tp): cp = cp + 1 next_url = urljoin(get_base_url(response), 'index_' + str(cp) + '.html') self.ips.append({'url': next_url, 'ref': response.url, 'pg': cp})
def parse_item(self, response):
    ext = response.meta['ext']
    report_type = int(ext['report_type'])
    if report_type == 1:
        rows = response.xpath(
            '/html/body/div[2]/div[2]//div[@class="rightdiv_n"]/ul/li')
        next_url = response.xpath(
            '//*[@id="page"]/div[2]/ul/a[last()]/@href').extract_first()
    else:
        rows = response.xpath('//li')
        next_url = response.xpath(
            '//*[@id="page"]//a[text()="下一页"]/@href').extract_first()
    for row in rows:
        url = row.xpath('./a/@href').extract_first()
        url = urljoin(get_base_url(response), url)
        title = row.xpath('./a//text()').extract_first().strip()
        publish_time = row.xpath('./span/text()').extract_first()
        publish_time = datetime.strptime(publish_time, '%Y-%m-%d')
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = url
        item['title'] = title
        item['publish_time'] = publish_time
        yield item
    if next_url and next_url != 'javascript:void(0);':
        next_url = urljoin(get_base_url(response), next_url)
        self.ips.append({
            'url': next_url,
            'ref': response.url,
            'ext': {'report_type': str(report_type)}
        })

def parse_item(self, response): rows = response.xpath('//div[@class="news01"]/ul/li') for row in rows: url = row.xpath('./div[@class="p2"]/a/@href').extract_first() url = urljoin(get_base_url(response), url) title = row.xpath('./div[@class="p2"]/a/text()').extract_first() publish_time = '2018-' + row.xpath( './div[@class="p1"]/text()').re_first('\d+-\d+') item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = url item['title'] = title item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d') yield item tp = response.xpath( '//div[@class="page"]/span/a[contains(text(),"尾页")]/@href' ).re_first('page=(\d+)') if tp: pg = response.meta['pg'] + 1 if pg <= int(tp): self.ips.append({ 'url': 'http://www.hexifund.com/info.php?class_id=102102&page={0}' .format(pg), 'ref': response.url, 'pg': pg })
def parse_item(self, response):
    ext = response.meta['ext']
    page = int(ext['page'])
    total_page = response.xpath(
        '//div[@class="digg"]/a[text()=">"]/@href').re_first(r'page=(\d+)')
    # Guard against a missing ">" link (the original called int() on the raw
    # re_first result, which crashes when the link is absent).
    total_page = int(total_page) if total_page else page
    notices = response.xpath('//td[@class="newslist"]')
    # Dates live in a parallel column of cells; consume them in order.
    years = response.xpath('//td[@class="newslist2"]/text()').extract()
    for row in notices:
        url = row.xpath('./a/@href').extract_first()
        url = urljoin(get_base_url(response), url)
        title = row.xpath('./a//text()').extract_first()
        publish_time = years.pop(0)
        publish_time = datetime.strptime(publish_time, '%Y-%m-%d')
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = url
        item['title'] = title
        item['publish_time'] = publish_time
        yield item
    if page < total_page:
        self.ips.append({
            'url': 'http://www.urich.cn/news.asp?page=' + str(page + 1),
            'ref': response.url,
            'ext': {'page': str(page + 1)}
        })

def parse_list(self, response): noticeList = response.xpath('//div[@class="r_news"]/ul/li') for notice in noticeList: noticeLink = notice.xpath('./a/@href').extract_first().strip() noticeLink = urljoin(get_base_url(response), noticeLink) title = notice.xpath('./a/text()').extract_first() publish_time = notice.xpath('./em/text()').extract_first() item = GGFundNoticeItem() item['sitename'] = self.sitename item['channel'] = self.channel item['url_entry'] = self.entry item['url'] = noticeLink item['title'] = title item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d') yield item next_url = response.xpath( '//div[@class="quotes"]/a[contains(text(), "下一页")]/@href' ).extract_first() if next_url is not None: self.lps.append({ 'url': urljoin(get_base_url(response), next_url), 'ref': response.url })
def parse_item(self, response):
    ext = response.meta['ext']
    page = int(ext['page'])
    fund_id = ext['fund_id']
    # An empty body marks the end of the listing; only parse and paginate
    # while the endpoint keeps returning data.
    if response.text:
        rows = json.loads(response.text)
        for row in rows:
            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            url = row['url']
            title = row['title']
            publish_time = row['displayTime']
            publish_time = datetime.strptime(publish_time, '%Y.%m.%d')
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time
            yield item
        url = ('http://www.cqitic.com/more/' + fund_id + '_' +
               str(page + 1) + '_20.shtml')
        self.ips.append({
            'url': url,
            'ref': response.url,
            'ext': {'page': str(page + 1), 'fund_id': fund_id}
        })

def parse_item(self, response):
    ext = response.meta['ext']
    page = int(ext['page'])
    total_page = re.search(r'pagecount:(\d+),', response.text)
    if total_page:
        total_page = int(total_page.group(1))
    else:
        total_page = 0
    rows = response.xpath('//ul/li/a')
    for row in rows:
        title = row.xpath('./strong/text()').extract_first()
        url = row.xpath('./@href').extract_first()
        publish_time = row.xpath('./span/text()').extract_first()
        publish_time = datetime.strptime(publish_time, '%Y.%m.%d')
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = urljoin(get_base_url(response), url)
        item['title'] = title
        item['publish_time'] = publish_time
        yield item
    if page < total_page:
        # This endpoint is paginated via a POST form rather than a URL param.
        form = {'curPage': str(page + 1), 'numPerPage': '10', 'type': '1'}
        self.ips.append({
            'url': response.url,
            'form': form,
            'ext': {'page': str(page + 1)},
            'ref': response.url
        })

def parse_item(self, response):
    rows = response.css('ul.mid_ul_span li')
    for row in rows:
        url = row.xpath('./p[1]/a/@href').extract_first()  # relative link
        url = urljoin(get_base_url(response), url)  # make it absolute
        # Notice title, with stray whitespace stripped.
        title = row.xpath('./p[1]/a/span/text()').extract_first().strip().replace(
            '\t', '').replace('\r', '').replace('\n', '')
        # Date string in '%Y.%m.%d' form.
        publish_time = row.xpath('./p[2]/span/text()').extract_first().strip().replace(
            '\t', '').replace('\r', '').replace('\n', '')
        publish_time = datetime.strptime(publish_time, '%Y.%m.%d')  # parse the date
        item = GGFundNoticeItem()
        item['sitename'] = self.sitename
        item['channel'] = self.channel
        item['url_entry'] = self.entry
        item['url'] = url
        item['title'] = title
        item['publish_time'] = publish_time
        yield item
    next_url = response.xpath(
        '//ul[@class="pagination mid_page"]/li/a[text()="下一页"]/@href'
    ).extract_first()
    if next_url:
        next_url = urljoin(get_base_url(response), next_url)
        self.ips.append({'url': next_url, 'ref': response.url})
    yield self.request_next()
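# A hedged sketch of the request_next() helper several methods above yield.
# The real base class is not part of this listing, so the names and behavior
# here are assumptions: it pops the next queued entry off self.ips, passes
# 'ext'/'pg' through response.meta, and switches to a POST when an entry
# carries a 'form' payload (as the urich spider above queues).
import scrapy

def request_next(self):
    if not self.ips:
        return None
    entry = self.ips.pop(0)
    headers = dict(entry.get('headers', {}))
    if entry.get('ref'):
        # Assumption: 'ref' is meant to become the Referer header.
        headers.setdefault('Referer', entry['ref'])
    meta = {'ext': entry.get('ext', {}), 'pg': entry.get('pg', 1)}
    if 'form' in entry:
        return scrapy.FormRequest(entry['url'], formdata=entry['form'],
                                  headers=headers, meta=meta,
                                  callback=self.parse_item)
    return scrapy.Request(entry['url'], headers=headers, meta=meta,
                          callback=self.parse_item)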