import json
import re

# NormalbankItem and BASE_URL are assumed to come from elsewhere in the
# project (e.g. items.py and the spider module); only stdlib imports are
# shown here.


def parse_credit_outlook(self, response):
    tds = response.xpath(".//div[@class='result-details']/div")
    for td in tds:
        article = NormalbankItem()
        article['link'] = BASE_URL + td.xpath('.//a/@href').get()
        article['title'] = td.xpath(".//a/@data-analytics-link").get()
        # push_date reuses the same attribute as title; no separate date
        # field is scraped from this listing
        article['push_date'] = td.xpath(".//a/@data-analytics-link").get()
        # 'tag-end' appears to be the project's placeholder for fields the
        # page does not provide
        article['text'] = 'tag-end'
        yield article

def parse_report(self, response):
    tds = response.xpath("//li[@class='ds-artifact-item even'] | "
                         "//li[@class='ds-artifact-item odd']")
    for td in tds:
        item = NormalbankItem()
        item["title"] = td.xpath(".//h4/a/text()").extract_first()
        item["name"] = td.xpath(
            ".//div[@class='content author-info']/span/a/text()").extract_first()
        item["text"] = td.xpath(
            ".//div[@class='artifact-info hidden-md hidden-lg']/span/a/text()").extract_first()
        item["link"] = td.xpath(".//span/a/@href").extract_first()
        yield item

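# For context, a sketch of the item class every callback in this module
# fills in. The field names are exactly those assigned in the code; the
# class itself is assumed to be defined in the project's items.py:
#
#     import scrapy
#
#     class NormalbankItem(scrapy.Item):
#         title = scrapy.Field()
#         link = scrapy.Field()
#         text = scrapy.Field()
#         push_date = scrapy.Field()
#         name = scrapy.Field()  # only parse_report() sets this field
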
def parse_artificial_intelligence(self, response):
    tds = response.xpath(".//div[@class='outer']/div[last()]/section")
    for td in tds:
        article = NormalbankItem()
        article['link'] = BASE_URL + td.xpath(".//a/@href").get()
        article['push_date'] = 'tag-end'
        article['text'] = 'tag-end'
        article['title'] = td.xpath('.//h3/text()').get()
        yield article

def parse_main(self, response):
    items = json.loads(response.text)['data']['researches']
    for item in items:
        # skip research entries the API marks as unavailable
        if item['authorizationType'] == 'Unauthorized':
            continue
        article = NormalbankItem()
        article['title'] = item['title']
        article['link'] = BASE_URL + item['url']
        article['text'] = item['synopsis']
        article['push_date'] = item['publishDate']
        yield article

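# A sketch of the JSON payload parse_main() above expects. The key names
# come from the code; the surrounding structure and values are illustrative
# assumptions, not captured output:
#
#     {"data": {"researches": [
#         {"authorizationType": "Authorized",
#          "title": "...",
#          "url": "/research/...",
#          "synopsis": "...",
#          "publishDate": "2021-01-01"}]}}
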
def fitch_headlines(self, response):
    json_articles = json.loads(response.text)['items']
    for json_article in json_articles:
        article = NormalbankItem()
        article['title'] = json_article['title']
        article['push_date'] = json_article['date']
        article['link'] = "https://www.fitchratings.com" + json_article['link']
        article['text'] = json_article['text']
        yield article

def parse_fsi(self, response):
    tds = response.xpath("//tbody/tr")
    for td in tds:
        item = NormalbankItem()
        item["title"] = td.xpath("./td[2]/a/text()").extract_first()
        item["push_date"] = td.xpath("./td[1]/text()").extract_first().strip()
        item["link"] = td.xpath(".//a/@href").extract_first()
        item["text"] = 'tag-end'
        yield item

def parse_index(self, response):
    h4 = response.xpath("//*[@id='content-main']/h4")
    p = response.xpath("//*[@id='content-main']/p")
    # each <h4> heading is followed by a <p> carrying its metadata, so the
    # two node lists are paired positionally
    for i, t in zip(h4, p):
        item = NormalbankItem()
        item["title"] = i.xpath("./a/text()").extract_first()
        item["push_date"] = t.xpath("./span/text()").extract_first()
        item["link"] = i.xpath(".//@href").extract_first()
        item["text"] = 'tag-end'
        yield item

def spglobal_index2(self, response):
    # the endpoint answers with JSONP; strip the "searchg2_<id>(...)"
    # wrapper before handing the payload to json.loads()
    text = re.search(r"searchg2_\d+\((.+)\)$", response.text)
    tds = json.loads(text.group(1))['response']['docs']
    for td in tds:
        article = NormalbankItem()
        article['title'] = td['title']
        article['push_date'] = td['custom_dt_meta_publish_date']
        article['link'] = BASE_URL + td['custom_s_local_url']
        article['text'] = "tag-end"
        yield article

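# The raw body spglobal_index2() above unwraps looks roughly like this
# JSONP (callback name and keys follow the regex and the fields read in
# the code; the values are illustrative assumptions):
#
#     searchg2_1234({"response": {"docs": [
#         {"title": "...",
#          "custom_dt_meta_publish_date": "2021-01-01T00:00:00Z",
#          "custom_s_local_url": "/en/..."}]}})
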
def parse_index(self, response): tds = response.xpath("//div[@class='list-content']/article") for td in tds: article = NormalbankItem() article["link"] = td.xpath( ".//div[@class='image-wrapper']/a/@href").extract_first() article["title"] = td.xpath(".//h4//text()").extract_first() article["push_date"] = td.xpath(".//time//text()").extract_first() article['text'] = 'tag-end' yield article
def fitch_country_risk(self, response):
    tds = response.xpath("//main//article/div/article")
    for td in tds:
        article = NormalbankItem()
        article['title'] = td.xpath("./h2/a/text()").get()
        article['link'] = "https://www.fitchsolutions.com" + td.xpath(
            "./h2/a/@href").get()
        article['push_date'] = td.xpath("./p/text()[last()]").get()
        article['text'] = td.xpath("./div//li/text()").get()
        yield article

def parse_discussion(self, response):
    tds = response.xpath("//div[@id='main_0_universal_2_divBlockList']/div")
    for td in tds:
        article = NormalbankItem()
        article['link'] = BASE_URL + td.xpath(".//a/@href").get()
        article['push_date'] = td.xpath(".//time/text()").get()
        article['title'] = td.xpath(".//a/h3/text()").get()
        article['text'] = td.xpath(".//div[@class='description']/text()").get()
        yield article

def parse_index(self, response):
    # table rows alternate between "item even" and "item odd" classes
    tds = response.xpath("//tr[@class='item even'] | //tr[@class='item odd']")
    for td in tds:
        item = NormalbankItem()
        item["title"] = td.xpath("./td[2]/div[1]/a/text()").extract_first()
        item["push_date"] = td.xpath("./td[1]/text()").extract_first().strip()
        item["link"] = td.xpath(".//a/@href").extract_first()
        item["text"] = "tag-end"
        yield item

def spglobal_index(self, response):
    tds = response.xpath("//div[contains(@class,'carousel__wrapper')]/ul/li")
    for td in tds:
        article = NormalbankItem()
        article['title'] = td.xpath(".//h1/text()").get()
        article['link'] = BASE_URL + td.xpath("./a/@href").get()
        article['push_date'] = td.xpath(
            ".//ul[@class='meta-data']/li[last()]/text()").get()
        article['text'] = "tag-end"
        yield article

def parse_all(self, response):
    text = response.body.decode("utf-8")
    tds = json.loads(text)['documents']
    for key in tds:
        # some document entries carry no URL; skip those
        if tds[key].get('url') is None:
            continue
        item = NormalbankItem()
        item['link'] = tds[key]['url']
        item['title'] = tds[key]['title']['cdata!']
        item['text'] = tds[key]['descr']['cdata!']
        item['push_date'] = tds[key]['lnchdt']
        yield item

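# parse_all() above reads an XML-derived JSON map whose CDATA text sits
# under a literal "cdata!" key; roughly this shape (keys from the code,
# values illustrative assumptions):
#
#     {"documents": {"doc-1": {"url": "https://...",
#                              "title": {"cdata!": "..."},
#                              "descr": {"cdata!": "..."},
#                              "lnchdt": "2021-01-01"}}}
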
def parse_main(self, response):
    # all <body>-level anchors except the last one
    tds = response.xpath("//body/a")[0:-1]
    for td in tds:
        # instantiate a fresh item per anchor so each yielded item is an
        # independent object rather than the same one mutated in place
        item = NormalbankItem()
        item["title"] = td.xpath("./div[1]/div[4]/h2/text()").extract_first()
        item["text"] = td.xpath("./div[1]/div[4]/p/text()").extract_first()
        item["link"] = td.xpath("./@href").extract_first()
        item["push_date"] = td.xpath(
            "./div[1]/div[4]/div/span[3]/text()").extract_first()
        yield item

def fitch_white_papers(self, response):
    # second site in the chain; results are handed on to the next site
    tds = response.xpath("//div[@class='content']/article")
    for td in tds:
        article = NormalbankItem()
        article['title'] = td.xpath("./h2//text()").get().strip()
        article['link'] = "https://www.fitchsolutions.com" + td.xpath(
            "./a/@href").get().strip()
        # the date may span several text nodes; join and trim them
        date = td.xpath("./p/text()[last()]").getall()
        article['push_date'] = "".join(date).strip()
        article['text'] = td.xpath("./p/a/text()").get().strip()
        yield article

def parse_whitepapers(self, response):
    tds = response.xpath("//div[@class='columns']/div/div/article")
    for td in tds:
        article = NormalbankItem()
        article['link'] = BASE_URL + td.xpath("./a/@href").get()
        article['push_date'] = td.xpath(
            ".//div[@class='caption']/span[last()]/text()").get()
        article['title'] = td.xpath(
            ".//div[@class='tout__details']/h3/text()").get()
        article['text'] = td.xpath(
            ".//div[@class='tout__details']/p/text()").get()
        yield article

def parse_index(self, response): tds = response.xpath("//div[@id='mdcTS2']/p") print(tds) print(len(tds)) for td in tds: article = NormalbankItem() if td.xpath("./a/@href").get() == None: continue article['push_date'] = td.xpath("./font/font/text()").get() article['link'] = BASE_URL + td.xpath("./a/@href").get() article['title'] = td.xpath("./a/font/font/text()").get() article['text'] = 'tag-end' yield article