def parse(self, response): scrapItems = response.xpath('//tr[@class="pr-list-page-row"]') for scrapItem in scrapItems: item = ScrapItem() item['headline'] = scrapItem.xpath( 'td[@headers="view-field-display-title-table-column"]/a/text()' ).extract()[0] item['article_link'] = 'https://www.sec.gov' + scrapItem.xpath( 'td[@headers="view-field-display-title-table-column"]/a/@href' ).extract()[0] datetime_str = scrapItem.xpath( 'td[@headers="view-field-publish-date-table-column"]/time/text()' ).extract()[0] datetime_str = datetime_str.replace('April', 'Apr') datetime_str = datetime_str.replace('March', 'Mar') datetime_str = datetime_str.replace('June', 'Jun') datetime_str = datetime_str.replace('July', 'Jul') datetime_str = datetime_str.replace('Sept', 'Sep') try: item['date'] = datetime.strptime( datetime_str, '%b. %d, %Y').strftime('%Y-%m-%d') except: item['date'] = datetime.strptime( datetime_str, '%b %d, %Y').strftime('%Y-%m-%d') item['source_site'] = self.start_urls[0] item['created_at'] = datetime.today().strftime('%Y-%m-%d') self.items.append(item) yield scrapy.Request(item['article_link'], callback=self.parse_dir_contents)
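# Sketch of the scaffolding these parse() methods rely on but which is not part
# of this listing: the imports and the ScrapItem definition. The field names are
# taken from the code above; the 'body' field and everything else here is an
# illustrative assumption, not the project's actual item module.
import scrapy
from datetime import datetime


class ScrapItem(scrapy.Item):
    headline = scrapy.Field()
    article_link = scrapy.Field()
    date = scrapy.Field()
    source_site = scrapy.Field()
    created_at = scrapy.Field()
    body = scrapy.Field()  # assumed field, filled in by the article callback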
def parse(self, response):
    scrapItems = response.xpath(
        '//table/tbody/tr[contains(@class, "speeches-list-row")]')
    for scrapItem in scrapItems:
        item = ScrapItem()
        item['headline'] = scrapItem.xpath(
            'td[@headers="view-field-display-title-table-column"]/a/text()'
        ).extract()[0]
        item['article_link'] = 'https://www.sec.gov' + scrapItem.xpath(
            'td[@headers="view-field-display-title-table-column"]/a/@href'
        ).extract()[0]
        datetime_str = scrapItem.xpath(
            'td[@headers="view-field-publish-date-table-column"]/time/@datetime'
        ).extract()[0]
        item['date'] = datetime_str
        item['source_site'] = self.start_urls[0]
        item['created_at'] = datetime.today().strftime('%Y-%m-%d')
        self.items.append(item)
        yield scrapy.Request(item['article_link'],
                             callback=self.parse_dir_contents)
def parse(self, response):
    scrapItems = response.xpath('//div[contains(@class, "views-row")]')
    for scrapItem in scrapItems:
        item = ScrapItem()
        item['headline'] = scrapItem.xpath(
            'div[@class="views-field views-field-title"]/span[@class="field-content"]/a/text()'
        ).extract()[0]
        item['article_link'] = 'https://www.fincen.gov' + scrapItem.xpath(
            'div[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href'
        ).extract()[0]
        datetime_str = scrapItem.xpath(
            'span[@class="views-field views-field-field-date-release"]/span[@class="field-content"]/time/text()'
        ).extract()[0]
        item['date'] = datetime.strptime(
            datetime_str, '%m/%d/%Y').strftime('%Y-%m-%d')
        item['source_site'] = self.start_urls[0]
        item['created_at'] = datetime.today().strftime('%Y-%m-%d')
        self.items.append(item)
        yield scrapy.Request(item['article_link'],
                             callback=self.parse_dir_contents)
def parse(self, response):
    scrapItemsTables = response.xpath('//table')
    # The listing is the seventh table on the page; entries come in pairs of a
    # headline row followed by a details row, so step through two rows at a time.
    scrapItemsRows = scrapItemsTables[6].xpath('tr')
    i = 1
    while i < len(scrapItemsRows):
        item = ScrapItem()
        item['headline'] = scrapItemsRows[i].xpath('td/a/text()').extract()[0]
        item['article_link'] = 'https://www.sec.gov' + \
            scrapItemsRows[i].xpath('td/a/@href').extract()[0]
        item['date'] = scrapItemsRows[i + 1].xpath('td/text()').extract()[5]
        item['source_site'] = self.start_urls[0]
        item['created_at'] = datetime.today().strftime('%Y-%m-%d')
        i += 2
        yield item
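# The methods in this listing are parse() callbacks from separate spider
# classes. A minimal sketch of how one of them could be hosted is shown below;
# the spider name and start URL are illustrative assumptions, not the project's
# actual configuration.
class SecPressReleaseSpider(scrapy.Spider):
    name = 'sec_press_releases'  # assumed spider name
    start_urls = ['https://www.sec.gov/news/pressreleases']  # assumed listing URL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Items are collected here so the article callback can enrich them later.
        self.items = []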
def parse(self, response):
    scrapItems = response.xpath('//table/tbody/tr')
    for scrapItem in scrapItems:
        item = ScrapItem()
        item['headline'] = scrapItem.xpath(
            'td[@headers="view-field-pdf-link-table-column"]/a/text()'
        ).extract()[0]
        item['article_link'] = 'https://www.cftc.gov' + scrapItem.xpath(
            'td[@headers="view-field-pdf-link-table-column"]/a/@href'
        ).extract()[0]
        datetime_str = scrapItem.xpath(
            'td[@headers="view-field-date-table-column"]/time/text()'
        ).extract()[0]
        item['date'] = datetime.strptime(
            datetime_str, '%m/%d/%Y').strftime('%Y-%m-%d')
        item['source_site'] = self.start_urls[0]
        item['created_at'] = datetime.today().strftime('%Y-%m-%d')
        self.items.append(item)
        yield scrapy.Request(item['article_link'],
                             callback=self.parse_dir_contents)
def parse(self, response):
    scrapItems = response.xpath('//table/tbody/tr')
    # Data rows occupy every second <tr>, starting at index 2.
    i = 2
    while i < len(scrapItems):
        item = ScrapItem()
        tds = scrapItems[i].xpath('td')
        item['headline'] = tds[2].xpath(
            'b[@class="blue"]/text()').extract()[0]
        item['article_link'] = 'https://www.sec.gov' + tds[0].xpath(
            'a/@href').extract()[0]
        datetime_str = tds[1].xpath('text()').extract()[0]
        item['date'] = datetime.strptime(
            datetime_str, '%b. %d, %Y').strftime('%Y-%m-%d')
        item['source_site'] = self.start_urls[0]
        item['created_at'] = datetime.today().strftime('%Y-%m-%d')
        yield item
        i += 2
def parse(self, response): scrapItems = response.xpath('//div[@class="media"]') for scrapItem in scrapItems: item = ScrapItem() item['headline'] = scrapItem.xpath('h3/a/text()').extract()[0].strip() item['article_link'] = 'https://www.irs.gov' + scrapItem.xpath('h3/a/@href').extract()[0] description = scrapItem.xpath('div/text()').extract()[0].split(" ") date = description[1] + " "+ description[2] + description[3] try: item['date'] = datetime.strptime(date, '%B %d,%Y').strftime('%Y-%m-%d') except: item['date'] = datetime.today().strftime('%Y-%m-%d') item['source_site'] = self.start_urls[0] item['created_at'] = datetime.today().strftime('%Y-%m-%d') self.items.append(item) yield scrapy.Request(item['article_link'], callback = self.parse_dir_contents)
def parse(self, response): scrapItems = response.xpath('//table[@id="mainlist"]/tbody/tr') print(scrapItems) for scrapItem in scrapItems: item = ScrapItem() try: if scrapItem.xpath("@id").extract()[0] == 'firstq': continue except: tds = scrapItem.xpath('td') item['headline'] = tds[2].xpath( 'b[@class="blue"]/text()').extract()[0] item['article_link'] = 'https://www.sec.gov' + tds[0].xpath( 'a/@href').extract()[0] datetime_str = tds[1].xpath('text()').extract()[0] item['date'] = item['date'] = datetime.strptime( datetime_str, '%b. %d, %Y').strftime('%Y-%m-%d') item['source_site'] = self.start_urls[0] item['created_at'] = datetime.today().strftime('%Y-%m-%d') yield item
def parse(self, response): lastestNews = response.xpath('//div[@class="LatestNews-newsFeed"]') for lastestNew in lastestNews: inner = lastestNew.xpath('div[@class="LatestNews-newsFeedInner"]') headline = inner.xpath('div[@class="LatestNews-headline"]') item = ScrapItem() item['headline'] = headline.xpath('a/text()').extract()[0] item['article_link'] = headline.xpath( 'a/@href').extract()[1] if headline.xpath('a/@href').extract( )[0] == '/pro/' else headline.xpath('a/@href').extract()[0] item['date'] = datetime.today().strftime('%Y-%m-%d') item['source_site'] = self.start_urls[0] item['created_at'] = datetime.today().strftime('%Y-%m-%d') self.items.append(item) yield scrapy.Request(item['article_link'], callback=self.parse_dir_contents) riverPlusItems = response.xpath( '//div[@class="RiverHeadline-headline RiverHeadline-hasThumbnail"]' ) for riverPlusItem in riverPlusItems: item = ScrapItem() item['headline'] = riverPlusItem.xpath('a/text()').extract()[0] item['article_link'] = riverPlusItem.xpath('a/@href').extract( )[1] if riverPlusItem.xpath('a/@href').extract( )[0] == '/pro/' else riverPlusItem.xpath('a/@href').extract()[0] item['date'] = datetime.today().strftime('%Y-%m-%d') item['source_site'] = self.start_urls[0] item['created_at'] = datetime.today().strftime('%Y-%m-%d') self.items.append(item) yield scrapy.Request(item['article_link'], callback=self.parse_dir_contents) otherItems = response.xpath('//div[@class="Card-titleContainer"]') for otherItem in otherItems: print(otherItem) item = ScrapItem() item['headline'] = otherItem.xpath( 'a[@class="Card-title"]/div/text()').extract()[0] item['article_link'] = otherItem.xpath( 'a[@class="Card-title"]/@href').extract()[0] item['date'] = datetime.today().strftime('%Y-%m-%d') item['source_site'] = self.start_urls[0] item['created_at'] = datetime.today().strftime('%Y-%m-%d') # yield item self.items.append(item) yield scrapy.Request(item['article_link'], callback=self.parse_dir_contents) lazyItems = response.xpath( '//li[@class="LazyLoaderPlaceholder-gridItem"]') for lazyItem in lazyItems: item = ScrapItem() item['headline'] = lazyItem.xpath('a/text()').extract()[0] item['article_link'] = lazyItem.xpath('a/@href').extract()[0] item['date'] = datetime.today().strftime('%Y-%m-%d') item['source_site'] = self.start_urls[0] item['created_at'] = datetime.today().strftime('%Y-%m-%d') # yield item self.items.append(item) yield scrapy.Request(item['article_link'], callback=self.parse_dir_contents)
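# Every parse() method above registers self.parse_dir_contents as the callback
# for the article request, but that method is not part of this listing. The
# sketch below shows one minimal way such a callback could work; matching on
# response.url, the 'body' field, and the generic //p text extraction are
# assumptions for illustration, not the project's actual implementation.
def parse_dir_contents(self, response):
    # Find the queued item for this URL and attach the article text to it.
    for item in self.items:
        if item['article_link'] == response.url:
            item['body'] = ' '.join(
                t.strip() for t in response.xpath('//p/text()').extract())
            yield item
            break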