Ejemplo n.º 1
0
    def parse(self, response):
        """Parse an SEC press-release listing page.

        Builds one item per table row, records it on ``self.items`` and
        yields a follow-up request to ``parse_dir_contents`` for the
        article body.
        """
        rows = response.xpath('//tr[@class="pr-list-page-row"]')

        for row in rows:
            item = ScrapItem()
            item['headline'] = row.xpath(
                'td[@headers="view-field-display-title-table-column"]/a/text()'
            ).extract()[0]
            item['article_link'] = 'https://www.sec.gov' + row.xpath(
                'td[@headers="view-field-display-title-table-column"]/a/@href'
            ).extract()[0]

            datetime_str = row.xpath(
                'td[@headers="view-field-publish-date-table-column"]/time/text()'
            ).extract()[0]
            # Normalize the month names the site spells out (or abbreviates
            # as "Sept") to strptime's three-letter %b abbreviations.
            for long_name, abbrev in (('April', 'Apr'), ('March', 'Mar'),
                                      ('June', 'Jun'), ('July', 'Jul'),
                                      ('Sept', 'Sep')):
                datetime_str = datetime_str.replace(long_name, abbrev)
            try:
                # Most dates carry a period after the month ("Mar. 3, 2020").
                item['date'] = datetime.strptime(
                    datetime_str, '%b. %d, %Y').strftime('%Y-%m-%d')
            except ValueError:
                # Fall back to the period-less variant ("May 3, 2020").
                item['date'] = datetime.strptime(
                    datetime_str, '%b %d, %Y').strftime('%Y-%m-%d')

            item['source_site'] = self.start_urls[0]
            item['created_at'] = datetime.today().strftime('%Y-%m-%d')

            self.items.append(item)
            yield scrapy.Request(item['article_link'],
                                 callback=self.parse_dir_contents)
Ejemplo n.º 2
0
    def parse(self, response):
        """Parse the SEC speeches listing table.

        Stores each item on ``self.items`` and follows the article link
        via ``parse_dir_contents``.
        """
        title_link = 'td[@headers="view-field-display-title-table-column"]/a'

        for row in response.xpath(
                '//table/tbody/tr[contains(@class, "speeches-list-row")]'):
            headline = row.xpath(title_link + '/text()').extract()[0]
            href = row.xpath(title_link + '/@href').extract()[0]
            # The <time> element carries a machine-readable datetime string,
            # so no parsing/reformatting is needed here.
            published = row.xpath(
                'td[@headers="view-field-publish-date-table-column"]/time/@datetime'
            ).extract()[0]

            item = ScrapItem()
            item['headline'] = headline
            item['article_link'] = 'https://www.sec.gov' + href
            item['date'] = published
            item['source_site'] = self.start_urls[0]
            item['created_at'] = datetime.today().strftime('%Y-%m-%d')

            self.items.append(item)
            yield scrapy.Request(item['article_link'],
                                 callback=self.parse_dir_contents)
Ejemplo n.º 3
0
    def parse(self, response):
        """Parse the FinCEN news listing.

        Each "views-row" div yields one item (recorded on ``self.items``)
        plus a request to ``parse_dir_contents`` for the article page.
        """
        title_link = ('div[@class="views-field views-field-title"]'
                      '/span[@class="field-content"]/a')
        date_text = ('span[@class="views-field views-field-field-date-release"]'
                     '/span[@class="field-content"]/time/text()')

        for row in response.xpath('//div[contains(@class, "views-row")]'):
            raw_date = row.xpath(date_text).extract()[0]

            item = ScrapItem()
            item['headline'] = row.xpath(title_link + '/text()').extract()[0]
            item['article_link'] = ('https://www.fincen.gov'
                                    + row.xpath(title_link + '/@href').extract()[0])
            # Site dates are "MM/DD/YYYY"; store them as ISO "YYYY-MM-DD".
            item['date'] = datetime.strptime(raw_date,
                                             '%m/%d/%Y').strftime('%Y-%m-%d')
            item['source_site'] = self.start_urls[0]
            item['created_at'] = datetime.today().strftime('%Y-%m-%d')

            self.items.append(item)
            yield scrapy.Request(item['article_link'],
                                 callback=self.parse_dir_contents)
    def parse(self, response):
        """Parse an SEC page whose 7th table holds two rows per entry.

        Row ``i`` carries the title link; row ``i + 1`` carries the detail
        cells (the date is the 6th text node). Items are yielded directly.
        """
        # NOTE(review): the hard-coded table index 6 assumes a fixed page
        # layout — confirm against the live page if scraping breaks.
        rows = response.xpath('//table')[6].xpath('tr')

        # Entries start at row 1 and occupy two rows each.
        for idx in range(1, len(rows), 2):
            link_row = rows[idx]
            detail_row = rows[idx + 1]

            item = ScrapItem()
            item['headline'] = link_row.xpath('td/a/text()').extract()[0]
            item['article_link'] = ('https://www.sec.gov'
                                    + link_row.xpath('td/a/@href').extract()[0])
            item['date'] = detail_row.xpath('td/text()').extract()[5]
            item['source_site'] = self.start_urls[0]
            item['created_at'] = datetime.today().strftime('%Y-%m-%d')

            yield item
Ejemplo n.º 5
0
    def parse(self, response):
        """Parse the CFTC press-release table.

        Records each item on ``self.items`` and follows its PDF link via
        ``parse_dir_contents``.
        """
        pdf_link = 'td[@headers="view-field-pdf-link-table-column"]/a'

        for row in response.xpath('//table/tbody/tr'):
            raw_date = row.xpath(
                'td[@headers="view-field-date-table-column"]/time/text()'
            ).extract()[0]

            item = ScrapItem()
            item['headline'] = row.xpath(pdf_link + '/text()').extract()[0]
            item['article_link'] = ('https://www.cftc.gov'
                                    + row.xpath(pdf_link + '/@href').extract()[0])
            # "MM/DD/YYYY" on the page → ISO "YYYY-MM-DD" in the item.
            item['date'] = datetime.strptime(raw_date,
                                             '%m/%d/%Y').strftime('%Y-%m-%d')
            item['source_site'] = self.start_urls[0]
            item['created_at'] = datetime.today().strftime('%Y-%m-%d')

            self.items.append(item)
            yield scrapy.Request(item['article_link'],
                                 callback=self.parse_dir_contents)
    def parse(self, response):
        """Parse an SEC listing table where entries sit on every other row.

        Data rows start at index 2 and occupy alternating rows; each one
        yields an item directly (no follow-up request).
        """
        rows = response.xpath('//table/tbody/tr')

        for i in range(2, len(rows), 2):
            item = ScrapItem()

            tds = rows[i].xpath('td')
            item['headline'] = tds[2].xpath(
                'b[@class="blue"]/text()').extract()[0]
            item['article_link'] = 'https://www.sec.gov' + tds[0].xpath(
                'a/@href').extract()[0]

            # "Mar. 3, 2020"-style cell text → ISO "YYYY-MM-DD".
            # (The original duplicated this assignment: item['date'] = item['date'] = ...)
            datetime_str = tds[1].xpath('text()').extract()[0]
            item['date'] = datetime.strptime(
                datetime_str, '%b. %d, %Y').strftime('%Y-%m-%d')

            item['source_site'] = self.start_urls[0]
            item['created_at'] = datetime.today().strftime('%Y-%m-%d')
            yield item
Ejemplo n.º 7
0
    def parse(self, response):
        """Parse IRS newsroom "media" entries.

        Records each item on ``self.items`` and follows the article link
        via ``parse_dir_contents``.
        """
        scrapItems = response.xpath('//div[@class="media"]')

        for scrapItem in scrapItems:
            item = ScrapItem()
            item['headline'] = scrapItem.xpath('h3/a/text()').extract()[0].strip()
            item['article_link'] = 'https://www.irs.gov' + scrapItem.xpath('h3/a/@href').extract()[0]

            # NOTE(review): assumes words 1-3 of the description form the
            # date, e.g. "... January 1,2020" — confirm against the page.
            description = scrapItem.xpath('div/text()').extract()[0].split(" ")
            date = description[1] + " " + description[2] + description[3]

            try:
                item['date'] = datetime.strptime(date, '%B %d,%Y').strftime('%Y-%m-%d')
            except ValueError:
                # Unparseable date text: fall back to today's date rather
                # than dropping the item.
                item['date'] = datetime.today().strftime('%Y-%m-%d')

            item['source_site'] = self.start_urls[0]
            item['created_at'] = datetime.today().strftime('%Y-%m-%d')

            self.items.append(item)
            yield scrapy.Request(item['article_link'], callback=self.parse_dir_contents)
    def parse(self, response):
        """Parse the SEC "mainlist" table, yielding one item per data row.

        Rows that carry an ``id`` attribute (e.g. the ``firstq`` quarter
        marker) are layout rows, not entries, and are skipped.
        """
        rows = response.xpath('//table[@id="mainlist"]/tbody/tr')

        for row in rows:
            # The original drove this via a bare except around an
            # IndexError: only rows WITHOUT an @id ever produced an item.
            # Test that condition directly instead.
            if row.xpath('@id').extract():
                continue

            tds = row.xpath('td')
            item = ScrapItem()
            item['headline'] = tds[2].xpath(
                'b[@class="blue"]/text()').extract()[0]
            item['article_link'] = 'https://www.sec.gov' + tds[0].xpath(
                'a/@href').extract()[0]

            # "Mar. 3, 2020"-style cell text → ISO "YYYY-MM-DD".
            datetime_str = tds[1].xpath('text()').extract()[0]
            item['date'] = datetime.strptime(
                datetime_str, '%b. %d, %Y').strftime('%Y-%m-%d')

            item['source_site'] = self.start_urls[0]
            item['created_at'] = datetime.today().strftime('%Y-%m-%d')

            yield item
Ejemplo n.º 9
0
    def parse(self, response):
        """Scrape four listing sections of a CNBC-style page.

        Handles the latest-news feed, "river" headlines, card titles and
        lazy-loaded grid items. For the first two sections, a leading
        "/pro/" (paywall teaser) href is skipped in favour of the next
        candidate link. Every item is recorded on ``self.items`` and a
        request for its article page is yielded to ``parse_dir_contents``.
        """

        def best_link(hrefs):
            # Skip the "/pro/" teaser link when it comes first.
            return hrefs[1] if hrefs[0] == '/pro/' else hrefs[0]

        def make_request(headline, article_link):
            # Build the common item shape shared by all four sections and
            # return the follow-up request for it.
            item = ScrapItem()
            item['headline'] = headline
            item['article_link'] = article_link
            item['date'] = datetime.today().strftime('%Y-%m-%d')
            item['source_site'] = self.start_urls[0]
            item['created_at'] = datetime.today().strftime('%Y-%m-%d')
            self.items.append(item)
            return scrapy.Request(item['article_link'],
                                  callback=self.parse_dir_contents)

        for feed in response.xpath('//div[@class="LatestNews-newsFeed"]'):
            headline = feed.xpath(
                'div[@class="LatestNews-newsFeedInner"]').xpath(
                'div[@class="LatestNews-headline"]')
            yield make_request(headline.xpath('a/text()').extract()[0],
                               best_link(headline.xpath('a/@href').extract()))

        for river in response.xpath(
                '//div[@class="RiverHeadline-headline RiverHeadline-hasThumbnail"]'
        ):
            yield make_request(river.xpath('a/text()').extract()[0],
                               best_link(river.xpath('a/@href').extract()))

        for card in response.xpath('//div[@class="Card-titleContainer"]'):
            yield make_request(
                card.xpath('a[@class="Card-title"]/div/text()').extract()[0],
                card.xpath('a[@class="Card-title"]/@href').extract()[0])

        for lazy in response.xpath(
                '//li[@class="LazyLoaderPlaceholder-gridItem"]'):
            yield make_request(lazy.xpath('a/text()').extract()[0],
                               lazy.xpath('a/@href').extract()[0])