Example #1
0
    def parse_image_contents(self, response):
        item = SolarmonitorItem()
        item['type'] = response.xpath('//title/text()').extract()[0]

        query_string = urlparse(response.url).query
        date = parse_qs(query_string)['date'][0]
        item['date'] = date

        img_src = response.xpath('/html/body/center/table//td/img/@src').extract()
        item['image_urls'] = [response.urljoin(i) for i in img_src]

        item['number'] = response.xpath('//td[@id="noaa_number"]/a/text()').extract()

        locations = response.xpath('//td[@id="position"]')
        item['location'] = [' '.join(location.xpath('text()').extract()) for location in locations]
        item['hale_class'] = response.xpath('//td[@id="hale"]/text()').extract()
        item['mcintosh_class'] = response.xpath('//td[@id="mcintosh"]/text()').extract()
        item['area'] = response.xpath('//td[@id="area"]/text()').extract()
        item['number_of_spots'] = response.xpath('//td[@id="nspots"]/text()').extract()

        flares = response.xpath('//td[@id="events"]')
        item['flares'] = [','.join(flare.xpath('a/text()').extract()) for flare in flares]

        yield item

        if convert_to_datetime(date) >= self.final_date:
            href = response.xpath('//a[@title="-1 day"]/@href').extract()
            url = response.urljoin(href[0])
            yield scrapy.Request(url, callback=self.parse_image_contents)
Example #2
0
    def __init__(self, final_date=None, **kwargs):
        super(SolarmonitorSpider, self).__init__(**kwargs)

        if final_date:
            final_date = convert_to_datetime(final_date)
        else:
            final_date = datetime.datetime.today() - datetime.timedelta(days=5 * 365)

        self.final_date = final_date