Example #1
0
    def __init__(self, startDate, endDate, *args, **kwargs):
        try:
            toolbox.validate_date_range(startDate, endDate)
        except ValueError as e:
            raise ValueError(e.message)
        
        startDate = dparser.parse(startDate).replace(day=1).replace(tzinfo=None)
        endDate = dparser.parse(endDate).replace(day=1).replace(tzinfo=None)
        endDate = endDate.replace(day=calendar.monthrange(int(endDate.strftime('%Y')), int(endDate.strftime('%m')))[1])
        endDate = endDate + datetime.timedelta(hours=23, minutes=59, seconds=59)        
        
        try:
            response = self.fetch_json_doc(0)
            hits = response['response']['meta']['hits']
            offset = response['response']['meta']['offset']
            i = 1
            while hits > offset:
                for n in response['response']['docs']:
                    pubDate = dparser.parse(n['pub_date'], fuzzy=True).replace(tzinfo=None)
                    if pubDate >= startDate and pubDate <= endDate:
                        self.start_urls.append(n['web_url'])
                        log.msg(n['web_url'], level=log.DEBUG)
                response = self.fetch_json_doc(i)
                hits = response['response']['meta']['hits']
                offset = response['response']['meta']['offset']
                i += 1                
        except KeyError:
            self.msg("Error reading init-Page for spider " + self.name + ".\n\
#The required response.meta-element was not contained in the response.", level=log.ERROR)
            raise KeyError("Required Key in dict not found.")
        super(MySpider, self).__init__(*args, **kwargs)
Example #2
0
 def __init__(self, startDate, endDate, *args, **kwargs):
     """Collect post URLs published between startDate and endDate.

     Validates the range, widens endDate to the last second of its month,
     then pages through the JSON feed until a page older than startDate
     is reached; de-duplicated URLs are appended to self.start_urls.

     :param startDate: inclusive start of the range (parseable date string)
     :param endDate: inclusive end of the range (parseable date string)
     :raises ValueError: if the date range fails validation
     :raises urllib2.HTTPError: if the initial page cannot be fetched
     """
     urls = []
     # validating, parsing and converting the date/time-stuff
     try:
         toolbox.validate_date_range(startDate, endDate)
     except ValueError as e:
         # Bug fix: BaseException.message was removed in Python 3
         # (deprecated since PEP 352); str(e) is portable.
         raise ValueError(str(e))
     startDate = dparser.parse(startDate).replace(day=1).replace(tzinfo=None)
     endDate = dparser.parse(endDate).replace(tzinfo=None)
     # Widen endDate to the last second of its month.
     endDate = endDate.replace(day=calendar.monthrange(endDate.year, endDate.month)[1])
     endDate = endDate + datetime.timedelta(hours=23, minutes=59, seconds=59)

     try:
         json_page = self.fetch_json_doc(0)
     except urllib2.HTTPError:
         # Bug fix: the original raised urllib2.HTTPError('...') with a single
         # string argument, but HTTPError requires (url, code, msg, hdrs, fp)
         # and that raise would itself fail with TypeError. Re-raise the
         # original exception so callers still see an HTTPError.
         raise
     i = 0
     while json_page['more_posts_next_page']:
         # Initialized so an empty posts list cannot break pagination below.
         pubDate = datetime.datetime.now()
         for n in json_page['posts']:
             sel = Selector(text=n['html'])
             pubDate = dparser.parse(safepop(sel.xpath('//time/@datetime').extract(), 0)).replace(tzinfo=None)
             if startDate <= pubDate <= endDate:
                 url = safepop(sel.xpath('//div[@data-url]/attribute::data-url').extract(), 0)
                 urls.append(url)
         # Posts appear to arrive newest-first: once the last post on a page
         # predates the range, stop fetching further pages.
         if pubDate < startDate:
             break
         i += 1
         json_page = self.fetch_json_doc(i)
     # removing duplicates while preserving order
     for url in urls:
         if url not in self.start_urls:
             self.start_urls.append(url)

     super(MySpider, self).__init__(*args, **kwargs)
 def __init__(self, startDate, endDate, *args, **kwargs):
     """Build one monthly archive URL per month between startDate and endDate.

     Appends one marginalrevolution.com archive URL (…/YYYY/MM) for each
     month in the inclusive range to self.start_urls.

     :param startDate: start of the range (validated/parsed by toolbox)
     :param endDate: end of the range (validated/parsed by toolbox)
     :raises ValueError: if the date range fails validation
     """
     try:
         startDate, endDate = toolbox.validate_date_range(startDate, endDate)
     except ValueError as e:
         # Bug fix: BaseException.message was removed in Python 3
         # (deprecated since PEP 352); str(e) is portable.
         raise ValueError(str(e))

     # Number of months in the inclusive range.
     months = (endDate.year - startDate.year) * 12 + (endDate.month - startDate.month) + 1
     for i in range(months):
         url = 'http://marginalrevolution.com/marginalrevolution/' + toolbox.add_months(startDate, i).strftime('%Y/%m')
         self.start_urls.append(url)
     super(MySpider, self).__init__(*args, **kwargs)
Example #4
0
 def __init__(self, startDate, endDate, *args, **kwargs):
     """Build one monthly archive URL per month between startDate and endDate.

     Appends one voxeu.org column-archive URL (…/YYYY-MM) for each month in
     the inclusive range to self.start_urls.

     :param startDate: start of the range (validated/parsed by toolbox)
     :param endDate: end of the range (validated/parsed by toolbox)
     :raises ValueError: if the date range fails validation
     """
     try:
         startDate, endDate = toolbox.validate_date_range(startDate, endDate)
     except ValueError as e:
         # Bug fix: BaseException.message was removed in Python 3
         # (deprecated since PEP 352); str(e) is portable.
         raise ValueError(str(e))

     # Number of months in the inclusive range.
     months = (endDate.year - startDate.year) * 12 + (endDate.month - startDate.month) + 1
     for i in range(months):
         url = 'http://www.voxeu.org/columns/archive/' + toolbox.add_months(startDate, i).strftime('%Y-%m')
         self.start_urls.append(url)
     super(MySpider, self).__init__(*args, **kwargs)