def __init__(self, startDate, endDate, *args, **kwargs):
    """Seed ``start_urls`` with article URLs published in [startDate, endDate].

    The string dates are validated, then widened to whole months: the start
    is snapped to the first day of its month, the end to the last second of
    the last day of its month.  The JSON API is paged via
    ``self.fetch_json_doc`` until the reported offset catches up with the
    total hit count.

    Raises:
        ValueError: if the date range fails validation.
        KeyError: if an API response lacks the expected meta fields.
    """
    try:
        toolbox.validate_date_range(startDate, endDate)
    except ValueError as e:
        raise ValueError(e.message)
    # Strip tzinfo so all comparisons below are between naive datetimes.
    startDate = dparser.parse(startDate).replace(day=1).replace(tzinfo=None)
    endDate = dparser.parse(endDate).replace(day=1).replace(tzinfo=None)
    # Extend the end date to the last day of its month ...
    endDate = endDate.replace(day=calendar.monthrange(endDate.year, endDate.month)[1])
    # ... and to the last second of that day, making the range inclusive.
    endDate = endDate + datetime.timedelta(hours=23, minutes=59, seconds=59)
    try:
        response = self.fetch_json_doc(0)
        hits = response['response']['meta']['hits']
        offset = response['response']['meta']['offset']
        i = 1
        # Page through the API until the offset reaches the total hit count.
        while hits > offset:
            for n in response['response']['docs']:
                pubDate = dparser.parse(n['pub_date'], fuzzy=True).replace(tzinfo=None)
                if startDate <= pubDate <= endDate:
                    self.start_urls.append(n['web_url'])
                    log.msg(n['web_url'], level=log.DEBUG)
            response = self.fetch_json_doc(i)
            hits = response['response']['meta']['hits']
            offset = response['response']['meta']['offset']
            i += 1
    except KeyError:
        # BUG FIX: was self.msg(...), which is not a spider method and would
        # raise AttributeError here; log via log.msg like the rest of the file.
        log.msg("Error reading init-Page for spider " + self.name + ".\n\
#The required response.meta-element was not contained in the response.", level=log.ERROR)
        raise KeyError("Required Key in dict not found.")
    super(MySpider, self).__init__(*args, **kwargs)
def __init__(self, startDate, endDate, *args, **kwargs):
    """Seed ``start_urls`` with post URLs published in [startDate, endDate].

    The string dates are validated, then widened to whole months (first day
    of the start month through the last second of the end month).  The feed
    is paged via ``self.fetch_json_doc``; each post is an HTML fragment from
    which the publication date and post URL are extracted with XPath.

    Raises:
        ValueError: if the date range fails validation.
        urllib2.HTTPError: if the initial page cannot be fetched.
    """
    urls = []
    # validating, parsing and converting the date/time-stuff
    try:
        toolbox.validate_date_range(startDate, endDate)
    except ValueError as e:
        raise ValueError(e.message)
    # Naive datetimes throughout; end is extended to the last second of the
    # last day of its month so the range is inclusive.
    startDate = dparser.parse(startDate).replace(day=1).replace(tzinfo=None)
    endDate = dparser.parse(endDate).replace(tzinfo=None)
    endDate = endDate.replace(day=calendar.monthrange(endDate.year, endDate.month)[1])
    endDate = endDate + datetime.timedelta(hours=23, minutes=59, seconds=59)
    try:
        json_page = self.fetch_json_doc(0)
    except urllib2.HTTPError:
        # BUG FIX: the original did ``raise urllib2.HTTPError('...')``, but
        # HTTPError requires (url, code, msg, hdrs, fp) and would itself
        # crash with TypeError, masking the real failure.  Re-raise the
        # original error instead — same type, traceback preserved.
        raise
    i = 0
    while json_page['more_posts_next_page'] == True:
        pubDate = datetime.datetime.now()  # for initializing-reasons (page may have no posts)
        for n in json_page['posts']:
            sel = Selector(text=n['html'])
            pubDate = dparser.parse(safepop(sel.xpath('//time/@datetime').extract(), 0)).replace(tzinfo=None)
            if startDate <= pubDate <= endDate:
                url = safepop(sel.xpath('//div[@data-url]/attribute::data-url').extract(), 0)
                urls.append(url)
        # Posts appear to be newest-first: once the last seen post is older
        # than the range, nothing further back is worth fetching.
        if pubDate < startDate:
            break
        i += 1
        json_page = self.fetch_json_doc(i)
    # removing duplicates while preserving order
    for url in urls:
        if url not in self.start_urls:
            self.start_urls.append(url)
    super(MySpider, self).__init__(*args, **kwargs)
def __init__(self, startDate, endDate, *args, **kwargs):
    """Seed ``start_urls`` with one monthly archive URL per month in range.

    ``toolbox.validate_date_range`` parses/validates the date strings and
    returns them as datetimes; one URL of the form
    ``http://marginalrevolution.com/marginalrevolution/YYYY/MM`` is appended
    for each month from startDate through endDate inclusive.

    Raises:
        ValueError: if the date range fails validation.
    """
    try:
        startDate, endDate = toolbox.validate_date_range(startDate, endDate)
    except ValueError:
        # BUG FIX: the original re-raised as ValueError(e.message), which
        # discards the traceback and uses the deprecated ``.message``
        # attribute; a bare raise propagates the same error intact.
        raise
    # Inclusive month count between the two dates.
    for i in range(0, (endDate.year - startDate.year) * 12 + (endDate.month - startDate.month) + 1):
        url = 'http://marginalrevolution.com/marginalrevolution/' + toolbox.add_months(startDate, i).strftime('%Y/%m')
        self.start_urls.append(url)
    super(MySpider, self).__init__(*args, **kwargs)
def __init__(self, startDate, endDate, *args, **kwargs):
    """Seed ``start_urls`` with one monthly archive URL per month in range.

    ``toolbox.validate_date_range`` parses/validates the date strings and
    returns them as datetimes; one URL of the form
    ``http://www.voxeu.org/columns/archive/YYYY-MM`` is appended for each
    month from startDate through endDate inclusive.

    Raises:
        ValueError: if the date range fails validation.
    """
    try:
        startDate, endDate = toolbox.validate_date_range(startDate, endDate)
    except ValueError:
        # BUG FIX: the original re-raised as ValueError(e.message), which
        # discards the traceback and uses the deprecated ``.message``
        # attribute; a bare raise propagates the same error intact.
        raise
    # Inclusive month count between the two dates.
    for i in range(0, (endDate.year - startDate.year) * 12 + (endDate.month - startDate.month) + 1):
        url = 'http://www.voxeu.org/columns/archive/' + toolbox.add_months(startDate, i).strftime('%Y-%m')
        self.start_urls.append(url)
    super(MySpider, self).__init__(*args, **kwargs)