Example #1
    # Grab the first key under the prefix and return a time-limited URL for it.
    key = iter(self.bucket.list(prefix=prefix)).next()
    urls = [key.generate_url(seconds_good_for, force_http=True)]
    
    return self.input_stream, urls
    
    
  def segment_between(self, start, end):
    # Collect URLs for every non-empty key whose timestamp falls between
    # start and end, honoring the optional 'maxinput' cap.
    prefix = self._key_for_datetime(start)
    urls = []

    limit = self.rule._params.get('maxinput', float('inf'))
    
    for key in self.bucket.list(prefix=prefix):
      dt = self._datetime_for_key(key)
      # Keys are listed in order, so stop once we pass the end of the range.
      if dt > end:
        break
      else:
        # Enforce the 'maxinput' cap on the number of returned URLs.
        limit -= 1
        if limit < 0:
          break
        # Skip empty objects; emit a time-limited URL for everything else.
        if key.size > 0:
          urls.append(key.generate_url(seconds_good_for, force_http=True))
          
        
    return urls
    

datasources.set_source_for_url(CommonCrawlSource, 's3://aws-publicdatasets/common-crawl/crawl-002/')
datasources.set_source_for_url(CommonCrawlSource, 'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/')
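Both registrations point the same source class at the crawl-002 data, once for the s3:// spelling and once for the public HTTP spelling of the bucket. The excerpt does not show how datasources resolves a URL back to a registered source, so the following is only a minimal, self-contained sketch of that kind of prefix registry; set_source_for_url/source_for_url below are illustrative stand-ins, not the project's actual implementation.

# Illustrative prefix registry (sketch only; not the real datasources module).
_sources = {}

def set_source_for_url(source_cls, url_prefix):
    # Remember which source class should handle URLs under this prefix.
    _sources[url_prefix] = source_cls

def source_for_url(url):
    # Pick the longest matching prefix so more specific registrations win.
    matches = [p for p in _sources if url.startswith(p)]
    return _sources[max(matches, key=len)] if matches else None

class CommonCrawlSource(object):
    pass

set_source_for_url(CommonCrawlSource, 's3://aws-publicdatasets/common-crawl/crawl-002/')
set_source_for_url(CommonCrawlSource, 'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/')
assert source_for_url('s3://aws-publicdatasets/common-crawl/crawl-002/some/key') is CommonCrawlSource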
Example #2
            # Skip keys whose names do not parse into a datetime.
            if dt is None:
                continue

            if dt > end:
                break
            else:
                limit -= 1
                if limit < 0:
                    break
                if key.size > 0:
                    urls.append(self.generate_url(key, force_http=True))

        return urls


datasources.set_source_for_url(
    CommonCrawlSource, 's3://aws-publicdatasets/common-crawl/crawl-002/')
datasources.set_source_for_url(
    CommonCrawlSource,
    'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/')

import calendar


class CommonCrawl2012Source(CommonCrawlSource):
    @property
    def segments(self):
        # Q: How long do source objects live?
        if not hasattr(self, '_segments'):
            key = self.bucket.lookup(
                'common-crawl/parse-output/valid_segments.txt')
            self._segments = sorted([
                # Segment ids are epoch milliseconds; convert to UTC datetimes.
                datetime.datetime.utcfromtimestamp(int(k) / 1000.)
                for k in key.read().split('\n') if k])
        return self._segments

    def earliest_record_time(self):
        # Grab and parse the first key
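The listing cuts off in the middle of earliest_record_time. Going by its comment and the bucket-listing pattern used in segment_between above, a plausible sketch of such a method is shown below; the self._base_prefix attribute is hypothetical and the real body is not part of this excerpt.

    def earliest_record_time(self):
        # Sketch only: grab the first key under the crawl prefix and parse its
        # timestamp. Assumes a hypothetical self._base_prefix attribute and the
        # _datetime_for_key() helper used by segment_between().
        for key in self.bucket.list(prefix=self._base_prefix):
            dt = self._datetime_for_key(key)
            if dt is not None:
                return dt
        return None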