Example #1
    def test_sourcing_common_crawl_2012_metadata(self):
        # silence boto's debug logging during tests
        logging.getLogger('boto').setLevel(logging.INFO)

        with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
            datasources.load()
            url = 's3://bogus-key:bogus-secret@aws-publicdatasets/common-crawl/parse-output/segment?metadata'
            source = datasources.source_for(url)

            source.rule = FakeRule()
            eq_(source.__class__,
                datasources.common_crawl.CommonCrawl2012MetadataSource)

            start = datetime(2012, 7, 7, 19, 42, 27, 253000)
            eq_(source.earliest_record_time(), start)
            end = start + timedelta(days=1)

            urls = source.segment_between(start, end)
            eq_(len(urls), 7)

            # days without data should return an empty list
            urls = source.segment_between(datetime(2009, 9, 21, 0),
                                          datetime(2009, 9, 22, 0))
            self.assertSequenceEqual(urls, [])
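These tests lean on scaffolding the snippet doesn't show: eq_ from nose.tools, patch from the mock library, and the test class's own FakeRule and s3_mock helpers. A minimal sketch of what that scaffolding might look like follows; the bodies of FakeRule and s3_mock here are assumptions, not the project's actual implementations:

from datetime import datetime, timedelta
import logging
import unittest

from mock import patch        # the "mock" backport, usual for Python 2-era code
from nose.tools import eq_

from triv.io import datasources


class FakeRule(object):
    # hypothetical stand-in: the real FakeRule presumably stubs whatever
    # attributes the source reads off its rule
    pass


class TestCommonCrawlSources(unittest.TestCase):
    # hypothetical class name; the snippets only show its methods

    def s3_mock(self):
        # hypothetical: should return a stubbed boto S3 connection whose
        # bucket listings contain the fixture keys the tests expect
        raise NotImplementedError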
Example #2
    def test_sourcing_common_crawl_2010(self):
        # silence boto's debug logging during tests
        logging.getLogger('boto').setLevel(logging.INFO)

        datasources.load()
        url = 's3://bogus-key:bogus-secret@aws-publicdatasets/common-crawl/crawl-002/'
        source = datasources.source_for(url)

        source.rule = FakeRule()
        assert isinstance(source, datasources.common_crawl.CommonCrawlSource)
        start = datetime(2009, 9, 17, 0)
        eq_(source.earliest_record_time(), start)
        end = start + timedelta(days=1)

        urls = source.segment_between(start, end)
        # note: the mock S3 connection doesn't bother to actually sign the URLs
        self.assertSequenceEqual(urls, [
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_0.arc.gz?',
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_1.arc.gz?',
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_2.arc.gz?',
        ])

        # days without data should return an empty list
        urls = source.segment_between(datetime(2009, 9, 21, 0),
                                      datetime(2009, 9, 22, 0))
        self.assertSequenceEqual(urls, [])

        urls = source.segment_between(datetime(2009, 10, 1, 0),
                                      datetime(2009, 10, 2, 0))
        self.assertSequenceEqual(urls, [
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/10/01/0/1253228619531_0.arc.gz?'
        ])
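Note how the crawl-002 keys embed the capture date in the path (common-crawl/crawl-002/YYYY/MM/DD/...), which is presumably what lets segment_between turn a datetime range into S3 prefix listings. A sketch of that mapping, where prefix_for_day is a hypothetical helper and not part of the project's API:

from datetime import datetime

def prefix_for_day(day):
    # hypothetical helper: build the date-partitioned crawl-002 key prefix
    return 'common-crawl/crawl-002/%04d/%02d/%02d/' % (
        day.year, day.month, day.day)

assert prefix_for_day(datetime(2009, 9, 17)) == 'common-crawl/crawl-002/2009/09/17/'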
Example #3
def map_input_stream(stream, size, url, params):
    from disco.util import schemesplit
    import disco.func
    from triv.io import datasources, task

    datasources.load()
    # Note: Task is a global set by disco; we push it onto the context stack,
    # which allows it to be imported by the modules that need it
    task.push(Task)
    input_stream = datasources.input_stream_for(stream, size, url, params)
    if input_stream:
        return input_stream
    else:
        # we don't handle the given url; see if the vanilla disco modules can
        task.pop()  # this is normally cleared when we're done iterating
        return disco.func.map_input_stream(stream, size, url, params)
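To see where this hook fits: disco lets a job override its input stream chain, so wiring the custom reader into a job might look like the sketch below. It assumes disco's 0.4-era classic Job API (a map_input_stream list of stream functions), and fun_map is a hypothetical map function:

from disco.core import Job

def fun_map(record, params):
    # hypothetical map function: emit a count of 1 per record the stream yields
    yield record, 1

job = Job().run(
    input=['s3://key:secret@aws-publicdatasets/common-crawl/crawl-002/'],
    map=fun_map,
    map_input_stream=[map_input_stream],
)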
Example #4
    def test_sourcing_common_crawl_2012(self):
        # silence boto's debug logging during tests
        logging.getLogger('boto').setLevel(logging.INFO)

        with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
            datasources.load()
            url = 's3://bogus-key:bogus-secret@aws-publicdatasets/common-crawl/parse-output/segment/'
            source = datasources.source_for(url)

            source.rule = FakeRule()
            eq_(source.__class__,
                datasources.common_crawl.CommonCrawl2012Source)

            start = datetime(2012, 7, 7, 19, 42, 27, 253000)
            eq_(source.earliest_record_time(), start)
            end = start + timedelta(days=1)

            urls = source.segment_between(start, end)
            eq_(len(urls), 71)

            eq_(
                urls[0],
                'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690147253/1341708194364_11.arc.gz?'
            )
            eq_(
                urls[-1],
                'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690150308/1341690944267_36.arc.gz?'
            )

            # days without data should return an empty list
            urls = source.segment_between(datetime(2009, 9, 21, 0),
                                          datetime(2009, 9, 22, 0))
            self.assertSequenceEqual(urls, [])

            urls = source.segment_between(datetime(2012, 9, 7, 0),
                                          datetime(2012, 9, 30, 23))
            eq_(len(urls), 10)

            # we should see everything if we query for the whole year
            urls = source.segment_between(datetime(2012, 1, 1, 0),
                                          datetime(2012, 12, 31, 23))
            eq_(len(urls), 81)
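The 2012 segment directory names look like millisecond Unix timestamps: the earliest_record_time asserted above, datetime(2012, 7, 7, 19, 42, 27, 253000), is exactly the first segment id, 1341690147253, read as epoch milliseconds. A quick check:

from datetime import datetime, timedelta

ms = 1341690147253  # the first segment id in the URLs asserted above
ts = datetime.utcfromtimestamp(ms // 1000) + timedelta(milliseconds=ms % 1000)
assert ts == datetime(2012, 7, 7, 19, 42, 27, 253000)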
Example #5
    def setUp(self):
        datasources.load()