Example 1
    def test_sourcing_common_crawl_2010(self):
        # shut boto's debug messaging up during tests
        logging.getLogger('boto').setLevel(logging.INFO)

        datasources.load()
        url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/crawl-002/'
        source = datasources.source_for(url)

        source.rule = FakeRule()
        assert isinstance(source, datasources.common_crawl.CommonCrawlSource)
        start = datetime(2009, 9, 17, 0)
        eq_(source.earliest_record_time(), start)
        end = start + timedelta(days=1)

        urls = source.segment_between(start, end)
        # note: the mock S3 connection doesn't bother to actually sign the URLs
        self.assertSequenceEqual(urls, [
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_0.arc.gz?',
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_1.arc.gz?',
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_2.arc.gz?',
        ])

        # days without data should return an empty list
        urls = source.segment_between(datetime(2009, 9, 21, 0),
                                      datetime(2009, 9, 22, 0))
        self.assertSequenceEqual(urls, [])

        urls = source.segment_between(datetime(2009, 10, 1, 0),
                                      datetime(2009, 10, 2, 0))
        self.assertSequenceEqual(urls, [
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/10/01/0/1253228619531_0.arc.gz?'
        ])
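
These test excerpts all lean on the same harness. A minimal sketch of the imports and stand-ins they assume: eq_ is nose's equality assertion and patch comes from the mock library, but the FakeRule body here is a guess, since the tests only ever assign it to source.rule.

    # Shared fixtures assumed by these tests; the FakeRule body is an
    # assumption -- an empty stand-in is enough for the assertions shown.
    from datetime import datetime, timedelta

    from mock import patch
    from nose.tools import eq_

    class FakeRule(object):
        """Placeholder rule object attached to each source under test."""
        pass
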
Example 2
  def test_sourcing_common_crawl_2012_metadata(self):
    # shut boto's debug messaging up during tests
    logging.getLogger('boto').setLevel(logging.INFO)
 
    with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
      datasources.load()
      url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/parse-output/segment?metadata'
      source = datasources.source_for(url)

      source.rule = FakeRule()
      eq_(source.__class__, datasources.common_crawl.CommonCrawl2012MetadataSource)
      start = datetime(2012, 7, 7, 19, 42, 27, 253000)

      eq_(source.earliest_record_time(), start)
      end = start + timedelta(days=1)

      urls = source.segment_between(start, end)

      eq_(len(urls), 7)
      
      # days without data should return an empty list
      urls = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
      self.assertSequenceEqual(
        urls,
        []
      )
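
The patch('boto.connect_s3', ...) context manager is what keeps these tests off the network: every boto.connect_s3(key, secret) call made inside the block gets the fake connection instead. A hedged sketch of the idiom, with a hypothetical FakeS3Connection standing in for the harness's s3_mock:

    from mock import patch

    class FakeS3Connection(object):
        """Hypothetical stand-in for boto's S3Connection."""
        def get_bucket(self, name):
            raise NotImplementedError  # a real fake would return canned keys

    # The replacement must accept the same positional arguments the code
    # under test passes to boto.connect_s3 -- here, the access key and
    # secret parsed from the s3:// URL -- even though the fake ignores them.
    with patch('boto.connect_s3', lambda key, secret: FakeS3Connection()):
        import boto
        assert isinstance(boto.connect_s3('k', 's'), FakeS3Connection)
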
Example 3
 def test_segments_between(self):
   url = 's3://AKIAIOV23F6ZNL5YPRNA:8Gwz48zgzwoYIZv70V4uGDD6@trivio.test/folder'
   
   source = datasources.source_for(url)
   assert isinstance(source, s3.S3Source)
   urls = source.segment_between(datetime(2011,5,31), datetime(2011,6,1))
   eq_(len(urls),2)
   
   # note: the mock S3 connection doesn't bother to actually sign the URLs
   self.assertSequenceEqual(
     urls,
     [
       'http://trivio.test.s3.amazonaws.com/folder/dt%3D2011-05-31T00%3A00%3A00/doc1.csv?',
       'http://trivio.test.s3.amazonaws.com/folder/dt%3D2011-05-31T00%3A00%3A00/doc2.csv?',
     ]
   )
   
   
   urls = source.segment_between(datetime(2011,6,1), datetime(2011,6,2))
   
   # note: the mock S3 connection doesn't bother to actually sign the URLs
   self.assertSequenceEqual(
     urls,
     [
       'http://trivio.test.s3.amazonaws.com/folder/dt%3D2011-06-01T00%3A00%3A00/doc3.csv?',
     ] 
   )
   
   eq_(len(urls),1)
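
The odd-looking path segments in the expected URLs are Hive-style dt=<ISO timestamp> partition prefixes, percent-encoded: dt%3D2011-05-31T00%3A00%3A00 decodes to dt=2011-05-31T00:00:00. A quick check with the stdlib (urllib.quote is Python 2; Python 3 moved it to urllib.parse.quote):

    from datetime import datetime
    from urllib import quote

    # Encoding the ISO timestamp reproduces the path segment seen in the
    # expected URLs above.
    prefix = 'dt=' + datetime(2011, 5, 31).isoformat()
    assert quote(prefix, safe='') == 'dt%3D2011-05-31T00%3A00%3A00'
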
Example 4
    def test_sourcing_common_crawl_2012_metadata(self):
        # shut boto's debug messaging up during tests
        logging.getLogger('boto').setLevel(logging.INFO)

        with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
            datasources.load()
            url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/parse-output/segment?metadata'
            source = datasources.source_for(url)

            source.rule = FakeRule()
            eq_(source.__class__,
                datasources.common_crawl.CommonCrawl2012MetadataSource)
            start = datetime(2012, 7, 7, 19, 42, 27, 253000)

            eq_(source.earliest_record_time(), start)
            end = start + timedelta(days=1)

            urls = source.segment_between(start, end)

            eq_(len(urls), 7)

            # days without data should return an empty list
            urls = source.segment_between(datetime(2009, 9, 21, 0),
                                          datetime(2009, 9, 22, 0))
            self.assertSequenceEqual(urls, [])
Example 5
 def test_segments_between(self):
   url = 'mock://foo?arg1=a&arg2=b'
   source = datasources.source_for(url)
   assert isinstance(source, mock.MockSource)
   urls = source.segment_between(datetime(2011,5,31), datetime(2011,6,1))
   eq_(len(urls),1)
   eq_(urls[0], url+'#2011-05-31T00:00:00')
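
Judging by the assertion, the mock source yields one URL per day in the half-open [start, end) range, tagging the original URL with each day's timestamp as a fragment. A minimal reconstruction of that behavior (hypothetical; the project's real MockSource may differ):

    from datetime import datetime, timedelta

    class SketchMockSource(object):
        """Hypothetical: one URL per day in [start, end), suffixed with
        the day's ISO timestamp, as the assertions above imply."""
        def __init__(self, url):
            self.url = url

        def segment_between(self, start, end):
            urls, day = [], start
            while day < end:
                urls.append(self.url + '#' + day.isoformat())
                day += timedelta(days=1)
            return urls

    source = SketchMockSource('mock://foo?arg1=a&arg2=b')
    assert source.segment_between(datetime(2011, 5, 31), datetime(2011, 6, 1)) == \
        ['mock://foo?arg1=a&arg2=b#2011-05-31T00:00:00']
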
Example 6
 def test_segments_between(self):
     url = 'mock://foo?arg1=a&arg2=b'
     source = datasources.source_for(url)
     assert isinstance(source, mock.MockSource)
     urls = source.segment_between(datetime(2011, 5, 31),
                                   datetime(2011, 6, 1))
     eq_(len(urls), 1)
     eq_(urls[0], url + '#2011-05-31T00:00:00')
Example 7
 def test_http(self):
   url = 'http://google.com/'
   
   source = datasources.source_for(url)
   assert isinstance(source, HTTPSource)
   urls = source.segment_between(datetime(2011,5,31), datetime(2011,6,1))
   eq_(len(urls),1)
 
   params = Params()
   input_stream = datasources.input_stream_for(None, None, urls[0], params)
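
A plain HTTP URL carries no time partitioning, so the test expects exactly one segment, the URL itself, for any date range; input_stream_for then opens that segment for reading. A sketch of the contract segment_between appears to satisfy here (hypothetical class, not the project's code):

    class SketchHTTPSource(object):
        """Hypothetical: an HTTP URL is a single, undated segment, so any
        date range maps to the one URL."""
        def __init__(self, url):
            self.url = url

        def segment_between(self, start, end):
            return [self.url]

    assert SketchHTTPSource('http://google.com/').segment_between(None, None) \
        == ['http://google.com/']
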
Example 8
    def test_http(self):
        url = 'http://google.com/'

        source = datasources.source_for(url)
        assert isinstance(source, HTTPSource)
        urls = source.segment_between(datetime(2011, 5, 31),
                                      datetime(2011, 6, 1))
        eq_(len(urls), 1)

        params = Params()
        input_stream = datasources.input_stream_for(None, None, urls[0],
                                                    params)
Example 9
  def test_sourcing_common_crawl_2012(self):
    # shut boto's debug messaging up during tests
    logging.getLogger('boto').setLevel(logging.INFO)
   
    with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
      datasources.load()
      url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/parse-output/segment/'
      source = datasources.source_for(url)

      source.rule = FakeRule()
      eq_(source.__class__, datasources.common_crawl.CommonCrawl2012Source)
      
      start = datetime(2012, 7, 7, 19, 42, 27, 253000)
      eq_(source.earliest_record_time(), start)
      end = start + timedelta(days=1)

      urls = source.segment_between(start, end)
      
      eq_(len(urls), 71)
      
      eq_(
        urls[0],
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690147253/1341708194364_11.arc.gz?'
      )
      
      eq_(
        urls[-1],
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690150308/1341690944267_36.arc.gz?'
      )
      
      
      # days without data should return an empty list
      urls = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
      self.assertSequenceEqual(
        urls,
        []
      )

      urls = source.segment_between(datetime(2012, 9, 7, 0), datetime(2012, 9, 30, 23))
      eq_(len(urls), 10)
      
      
      # we should see everything if we query for the whole year
      urls = source.segment_between(datetime(2012, 1, 1, 0), datetime(2012, 12, 31, 23))
      eq_(len(urls), 81)
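
The long numbers in the crawl-2012 URLs (1341690147253 and friends) are epoch-millisecond timestamps, which is why earliest_record_time() can return a microsecond-precise datetime. Converting the first segment's name reproduces the test's start value exactly:

    from datetime import datetime, timedelta

    # Integer math avoids float rounding when converting epoch milliseconds.
    segment_ms = 1341690147253
    start = (datetime.utcfromtimestamp(segment_ms // 1000)
             + timedelta(milliseconds=segment_ms % 1000))
    assert start == datetime(2012, 7, 7, 19, 42, 27, 253000)
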
Example 10
    def test_sourcing_common_crawl_2012(self):
        # shut boto's debug messaging up during tests
        logging.getLogger('boto').setLevel(logging.INFO)

        with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
            datasources.load()
            url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/parse-output/segment/'
            source = datasources.source_for(url)

            source.rule = FakeRule()
            eq_(source.__class__,
                datasources.common_crawl.CommonCrawl2012Source)

            start = datetime(2012, 7, 7, 19, 42, 27, 253000)
            eq_(source.earliest_record_time(), start)
            end = start + timedelta(days=1)

            urls = source.segment_between(start, end)

            eq_(len(urls), 71)

            eq_(
                urls[0],
                'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690147253/1341708194364_11.arc.gz?'
            )

            eq_(
                urls[-1],
                'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690150308/1341690944267_36.arc.gz?'
            )

            # days without data should return an empty list
            urls = source.segment_between(datetime(2009, 9, 21, 0),
                                          datetime(2009, 9, 22, 0))
            self.assertSequenceEqual(urls, [])

            urls = source.segment_between(datetime(2012, 9, 7, 0),
                                          datetime(2012, 9, 30, 23))
            eq_(len(urls), 10)

            # we should see everything if we query for the whole year
            urls = source.segment_between(datetime(2012, 1, 1, 0),
                                          datetime(2012, 12, 31, 23))
            eq_(len(urls), 81)
Example 11
    def test_segments_between(self):
        # mimic disco's global "Task" object

        url = 'repo://'
        source = datasources.source_for(url)
        assert isinstance(source, repo.RepoSource)

        source.rule = FakeRule()

        # todo: scan urls during sourcing
        # create a scheme that dereferences during work

        # note: source_segment is called in pipeline not the workers

        urls = source.segment_between(datetime(2011, 5, 31),
                                      datetime(2011, 6, 1))
        self.assertSequenceEqual(urls, [
            'repo://dir1/doc1.txt',
            'repo://dir1/doc2.txt',
            'repo://dir2/doc1.txt',
        ])
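
Unlike the dated sources above, the repo source ignores the time range entirely and enumerates every document it knows about. A minimal sketch of that behavior (hypothetical; the real RepoSource presumably walks actual repository storage):

    class SketchRepoSource(object):
        """Hypothetical: repo:// sources are not time-partitioned, so
        segment_between returns every document regardless of the range."""
        def __init__(self, documents):
            self.documents = documents

        def segment_between(self, start, end):
            return ['repo://' + path for path in self.documents]

    source = SketchRepoSource(['dir1/doc1.txt', 'dir1/doc2.txt', 'dir2/doc1.txt'])
    assert source.segment_between(None, None) == [
        'repo://dir1/doc1.txt', 'repo://dir1/doc2.txt', 'repo://dir2/doc1.txt']
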
Example 12
  def test_sourcing_common_crawl_2010(self):
    # shut boto's debug messaging up during tests
    logging.getLogger('boto').setLevel(logging.INFO)
    

    datasources.load()
    url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/crawl-002/'
    source = datasources.source_for(url)

    source.rule = FakeRule()
    assert isinstance(source, datasources.common_crawl.CommonCrawlSource)
    start = datetime(2009, 9, 17, 0)
    eq_(source.earliest_record_time(), start)
    end = start + timedelta(days=1)
    
    urls = source.segment_between(start, end)
    # note: the mock S3 connection doesn't bother to actually sign the URLs
    self.assertSequenceEqual(
      urls,
      [
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_0.arc.gz?',
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_1.arc.gz?',
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_2.arc.gz?',
      ] 
    )
    
    # days without data should return an empty list
    urls = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
    self.assertSequenceEqual(
      urls,
      []
    )
    
    urls = source.segment_between(datetime(2009, 10, 1, 0), datetime(2009, 10, 2, 0))
    self.assertSequenceEqual(
      urls,
      ['http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/10/01/0/1253228619531_0.arc.gz?']
    )
Example 13
  def test_segments_between(self):
    # mimic disco's global "Task" object
    
    url = 'repo://'
    source = datasources.source_for(url)
    assert isinstance(source, repo.RepoSource)
    
    source.rule = FakeRule()
    
    # todo: scan urls during sourcing
    # create a scheme that dereferences during work
    
    # note: source_segment is called in pipeline not the workers

    urls = source.segment_between(datetime(2011,5,31), datetime(2011,6,1))
    self.assertSequenceEqual(
      urls,
      [
        'repo://dir1/doc1.txt',
        'repo://dir1/doc2.txt',
        'repo://dir2/doc1.txt',
      ]
    )