def test_sourcing_common_crawl_2010(self):
    """CommonCrawlSource for a crawl-002 URL maps date ranges to arc.gz segment URLs.

    Fix: leading-zero decimal literals (``01``, ``02``) in the datetime calls
    are a SyntaxError on Python 3 — replaced with plain integers (same values).
    """
    # shut boto's debug messaging up during tests
    logging.getLogger('boto').setLevel(logging.INFO)
    datasources.load()

    url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/crawl-002/'
    source = datasources.source_for(url)
    source.rule = FakeRule()
    assert isinstance(source, datasources.common_crawl.CommonCrawlSource)

    start = datetime(2009, 9, 17, 0)
    eq_(source.earliest_record_time(), start)

    end = start + timedelta(days=1)
    urls = source.segment_between(start, end)
    # note the mocks3 connection doesn't bother to actually sign the urls
    self.assertSequenceEqual(urls, [
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_0.arc.gz?',
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_1.arc.gz?',
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_2.arc.gz?',
    ])

    # days without data should return an empty list
    urls = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
    self.assertSequenceEqual(urls, [])

    urls = source.segment_between(datetime(2009, 10, 1, 0), datetime(2009, 10, 2, 0))
    self.assertSequenceEqual(urls, [
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/10/01/0/1253228619531_0.arc.gz?'
    ])
def test_sourcing_common_crawl_2012_metadata(self):
    """A ``?metadata`` query string selects the 2012 metadata source class."""
    # silence boto's debug chatter while the test runs
    logging.getLogger('boto').setLevel(logging.INFO)
    with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
        datasources.load()
        url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/parse-output/segment?metadata'
        source = datasources.source_for(url)
        source.rule = FakeRule()
        eq_(source.__class__, datasources.common_crawl.CommonCrawl2012MetadataSource)

        start = datetime(2012, 7, 7, 19, 42, 27, 253000)
        eq_(source.earliest_record_time(), start)

        segments = source.segment_between(start, start + timedelta(days=1))
        eq_(len(segments), 7)

        # a window with no data yields an empty list
        empty = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
        self.assertSequenceEqual(empty, [])
def test_segments_between(self):
    """S3Source lists each day's keys under the folder for [start, end)."""
    url = 's3://AKIAIOV23F6ZNL5YPRNA:8Gwz48zgzwoYIZv70V4uGDD6%[email protected]/folder'
    source = datasources.source_for(url)
    assert isinstance(source, s3.S3Source)

    first_day = source.segment_between(datetime(2011, 5, 31), datetime(2011, 6, 1))
    eq_(len(first_day), 2)
    # note the mockes3 connection doesn't bother to actually sign the urls
    self.assertSequenceEqual(first_day, [
        'http://trivio.test.s3.amazonaws.com/folder/dt%3D2011-05-31T00%3A00%3A00/doc1.csv?',
        'http://trivio.test.s3.amazonaws.com/folder/dt%3D2011-05-31T00%3A00%3A00/doc2.csv?',
    ])

    second_day = source.segment_between(datetime(2011, 6, 1), datetime(2011, 6, 2))
    # note the mockes3 connection doesn't bother to actually sign the urls
    self.assertSequenceEqual(second_day, [
        'http://trivio.test.s3.amazonaws.com/folder/dt%3D2011-06-01T00%3A00%3A00/doc3.csv?',
    ])
    eq_(len(second_day), 1)
def test_sourcing_common_crawl_2012_metadata(self):
    """Sourcing a segment URL with ?metadata picks CommonCrawl2012MetadataSource."""
    # keep boto quiet during the test run
    logging.getLogger('boto').setLevel(logging.INFO)
    with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
        datasources.load()

        source = datasources.source_for(
            's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/parse-output/segment?metadata'
        )
        source.rule = FakeRule()
        eq_(source.__class__, datasources.common_crawl.CommonCrawl2012MetadataSource)

        earliest = datetime(2012, 7, 7, 19, 42, 27, 253000)
        eq_(source.earliest_record_time(), earliest)

        one_day_later = earliest + timedelta(days=1)
        found = source.segment_between(earliest, one_day_later)
        eq_(len(found), 7)

        # querying a range that holds no data returns an empty list
        found = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
        self.assertSequenceEqual(found, [])
def test_segments_between(self):
    """MockSource returns one url per day, suffixed with the segment timestamp."""
    url = 'mock://foo?arg1=a&arg2=b'
    source = datasources.source_for(url)
    assert isinstance(source, mock.MockSource)

    segments = source.segment_between(datetime(2011, 5, 31), datetime(2011, 6, 1))
    eq_(len(segments), 1)
    eq_(segments[0], url + '#2011-05-31T00:00:00')
def test_segments_between(self):
    """A mock:// url sources a MockSource that yields a single dated segment."""
    base = 'mock://foo?arg1=a&arg2=b'
    source = datasources.source_for(base)
    assert isinstance(source, mock.MockSource)

    result = source.segment_between(datetime(2011, 5, 31), datetime(2011, 6, 1))
    eq_(len(result), 1)
    # the single segment is the original url tagged with the day's timestamp
    eq_(result[0], base + '#2011-05-31T00:00:00')
def test_http(self):
    """An http:// url sources an HTTPSource whose segment can be opened as a stream."""
    url = 'http://google.com/'
    source = datasources.source_for(url)
    assert isinstance(source, HTTPSource)

    segments = source.segment_between(datetime(2011, 5, 31), datetime(2011, 6, 1))
    eq_(len(segments), 1)

    params = Params()
    # just exercise stream construction for the single segment
    input_stream = datasources.input_stream_for(None, None, segments[0], params)
def test_http(self):
    """HTTPSource yields exactly one segment and input_stream_for accepts it."""
    target = 'http://google.com/'
    source = datasources.source_for(target)
    assert isinstance(source, HTTPSource)

    found = source.segment_between(datetime(2011, 5, 31), datetime(2011, 6, 1))
    eq_(len(found), 1)

    # building the input stream should not raise
    input_stream = datasources.input_stream_for(None, None, found[0], Params())
def test_sourcing_common_crawl_2012(self):
    """CommonCrawl2012Source enumerates arc.gz segment urls for a time range.

    Fix: leading-zero decimal literals (``07``, ``01``) in the datetime calls
    are a SyntaxError on Python 3 — replaced with plain integers (same values).
    """
    # shut boto's debug messaging up during tests
    logging.getLogger('boto').setLevel(logging.INFO)
    with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
        datasources.load()

        url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/parse-output/segment/'
        source = datasources.source_for(url)
        source.rule = FakeRule()
        eq_(source.__class__, datasources.common_crawl.CommonCrawl2012Source)

        start = datetime(2012, 7, 7, 19, 42, 27, 253000)
        eq_(source.earliest_record_time(), start)

        end = start + timedelta(days=1)
        urls = source.segment_between(start, end)
        eq_(len(urls), 71)
        eq_(
            urls[0],
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690147253/1341708194364_11.arc.gz?'
        )
        eq_(
            urls[-1],
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690150308/1341690944267_36.arc.gz?'
        )

        # days without data should return an empty list
        urls = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
        self.assertSequenceEqual(urls, [])

        urls = source.segment_between(datetime(2012, 9, 7, 0), datetime(2012, 9, 30, 23))
        eq_(len(urls), 10)

        # we should see everything if we query for the whole year
        urls = source.segment_between(datetime(2012, 1, 1, 0), datetime(2012, 12, 31, 23))
        eq_(len(urls), 81)
def test_sourcing_common_crawl_2012(self):
    """2012 parse-output segment urls are sourced via CommonCrawl2012Source.

    Fix: ``datetime(2012, 9, 07, ...)`` and ``datetime(2012, 1, 01, ...)`` use
    leading-zero decimal literals, which Python 3 rejects as a SyntaxError;
    rewritten as plain integers with identical values.
    """
    # shut boto's debug messaging up during tests
    logging.getLogger('boto').setLevel(logging.INFO)
    with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
        datasources.load()

        url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/parse-output/segment/'
        source = datasources.source_for(url)
        source.rule = FakeRule()
        eq_(source.__class__, datasources.common_crawl.CommonCrawl2012Source)

        start = datetime(2012, 7, 7, 19, 42, 27, 253000)
        eq_(source.earliest_record_time(), start)

        end = start + timedelta(days=1)
        urls = source.segment_between(start, end)
        eq_(len(urls), 71)
        eq_(
            urls[0],
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690147253/1341708194364_11.arc.gz?'
        )
        eq_(
            urls[-1],
            'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690150308/1341690944267_36.arc.gz?'
        )

        # days without data should return an empty list
        urls = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
        self.assertSequenceEqual(urls, [])

        urls = source.segment_between(datetime(2012, 9, 7, 0), datetime(2012, 9, 30, 23))
        eq_(len(urls), 10)

        # we should see everything if we query for the whole year
        urls = source.segment_between(datetime(2012, 1, 1, 0), datetime(2012, 12, 31, 23))
        eq_(len(urls), 81)
def test_segments_between(self):
    """RepoSource enumerates every repo:// document regardless of the window."""
    # mimic disco's global "Task" object
    source = datasources.source_for('repo://')
    assert isinstance(source, repo.RepoSource)
    source.rule = FakeRule()

    # todo: scan url's during sourcing
    # create a scheme that defrences during work
    # note: source_segment is called in pipeline not the workers
    found = source.segment_between(datetime(2011, 5, 31), datetime(2011, 6, 1))
    self.assertSequenceEqual(found, [
        'repo://dir1/doc1.txt',
        'repo://dir1/doc2.txt',
        'repo://dir2/doc1.txt',
    ])
def test_sourcing_common_crawl_2010(self):
    """crawl-002 urls are served by CommonCrawlSource, segmented per day.

    Fix: ``datetime(2009, 10, 01, ...)`` / ``datetime(2009, 10, 02, ...)`` use
    leading-zero decimal literals, a SyntaxError on Python 3; rewritten as
    plain integers with identical values.
    """
    # shut boto's debug messaging up during tests
    logging.getLogger('boto').setLevel(logging.INFO)
    datasources.load()

    url = 's3://bogus-key:bogus-secrect@aws-publicdatasets/common-crawl/crawl-002/'
    source = datasources.source_for(url)
    source.rule = FakeRule()
    assert isinstance(source, datasources.common_crawl.CommonCrawlSource)

    start = datetime(2009, 9, 17, 0)
    eq_(source.earliest_record_time(), start)

    end = start + timedelta(days=1)
    urls = source.segment_between(start, end)
    # note the mocks3 connection doesn't bother to actually sign the urls
    self.assertSequenceEqual(urls, [
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_0.arc.gz?',
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_1.arc.gz?',
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_2.arc.gz?',
    ])

    # days without data should return an empty list
    urls = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
    self.assertSequenceEqual(urls, [])

    urls = source.segment_between(datetime(2009, 10, 1, 0), datetime(2009, 10, 2, 0))
    self.assertSequenceEqual(urls, [
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/10/01/0/1253228619531_0.arc.gz?'
    ])
def test_segments_between(self):
    """A repo:// url sources a RepoSource listing the known documents."""
    # mimic disco's global "Task" object
    url = 'repo://'
    source = datasources.source_for(url)
    assert isinstance(source, repo.RepoSource)
    source.rule = FakeRule()

    # todo: scan url's during sourcing
    # create a scheme that defrences during work
    # note: source_segment is called in pipeline not the workers
    docs = source.segment_between(datetime(2011, 5, 31), datetime(2011, 6, 1))
    expected = [
        'repo://dir1/doc1.txt',
        'repo://dir1/doc2.txt',
        'repo://dir2/doc1.txt',
    ]
    self.assertSequenceEqual(docs, expected)