def test_sourcing_common_crawl_2012_metadata(self):
    # shut boto's debug messaging up during tests
    logging.getLogger('boto').setLevel(logging.INFO)

    with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
        datasources.load()
        url = 's3://bogus-key:bogus-secret@aws-publicdatasets/common-crawl/parse-output/segment?metadata'
        source = datasources.source_for(url)
        source.rule = FakeRule()

        eq_(source.__class__, datasources.common_crawl.CommonCrawl2012MetadataSource)

        start = datetime(2012, 7, 7, 19, 42, 27, 253000)
        eq_(source.earliest_record_time(), start)

        end = start + timedelta(days=1)
        urls = source.segment_between(start, end)
        eq_(len(urls), 7)

        # days without data should return an empty list
        urls = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
        self.assertSequenceEqual(urls, [])
def test_sourcing_common_crawl_2010(self):
    # shut boto's debug messaging up during tests
    logging.getLogger('boto').setLevel(logging.INFO)

    datasources.load()
    url = 's3://bogus-key:bogus-secret@aws-publicdatasets/common-crawl/crawl-002/'
    source = datasources.source_for(url)
    source.rule = FakeRule()
    assert isinstance(source, datasources.common_crawl.CommonCrawlSource)

    start = datetime(2009, 9, 17, 0)
    eq_(source.earliest_record_time(), start)

    end = start + timedelta(days=1)
    urls = source.segment_between(start, end)

    # note: the mock s3 connection doesn't bother to actually sign the urls
    self.assertSequenceEqual(urls, [
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_0.arc.gz?',
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_1.arc.gz?',
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/09/17/0/1253228619531_2.arc.gz?',
    ])

    # days without data should return an empty list
    urls = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
    self.assertSequenceEqual(urls, [])

    urls = source.segment_between(datetime(2009, 10, 1, 0), datetime(2009, 10, 2, 0))
    self.assertSequenceEqual(urls, [
        'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-002/2009/10/01/0/1253228619531_0.arc.gz?'
    ])
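# The tests above depend on two helpers that aren't shown in this file:
# FakeRule and self.s3_mock(). Below is a minimal, hypothetical sketch of
# what they could look like, assuming the sources only need a bucket listing
# and unsigned urls (the bare '?' on the expected urls suggests the mock
# skips query-string signing). None of these names or signatures come from
# the actual project; s3_mock() would be a TestCase method returning a
# MockS3Connection preloaded with the fixture keys the assertions expect.

class FakeRule(object):
    # stand-in for the rule object the sources consult; attributes omitted
    pass


class MockBucket(object):
    def __init__(self, name, key_names):
        self.name = name
        self.key_names = key_names

    def list(self, prefix=''):
        # return only the canned key names under the requested prefix
        return [k for k in self.key_names if k.startswith(prefix)]


class MockS3Connection(object):
    def __init__(self, buckets):
        self.buckets = buckets

    def get_bucket(self, name):
        return self.buckets[name]

    def generate_url(self, expires_in, method, bucket, key, query_auth=False):
        # unsigned on purpose, hence the trailing '?' in the expected urls
        return 'http://%s.s3.amazonaws.com/%s?' % (bucket, key)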
def map_input_stream(stream, size, url, params):
    import disco.func
    from triv.io import datasources, task

    datasources.load()

    # Note: Task is a global set by disco; we push it onto the context stack,
    # which allows it to be imported by the modules that need it
    task.push(Task)

    input_stream = datasources.input_stream_for(stream, size, url, params)
    if input_stream:
        return input_stream
    else:
        # we don't handle the given url; see if the vanilla disco modules can
        task.pop()  # this is normally cleared when we're done iterating
        return disco.func.map_input_stream(stream, size, url, params)
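# For context, a sketch of how a stream opener like map_input_stream above is
# typically chained into a disco job. The input url and mapper here are
# illustrative assumptions; Job, result_iterator, and the map_input_stream
# job parameter are standard disco API.

from disco.core import Job, result_iterator

def mapper(record, params):
    yield record, 1

job = Job().run(
    input=['s3://bogus-key:bogus-secret@aws-publicdatasets/common-crawl/crawl-002/'],
    map=mapper,
    # put our opener at the front so triv.io sources get first crack at the
    # url; it falls back to disco.func.map_input_stream for anything else
    map_input_stream=[map_input_stream],
)
for key, count in result_iterator(job.wait()):
    print key, count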
def test_sourcing_common_crawl_2012(self):
    # shut boto's debug messaging up during tests
    logging.getLogger('boto').setLevel(logging.INFO)

    with patch('boto.connect_s3', lambda a1, a2: self.s3_mock()):
        datasources.load()
        url = 's3://bogus-key:bogus-secret@aws-publicdatasets/common-crawl/parse-output/segment/'
        source = datasources.source_for(url)
        source.rule = FakeRule()

        eq_(source.__class__, datasources.common_crawl.CommonCrawl2012Source)

        start = datetime(2012, 7, 7, 19, 42, 27, 253000)
        eq_(source.earliest_record_time(), start)

        end = start + timedelta(days=1)
        urls = source.segment_between(start, end)
        eq_(len(urls), 71)
        eq_(urls[0], 'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690147253/1341708194364_11.arc.gz?')
        eq_(urls[-1], 'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/parse-output/segment/1341690150308/1341690944267_36.arc.gz?')

        # days without data should return an empty list
        urls = source.segment_between(datetime(2009, 9, 21, 0), datetime(2009, 9, 22, 0))
        self.assertSequenceEqual(urls, [])

        urls = source.segment_between(datetime(2012, 9, 7, 0), datetime(2012, 9, 30, 23))
        eq_(len(urls), 10)

        # we should see everything if we query for the whole year
        urls = source.segment_between(datetime(2012, 1, 1, 0), datetime(2012, 12, 31, 23))
        eq_(len(urls), 81)
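# Worth noting: the 2012 segment directories are named with millisecond epoch
# timestamps, which is presumably how segment_between() maps datetimes onto
# key prefixes. The first expected url above lives under segment
# 1341690147253, and that number decodes exactly to the earliest_record_time
# the test asserts:

from datetime import datetime

ms = 1341690147253
assert datetime.utcfromtimestamp(ms // 1000).replace(microsecond=(ms % 1000) * 1000) \
    == datetime(2012, 7, 7, 19, 42, 27, 253000)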
def setUp(self):
    datasources.load()