class DDFSReadTestCase(DiscoTestCase):
    """Read-path tests for DDFS.

    setUp pushes two named blobs plus one empty blob and creates a plain
    tag and a self-referencing metatag; the tests then exercise blobs,
    pull, exists, findtags, get, list and walk against that fixture.
    """

    def setUp(self):
        self.ddfs = DDFS(self.disco_master_url)
        # Fixture blobs: two under the same tag, one empty under its own.
        for tag, data, name in (('disco:test:blobs', 'datablob', 'blobdata'),
                                ('disco:test:blobs', 'datablob2', 'blobdata2'),
                                ('disco:test:emptyblob', '', 'empty')):
            self.ddfs.push(tag, [(StringIO(data), name)])
        self.ddfs.tag('disco:test:tag', [['urls']])
        # Metatag referencing both the plain tag and itself.
        self.ddfs.tag('disco:test:metatag',
                      [['tag://disco:test:tag'], ['tag://disco:test:metatag']])

    def test_blobs(self):
        from os.path import basename
        found = list(self.ddfs.blobs('disco:test:blobs'))
        self.assert_(basename(found[0][0]).startswith('blobdata'))
        # A missing tag is a 404 when ignore_missing is off, [] otherwise.
        self.assertCommErrorCode(
            404,
            lambda: list(self.ddfs.blobs('disco:test:notag',
                                         ignore_missing=False)))
        self.assertEquals(list(self.ddfs.blobs('disco:test:notag')), [])

    def test_pull(self):
        def pulled(tag, **kwargs):
            # (name, contents) pairs in the order the server returns them.
            return [(self.ddfs.blob_name(url), fd.read())
                    for fd, sze, url in self.ddfs.pull(tag, **kwargs)]
        self.assertEquals(pulled('disco:test:blobs'),
                          [('blobdata2', 'datablob2'),
                           ('blobdata', 'datablob')])
        self.assertEquals(pulled('disco:test:blobs',
                                 blobfilter=lambda b: '2' in b),
                          [('blobdata2', 'datablob2')])
        # The empty blob reports size 0 and reads back as ''.
        self.assertEquals([(sze, fd.read())
                           for fd, sze, url
                           in self.ddfs.pull('disco:test:emptyblob')],
                          [(0, '')])
        self.assertCommErrorCode(404, self.ddfs.pull('disco:test:notag').next)

    def test_exists(self):
        # Both bare tag names and tag:// URLs resolve.
        for tag, expected in (('disco:test:tag', True),
                              ('disco:test:notag', False),
                              ('tag://disco:test:tag', True),
                              ('tag://disco:test:notag', False)):
            self.assertEquals(self.ddfs.exists(tag), expected)

    def test_findtags(self):
        list(self.ddfs.findtags(['disco:test:metatag']))

    def test_get(self):
        self.assertCommErrorCode(404,
                                 lambda: self.ddfs.get('disco:test:notag'))
        # get accepts a single tag name or a list of names.
        for spec in ('disco:test:tag', ['disco:test:tag']):
            self.assertEquals(self.ddfs.get(spec)['urls'], [['urls']])

    def test_list(self):
        everything = self.ddfs.list()
        self.assert_('disco:test:tag' in everything)
        prefixed = self.ddfs.list('disco:test')
        self.assert_('disco:test:tag' in prefixed)
        self.assertEquals(self.ddfs.list('disco:test:notag'), [])

    def test_walk(self):
        list(self.ddfs.walk('disco:test:tag'))

    def tearDown(self):
        for tag in ('disco:test:blobs',
                    'disco:test:emptyblob',
                    'disco:test:tag',
                    'disco:test:metatag'):
            self.ddfs.delete(tag)
def delete_all():
    '''
    Deletes all tags in DDFS, thus orphaning all blobs and making them
    subject to eventual removal by the garbage collector.
    '''
    client = DDFS()
    for name in client.list():
        client.delete(name)
class DdfsGcTests(TestCase):
    """Exercise the DDFS garbage collector.

    Each test pushes/chunks COUNT blobs under a tag prefix, forces a GC
    run on the master, then verifies every tag still resolves to exactly
    one blob with at least one live replica.
    """

    def setUp(self):
        self.d = DDFS()
        # Ensure no GC run from a previous test is still in flight.
        wait_for_gc_to_finish(self.d)
        # Small fixture file that every push/chunk uploads.
        with open(FILE, 'w') as f:
            print >>f, "hello world!"

    def _test_push(self, prefix, func):
        """Push COUNT tags via func, run GC, then check every tag still
        holds exactly one blob with a non-empty replica list."""
        for i in range(COUNT):
            func(prefix + str(i), [FILE])
        # Kick off a GC cycle explicitly and wait for it, so the
        # assertions below observe post-GC state.
        self.d._download(self.d.master + "/ddfs/ctrl/gc_start")
        wait_for_gc_to_finish(self.d)
        for i in range(COUNT):
            blobs = [b for b in self.d.blobs(prefix + str(i))]
            self.assertEquals(len(blobs), 1)
            self.assertGreater(len(blobs[0]), 0)

    def test_push_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.push)

    def test_push_same_tag(self):
        self._test_push(PREFIX, self.d.push)

    def test_chunk_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.chunk)

    def test_chunk_same_tag(self):
        self._test_push(PREFIX, self.d.chunk)

    def test_chunk_delayed(self):
        self._test_push(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_delayed(self):
        self._test_push(PREFIX, partial(self.d.push, delayed=True))

    def test_chunk_none_replicas(self):
        self._test_push(PREFIX, partial(self.d.chunk, replicas=None))

    def _test_func_tag(self, prefix, func):
        """Like _test_push, but each pushed blob set is also recorded
        under a companion "<name>tag" tag that must survive GC too."""
        def chunk_tag(name, input):
            _, blob_set = func(name, input)
            self.d.tag(name + "tag", blob_set)
        # BUG FIX: was hard-coded PREFIX, ignoring the prefix parameter;
        # the "<prefix><i>tag" checks below would then miss for any
        # caller passing a different prefix. All current callers pass
        # PREFIX, so behavior is unchanged for them.
        self._test_push(prefix, chunk_tag)
        for i in range(COUNT):
            blobs = [b for b in self.d.blobs(prefix + str(i) + "tag")]
            self.assertEquals(len(blobs), 1)
            self.assertGreater(len(blobs[0]), 0)

    def test_chunk_tag(self):
        self._test_func_tag(PREFIX, self.d.chunk)

    def test_chunk_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_tag(self):
        self._test_func_tag(PREFIX, self.d.push)

    def test_push_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.push, delayed=True))

    def tearDown(self):
        # Remove every tag this test run created.
        tags = self.d.list(PREFIX)
        for tag in tags:
            self.d.delete(tag)
def list_by_tag(tag):
    """List all blobs pushed to DDFS by tag"""
    return DDFS().list(tag)
# NOTE(review): this class is a near-identical duplicate of a DdfsGcTests
# defined earlier in this file; this later definition shadows the earlier
# one at import time. Consider deleting one of the two copies.
class DdfsGcTests(TestCase):
    """Exercise the DDFS garbage collector.

    Each test pushes/chunks COUNT blobs under a tag prefix, forces a GC
    run on the master, then verifies every tag still resolves to exactly
    one blob with at least one live replica.
    """

    def setUp(self):
        self.d = DDFS()
        # Ensure no GC run from a previous test is still in flight.
        wait_for_gc_to_finish(self.d)
        # Small fixture file that every push/chunk uploads.
        with open(FILE, 'w') as f:
            print >> f, "hello world!"

    def _test_push(self, prefix, func):
        """Push COUNT tags via func, run GC, then check every tag still
        holds exactly one blob with a non-empty replica list."""
        for i in range(COUNT):
            func(prefix + str(i), [FILE])
        # Kick off a GC cycle explicitly and wait for it, so the
        # assertions below observe post-GC state.
        self.d._download(self.d.master + "/ddfs/ctrl/gc_start")
        wait_for_gc_to_finish(self.d)
        for i in range(COUNT):
            blobs = [b for b in self.d.blobs(prefix + str(i))]
            self.assertEquals(len(blobs), 1)
            self.assertGreater(len(blobs[0]), 0)

    def test_push_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.push)

    def test_push_same_tag(self):
        self._test_push(PREFIX, self.d.push)

    def test_chunk_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.chunk)

    def test_chunk_same_tag(self):
        self._test_push(PREFIX, self.d.chunk)

    def test_chunk_delayed(self):
        self._test_push(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_delayed(self):
        self._test_push(PREFIX, partial(self.d.push, delayed=True))

    def test_chunk_none_replicas(self):
        self._test_push(PREFIX, partial(self.d.chunk, replicas=None))

    def _test_func_tag(self, prefix, func):
        """Like _test_push, but each pushed blob set is also recorded
        under a companion "<name>tag" tag that must survive GC too."""
        def chunk_tag(name, input):
            _, blob_set = func(name, input)
            self.d.tag(name + "tag", blob_set)
        # BUG FIX: was hard-coded PREFIX, ignoring the prefix parameter;
        # the "<prefix><i>tag" checks below would then miss for any
        # caller passing a different prefix. All current callers pass
        # PREFIX, so behavior is unchanged for them.
        self._test_push(prefix, chunk_tag)
        for i in range(COUNT):
            blobs = [b for b in self.d.blobs(prefix + str(i) + "tag")]
            self.assertEquals(len(blobs), 1)
            self.assertGreater(len(blobs[0]), 0)

    def test_chunk_tag(self):
        self._test_func_tag(PREFIX, self.d.chunk)

    def test_chunk_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_tag(self):
        self._test_func_tag(PREFIX, self.d.push)

    def test_push_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.push, delayed=True))

    def tearDown(self):
        # Remove every tag this test run created.
        tags = self.d.list(PREFIX)
        for tag in tags:
            self.d.delete(tag)
# NOTE(review): the first three statements are the tail of get_days(),
# whose header lies outside this chunk. Original indentation was lost in
# formatting; the two cur_date statements presumably sit inside the loop
# that builds `days` (advance cur_date by one day, back to ISO format),
# with `return days` at function level — confirm against the full file.
        cur_date = datetime.strptime(cur_date, "%Y-%m-%d") + timedelta(days=1)
        cur_date = cur_date.strftime("%Y-%m-%d")
    return days


if __name__ == "__main__":
    # Script entry: run the TweetFilter Disco job over every "enriched:"
    # DDFS tag between two dates and write the filtered tweets to a file.
    from twitter_filter import TweetFilter
    from disco.core import result_iterator
    from disco.ddfs import DDFS
    import sys

    start_day = sys.argv[1]     # inclusive start date, "YYYY-MM-DD"
    end_day = sys.argv[2]       # end date, "YYYY-MM-DD"
    keyword_file = sys.argv[3]  # JSON file with job params (keywords)

    ddfs = DDFS()
    days = get_days(start_day, end_day)
    # Collect every DDFS tag named "enriched:<day>" across the range.
    tags = []
    for day in days:
        tags = tags + ddfs.list("enriched:%s" % day)
    job_name = "Tweet_filter"
    params = json.load(open(keyword_file))
    inputs = [("tag://%s") % tag for tag in tags]
    print "Days[%d], Files[%d]" % (len(days), len(inputs))
    # One partition per day; block until the job completes.
    job = TweetFilter().run(input=inputs, partitions=len(days),
                            params=params, name=job_name)
    result = job.wait(show=False)
    out_file = "filtered_tweet_company.txt"
    with open(out_file, "w") as ow:
        # Values only; keys from the result iterator are discarded.
        for k, v in result_iterator(result):
            ow.write(v + "\n")