Beispiel #1
0
class DDFSReadTestCase(DiscoTestCase):
    """Exercise the read-side DDFS client calls against fixture tags.

    ``setUp`` creates a handful of tags under the ``disco:test:`` prefix and
    ``tearDown`` deletes them again, so every test sees the same data.
    """

    def setUp(self):
        self.ddfs = DDFS(self.disco_master_url)
        # Two pushes to the same tag; test_pull expects both blobs back with
        # the later push listed first.
        self.ddfs.push('disco:test:blobs', [(StringIO('datablob'), 'blobdata')])
        self.ddfs.push('disco:test:blobs', [(StringIO('datablob2'), 'blobdata2')])
        # A zero-length blob, used to check that pull reports size 0.
        self.ddfs.push('disco:test:emptyblob', [(StringIO(''), 'empty')])
        self.ddfs.tag('disco:test:tag', [['urls']])
        # The metatag links to the plain tag and to itself.
        self.ddfs.tag('disco:test:metatag',
                      [['tag://disco:test:tag'], ['tag://disco:test:metatag']])

    def test_blobs(self):
        from os.path import basename
        blobs = list(self.ddfs.blobs('disco:test:blobs'))
        self.assertTrue(basename(blobs[0][0]).startswith('blobdata'))
        # A missing tag is an error only when ignore_missing is switched off;
        # with the default call it simply yields nothing.
        self.assertCommErrorCode(404,
                                 lambda: list(self.ddfs.blobs('disco:test:notag',
                                                              ignore_missing=False)))
        self.assertEqual(list(self.ddfs.blobs('disco:test:notag')), [])

    def test_pull(self):
        self.assertEqual([(self.ddfs.blob_name(url), fd.read())
                          for fd, sze, url in self.ddfs.pull('disco:test:blobs')],
                         [('blobdata2', 'datablob2'), ('blobdata', 'datablob')])
        self.assertEqual([(self.ddfs.blob_name(url), fd.read())
                          for fd, sze, url in self.ddfs.pull('disco:test:blobs',
                                                             blobfilter=lambda b: '2' in b)],
                         [('blobdata2', 'datablob2')])
        self.assertEqual([(sze, fd.read()) for fd, sze, url in
                          self.ddfs.pull('disco:test:emptyblob')], [(0, '')])
        # The pull result is created up front and only advanced inside the
        # assertion, matching the original call order. The builtin next()
        # works with both the Python 2 and Python 3 iterator protocols.
        missing = self.ddfs.pull('disco:test:notag')
        self.assertCommErrorCode(404, lambda: next(missing))

    def test_exists(self):
        # Both the bare tag name and its tag:// form are accepted.
        self.assertEqual(self.ddfs.exists('disco:test:tag'), True)
        self.assertEqual(self.ddfs.exists('disco:test:notag'), False)
        self.assertEqual(self.ddfs.exists('tag://disco:test:tag'), True)
        self.assertEqual(self.ddfs.exists('tag://disco:test:notag'), False)

    def test_findtags(self):
        # Only checks that fully iterating the result does not raise.
        list(self.ddfs.findtags(['disco:test:metatag']))

    def test_get(self):
        self.assertCommErrorCode(404, lambda: self.ddfs.get('disco:test:notag'))
        # get() takes the tag either as a string or wrapped in a list.
        self.assertEqual(self.ddfs.get('disco:test:tag')['urls'], [['urls']])
        self.assertEqual(self.ddfs.get(['disco:test:tag'])['urls'], [['urls']])

    def test_list(self):
        self.assertTrue('disco:test:tag' in self.ddfs.list())
        self.assertTrue('disco:test:tag' in self.ddfs.list('disco:test'))
        self.assertEqual(self.ddfs.list('disco:test:notag'), [])

    def test_walk(self):
        # Only checks that fully iterating the result does not raise.
        list(self.ddfs.walk('disco:test:tag'))

    def tearDown(self):
        # Remove every tag created in setUp.
        self.ddfs.delete('disco:test:blobs')
        self.ddfs.delete('disco:test:emptyblob')
        self.ddfs.delete('disco:test:tag')
        self.ddfs.delete('disco:test:metatag')
Beispiel #2
0
def delete_all():
    """Delete every tag that DDFS lists.

    As the original author notes, this orphans all blobs, which makes them
    subject to eventual removal by the garbage collector.
    """
    client = DDFS()
    tag_names = client.list()
    for name in tag_names:
        client.delete(name)
def delete_all():
    """Remove all tags from DDFS, one delete call per listed tag.

    Per the original note, the blobs are orphaned by this and become subject
    to eventual removal by the garbage collector.
    """
    store = DDFS()
    for tag_name in store.list():
        store.delete(tag_name)
Beispiel #4
0
class DdfsGcTests(TestCase):
    """Check that pushed or chunked data is still listed after a GC run.

    Every test stores ``COUNT`` copies of ``FILE`` under numbered tags,
    requests a garbage-collection run, and then verifies that each tag still
    resolves to exactly one non-empty blob entry.
    """

    def setUp(self):
        self.d = DDFS()
        # Presumably blocks until no GC run is in progress; the helper is
        # defined outside this class.
        wait_for_gc_to_finish(self.d)
        with open(FILE, 'w') as f:
            # Same bytes as the former ``print >>f`` statement, but valid
            # on both Python 2 and Python 3.
            f.write("hello world!\n")

    def _assert_single_blob(self, tag):
        """Assert that ``tag`` lists exactly one blob with a non-empty entry."""
        blobs = list(self.d.blobs(tag))
        self.assertEqual(len(blobs), 1)
        self.assertGreater(len(blobs[0]), 0)

    def _test_push(self, prefix, func):
        """Store data with ``func`` under ``prefix + i``, run GC, then verify.

        ``func`` is called as ``func(tag_name, [FILE])`` for each ``i`` in
        ``range(COUNT)``.
        """
        for i in range(COUNT):
            func(prefix + str(i), [FILE])
        # Ask the master to start a GC run, then wait for it to complete.
        self.d._download(self.d.master + "/ddfs/ctrl/gc_start")

        wait_for_gc_to_finish(self.d)
        for i in range(COUNT):
            self._assert_single_blob(prefix + str(i))

    def test_push_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.push)

    def test_push_same_tag(self):
        self._test_push(PREFIX, self.d.push)

    def test_chunk_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.chunk)

    def test_chunk_same_tag(self):
        self._test_push(PREFIX, self.d.chunk)

    def test_chunk_delayed(self):
        self._test_push(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_delayed(self):
        self._test_push(PREFIX, partial(self.d.push, delayed=True))

    def test_chunk_none_replicas(self):
        self._test_push(PREFIX, partial(self.d.chunk, replicas=None))

    def _test_func_tag(self, prefix, func):
        """Like ``_test_push``, but also re-tag each result as ``<name>tag``.

        ``func`` must return a pair whose second item is the blob set that
        gets attached to the extra tag.
        """
        def store_and_tag(name, sources):
            _, blob_set = func(name, sources)
            self.d.tag(name + "tag", blob_set)

        # Push under the caller's prefix (this used to be the global PREFIX),
        # so the tags written here are the ones verified below.
        self._test_push(prefix, store_and_tag)

        for i in range(COUNT):
            self._assert_single_blob(prefix + str(i) + "tag")

    def test_chunk_tag(self):
        self._test_func_tag(PREFIX, self.d.chunk)

    def test_chunk_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_tag(self):
        self._test_func_tag(PREFIX, self.d.push)

    def test_push_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.push, delayed=True))

    def tearDown(self):
        # Delete every tag listed under PREFIX, including the uuid-suffixed
        # and "...tag" variants created by the tests.
        for tag in self.d.list(PREFIX):
            self.d.delete(tag)
Beispiel #5
0
def list_by_tag(tag):
    """Return the result of ``DDFS.list`` for ``tag``.

    The argument is handed to ``DDFS.list`` unchanged and its result is
    returned as-is.

    NOTE(review): presumably ``DDFS.list`` treats the argument as a tag-name
    prefix and returns matching tag names rather than blobs -- confirm
    against the Disco DDFS documentation.
    """
    client = DDFS()
    listing = client.list(tag)
    return listing
Beispiel #6
0
class DdfsGcTests(TestCase):
    """Check that pushed or chunked data is still listed after a GC run.

    Every test stores ``COUNT`` copies of ``FILE`` under numbered tags,
    requests a garbage-collection run, and then verifies that each tag still
    resolves to exactly one non-empty blob entry.
    """

    def setUp(self):
        self.d = DDFS()
        # Presumably blocks until no GC run is in progress; the helper is
        # defined outside this class.
        wait_for_gc_to_finish(self.d)
        with open(FILE, 'w') as f:
            # Same bytes as the former ``print >> f`` statement, but valid
            # on both Python 2 and Python 3.
            f.write("hello world!\n")

    def _assert_single_blob(self, tag):
        """Assert that ``tag`` lists exactly one blob with a non-empty entry."""
        blobs = list(self.d.blobs(tag))
        self.assertEqual(len(blobs), 1)
        self.assertGreater(len(blobs[0]), 0)

    def _test_push(self, prefix, func):
        """Store data with ``func`` under ``prefix + i``, run GC, then verify.

        ``func`` is called as ``func(tag_name, [FILE])`` for each ``i`` in
        ``range(COUNT)``.
        """
        for i in range(COUNT):
            func(prefix + str(i), [FILE])
        # Ask the master to start a GC run, then wait for it to complete.
        self.d._download(self.d.master + "/ddfs/ctrl/gc_start")

        wait_for_gc_to_finish(self.d)
        for i in range(COUNT):
            self._assert_single_blob(prefix + str(i))

    def test_push_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.push)

    def test_push_same_tag(self):
        self._test_push(PREFIX, self.d.push)

    def test_chunk_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.chunk)

    def test_chunk_same_tag(self):
        self._test_push(PREFIX, self.d.chunk)

    def test_chunk_delayed(self):
        self._test_push(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_delayed(self):
        self._test_push(PREFIX, partial(self.d.push, delayed=True))

    def test_chunk_none_replicas(self):
        self._test_push(PREFIX, partial(self.d.chunk, replicas=None))

    def _test_func_tag(self, prefix, func):
        """Like ``_test_push``, but also re-tag each result as ``<name>tag``.

        ``func`` must return a pair whose second item is the blob set that
        gets attached to the extra tag.
        """
        def store_and_tag(name, sources):
            _, blob_set = func(name, sources)
            self.d.tag(name + "tag", blob_set)

        # Push under the caller's prefix (this used to be the global PREFIX),
        # so the tags written here are the ones verified below.
        self._test_push(prefix, store_and_tag)

        for i in range(COUNT):
            self._assert_single_blob(prefix + str(i) + "tag")

    def test_chunk_tag(self):
        self._test_func_tag(PREFIX, self.d.chunk)

    def test_chunk_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_tag(self):
        self._test_func_tag(PREFIX, self.d.push)

    def test_push_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.push, delayed=True))

    def tearDown(self):
        # Delete every tag listed under PREFIX, including the uuid-suffixed
        # and "...tag" variants created by the tests.
        for tag in self.d.list(PREFIX):
            self.d.delete(tag)
        cur_date = datetime.strptime(cur_date, "%Y-%m-%d") + timedelta(days=1)
        cur_date = cur_date.strftime("%Y-%m-%d")
    return days


if __name__ == "__main__":
    # Command-line driver:
    #     <script> START_DAY END_DAY KEYWORD_FILE
    # Collects the "enriched:<day>" DDFS tags for every day returned by
    # get_days(), runs the TweetFilter job over them with the JSON keyword
    # file as job params, and writes each result value on its own line to
    # filtered_tweet_company.txt.
    from twitter_filter import TweetFilter
    from disco.core import result_iterator
    from disco.ddfs import DDFS
    import sys

    # Fail with a usage line rather than a bare IndexError.
    if len(sys.argv) < 4:
        sys.exit("usage: %s START_DAY END_DAY KEYWORD_FILE" % sys.argv[0])
    start_day, end_day, keyword_file = sys.argv[1:4]

    ddfs = DDFS()
    days = get_days(start_day, end_day)
    tags = []
    for day in days:
        # Extend in place instead of rebuilding the list on every iteration.
        tags.extend(ddfs.list("enriched:%s" % day))
    job_name = "Tweet_filter"
    # Close the keyword file deterministically instead of leaking the handle.
    with open(keyword_file) as keyword_fh:
        params = json.load(keyword_fh)
    inputs = ["tag://%s" % tag for tag in tags]
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Days[%d], Files[%d]" % (len(days), len(inputs)))
    job = TweetFilter().run(input=inputs, partitions=len(days), params=params, name=job_name)
    result = job.wait(show=False)
    out_file = "filtered_tweet_company.txt"
    with open(out_file, "w") as ow:
        # Only the values are written; the keys are discarded.
        for _key, value in result_iterator(result):
            ow.write(value + "\n")