Ejemplo n.º 1
0
class DDFSReadTestCase(DiscoTestCase):
    """Read-side DDFS operations: blobs, pull, exists, findtags, get,
    list and walk."""

    def setUp(self):
        # Seed DDFS with two named blobs, an empty blob, a plain tag and
        # a metatag (a tag whose urls reference tags, including itself).
        self.ddfs = DDFS(self.disco_master_url)
        self.ddfs.push('disco:test:blobs', [(StringIO('datablob'), 'blobdata')])
        self.ddfs.push('disco:test:blobs', [(StringIO('datablob2'), 'blobdata2')])
        self.ddfs.push('disco:test:emptyblob', [(StringIO(''), 'empty')])
        self.ddfs.tag('disco:test:tag', [['urls']])
        self.ddfs.tag('disco:test:metatag',
                      [['tag://disco:test:tag'], ['tag://disco:test:metatag']])

    def test_blobs(self):
        from os.path import basename
        # Blob urls carry the original blob name as a basename prefix.
        replicas = list(self.ddfs.blobs('disco:test:blobs'))
        self.assertTrue(basename(replicas[0][0]).startswith('blobdata'))
        # A missing tag raises 404 when ignore_missing is disabled...
        self.assertCommErrorCode(
            404,
            lambda: list(self.ddfs.blobs('disco:test:notag',
                                         ignore_missing=False)))
        # ...and yields nothing when it is left at the default.
        self.assertEqual(list(self.ddfs.blobs('disco:test:notag')), [])

    def test_pull(self):
        def pulled(tag, **kwargs):
            return [(self.ddfs.blob_name(url), fd.read())
                    for fd, size, url in self.ddfs.pull(tag, **kwargs)]
        # Blobs come back newest first.
        self.assertEqual(pulled('disco:test:blobs'),
                         [('blobdata2', 'datablob2'), ('blobdata', 'datablob')])
        # blobfilter restricts the pull by blob name.
        self.assertEqual(pulled('disco:test:blobs',
                                blobfilter=lambda b: '2' in b),
                         [('blobdata2', 'datablob2')])
        # Empty blobs report a zero size and empty content.
        self.assertEqual([(size, fd.read()) for fd, size, url in
                          self.ddfs.pull('disco:test:emptyblob')], [(0, '')])
        self.assertCommErrorCode(
            404, lambda: next(self.ddfs.pull('disco:test:notag')))

    def test_exists(self):
        # Tags can be addressed bare or via an explicit tag:// scheme.
        for tag, expected in [('disco:test:tag', True),
                              ('disco:test:notag', False),
                              ('tag://disco:test:tag', True),
                              ('tag://disco:test:notag', False)]:
            self.assertEqual(self.ddfs.exists(tag), expected)

    def test_findtags(self):
        # Smoke test: resolving the metatag graph must not raise.
        list(self.ddfs.findtags(['disco:test:metatag']))

    def test_get(self):
        self.assertCommErrorCode(404, lambda: self.ddfs.get('disco:test:notag'))
        # get() accepts either a tag name or a one-element list of names.
        self.assertEqual(self.ddfs.get('disco:test:tag')['urls'], [['urls']])
        self.assertEqual(self.ddfs.get(['disco:test:tag'])['urls'], [['urls']])

    def test_list(self):
        self.assertTrue('disco:test:tag' in self.ddfs.list())
        self.assertTrue('disco:test:tag' in self.ddfs.list('disco:test'))
        self.assertEqual(self.ddfs.list('disco:test:notag'), [])

    def test_walk(self):
        # Smoke test: walking a tag must not raise.
        list(self.ddfs.walk('disco:test:tag'))

    def tearDown(self):
        for tag in ('disco:test:blobs', 'disco:test:emptyblob',
                    'disco:test:tag', 'disco:test:metatag'):
            self.ddfs.delete(tag)
Ejemplo n.º 2
0
    def __tag_results(self, results):
        # Re-tag the blobs behind the temporary results tag under this
        # docset's permanent link-file tag, then drop the temporary tag.
        from disco.ddfs import DDFS
        ddfs = DDFS()
        temp_tag = results[0]
        ddfs.put(self.docset.ddfs_link_file_tag, list(ddfs.blobs(temp_tag)))

        # remove old, temporary tag
        ddfs.delete(temp_tag)
Ejemplo n.º 3
0
class DDFSUpdateTestCase(DiscoTestCase):
    """Pushes with update=True must deduplicate blobs by name."""

    # Shared payload for every push; only the blob names matter here.
    data = StringIO('blobdata')

    def setUp(self):
        self.ddfs = DDFS(self.disco_master_url)

    def blobnames(self, tag):
        # Names of the first replica of each blob under tag, reversed.
        names = [DDFS.blob_name(replicas[0])
                 for replicas in self.ddfs.blobs(tag)]
        names.reverse()
        return names

    def test_update(self):
        # Repeated updating pushes of the same name collapse to one blob.
        for _ in range(5):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, 'dup')] * 2,
                           update=True)
        self.assertEqual(len(self.blobnames('disco:test:blobs')), 1)
        # Same with delayed commits: only one new name is added.
        for _ in range(5):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, 'dup2')],
                           update=True,
                           delayed=True)
        self.assertEqual(len(self.blobnames('disco:test:blobs')), 2)
        self.ddfs.delete('disco:test:blobs')

    def test_random(self):
        import random
        keys = [str(random.randint(1, 100)) for _ in range(1000)]
        unique = []
        for key in keys:
            self.ddfs.push('disco:test:blobs', [(self.data, key)], update=True)
            if key not in unique:
                unique.append(key)
        # One blob per distinct name, in first-seen order.
        self.assertEqual(unique, self.blobnames('disco:test:blobs'))
        self.ddfs.delete('disco:test:blobs')

    def test_mixed(self):
        expected = []
        # Plain pushes keep duplicates...
        for key in map(str, range(10)):
            self.ddfs.push('disco:test:blobs', [(self.data, key)] * 2)
            expected.extend([key, key])
        # ...updating pushes only add names not already present...
        for key in map(str, range(15)):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, key)] * 2,
                           update=True)
            if int(key) > 9:
                expected.append(key)
        # ...and delayed plain pushes keep duplicates again.
        for key in map(str, range(10)):
            self.ddfs.push('disco:test:blobs',
                           [(self.data, key)] * 2,
                           delayed=True)
            expected.extend([key, key])
        self.assertEqual(expected, self.blobnames('disco:test:blobs'))
        self.ddfs.delete('disco:test:blobs')

    def tearDown(self):
        self.ddfs.delete('disco:test:blobs')
Ejemplo n.º 4
0
class DDFSWriteTestCase(DiscoTestCase):
    """Write-side DDFS operations: chunk, push, tag, put and delete."""

    def setUp(self):
        self.ddfs = DDFS(self.disco_master_url)

    def test_chunk(self):
        from disco.core import RecordIter
        url = 'http://discoproject.org/media/text/chekhov.txt'
        self.ddfs.chunk('disco:test:chunk', [url], chunk_size=100*1024)
        # chunk_size bounds how many chunks the text is split into.
        self.assert_(0 < len(list(self.ddfs.blobs('disco:test:chunk'))) <= 4)
        # BUG FIX: the original used assert_(a, b); there the second list is
        # only the failure *message*, so the intended record-by-record
        # comparison of the chunked tag against the raw url never ran.
        self.assertEquals(list(RecordIter(['tag://disco:test:chunk'])),
                          list(RecordIter([url], reader=None)))
        self.ddfs.delete('disco:test:chunk')

    def test_push(self):
        # Tags may be addressed bare or with an explicit tag:// scheme.
        self.ddfs.push('disco:test:blobs', [(StringIO('blobdata'), 'blobdata')])
        self.assert_(self.ddfs.exists('disco:test:blobs'))
        self.ddfs.push('tag://disco:test:blobs2', [(StringIO('blobdata'), 'blobdata')])
        self.assert_(self.ddfs.exists('disco:test:blobs2'))
        self.ddfs.delete('disco:test:blobs')
        self.assert_(not self.ddfs.exists('disco:test:blobs'))
        self.ddfs.delete('disco:test:blobs2')
        self.assert_(not self.ddfs.exists('disco:test:blobs2'))

    def test_tag(self):
        self.ddfs.tag('disco:test:tag', [['urls']])
        self.assert_(self.ddfs.exists('disco:test:tag'))
        self.ddfs.delete('disco:test:tag')
        self.assert_(not self.ddfs.exists('disco:test:tag'))
        self.ddfs.tag('tag://disco:test:tag', [['urls']])
        self.assert_(self.ddfs.exists('tag://disco:test:tag'))
        # Tagging an existing tag appends urls rather than replacing them.
        self.ddfs.tag('disco:test:tag', [['more_urls']])
        self.assertEquals(sorted(self.ddfs.get('disco:test:tag')['urls']),
                          sorted([['urls'], ['more_urls']]))
        self.ddfs.delete('tag://disco:test:tag')
        self.assert_(not self.ddfs.exists('tag://disco:test:tag'))

    def test_put(self):
        # Unlike tag(), put() replaces the url list wholesale.
        self.ddfs.put('disco:test:tag', [['urls']])
        self.assert_(self.ddfs.exists('disco:test:tag'))
        self.assertEquals(self.ddfs.get('disco:test:tag')['urls'], [['urls']])
        self.ddfs.put('disco:test:tag', [['tags']])
        self.assertEquals(self.ddfs.get('disco:test:tag')['urls'], [['tags']])
        self.ddfs.delete('tag://disco:test:tag')

    def test_delete(self):
        # Deleting a nonexistent tag must not raise.
        self.ddfs.delete('disco:test:notag')

    def tearDown(self):
        self.ddfs.delete('disco:test:notag')
        self.ddfs.delete('disco:test:tag')
        self.ddfs.delete('disco:test:blobs')
        self.ddfs.delete('disco:test:blobs2')
Ejemplo n.º 5
0
class DdfsGcTests(TestCase):
    """DDFS garbage collection must preserve data created in every mode
    (deterministic tag names, reused tags, delayed commits, tag-of-tags)."""

    def setUp(self):
        self.d = DDFS()
        # Don't seed data while a previous GC run is still in flight.
        wait_for_gc_to_finish(self.d)
        with open(FILE, 'w') as f:
            # Same bytes as the old `print >>f, ...` (text plus newline),
            # but valid syntax on both Python 2 and 3.
            f.write("hello world!\n")

    def _test_push(self, prefix, func):
        """Create COUNT tags with func, force a GC cycle, then verify each
        tag still resolves to exactly one non-empty blob (replica set)."""
        for i in range(COUNT):
            func(prefix + str(i), [FILE])
        self.d._download(self.d.master + "/ddfs/ctrl/gc_start")

        wait_for_gc_to_finish(self.d)
        for i in range(COUNT):
            blobs = list(self.d.blobs(prefix + str(i)))
            self.assertEquals(len(blobs), 1)
            self.assertGreater(len(blobs[0]), 0)

    def test_push_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.push)

    def test_push_same_tag(self):
        self._test_push(PREFIX, self.d.push)

    def test_chunk_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.chunk)

    def test_chunk_same_tag(self):
        self._test_push(PREFIX, self.d.chunk)

    def test_chunk_delayed(self):
        self._test_push(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_delayed(self):
        self._test_push(PREFIX, partial(self.d.push, delayed=True))

    def test_chunk_none_replicas(self):
        self._test_push(PREFIX, partial(self.d.chunk, replicas=None))

    def _test_func_tag(self, prefix, func):
        """Like _test_push, but also tags each result's blob set under
        '<name>tag' and verifies those derived tags survive GC too."""
        def chunk_tag(name, inputs):
            _, blob_set = func(name, inputs)
            self.d.tag(name + "tag", blob_set)
        # BUG FIX: the original pushed under the module-level PREFIX while
        # the verification loop below reads `prefix`; the two only agreed
        # because every caller happens to pass PREFIX.
        self._test_push(prefix, chunk_tag)

        for i in range(COUNT):
            blobs = list(self.d.blobs(prefix + str(i) + "tag"))
            self.assertEquals(len(blobs), 1)
            self.assertGreater(len(blobs[0]), 0)

    def test_chunk_tag(self):
        self._test_func_tag(PREFIX, self.d.chunk)

    def test_chunk_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_tag(self):
        self._test_func_tag(PREFIX, self.d.push)

    def test_push_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.push, delayed=True))

    def tearDown(self):
        # Drop every tag created under the test prefix.
        tags = self.d.list(PREFIX)
        for tag in tags:
            self.d.delete(tag)
Ejemplo n.º 6
0
class DdfsGcTests(TestCase):
    """DDFS garbage collection must preserve data created in every mode
    (deterministic tag names, reused tags, delayed commits, tag-of-tags)."""

    def setUp(self):
        self.d = DDFS()
        # Don't seed data while a previous GC run is still in flight.
        wait_for_gc_to_finish(self.d)
        with open(FILE, 'w') as f:
            # Same bytes as the old `print >> f, ...` (text plus newline),
            # but valid syntax on both Python 2 and 3.
            f.write("hello world!\n")

    def _test_push(self, prefix, func):
        """Create COUNT tags with func, force a GC cycle, then verify each
        tag still resolves to exactly one non-empty blob (replica set)."""
        for i in range(COUNT):
            func(prefix + str(i), [FILE])
        self.d._download(self.d.master + "/ddfs/ctrl/gc_start")

        wait_for_gc_to_finish(self.d)
        for i in range(COUNT):
            blobs = list(self.d.blobs(prefix + str(i)))
            self.assertEquals(len(blobs), 1)
            self.assertGreater(len(blobs[0]), 0)

    def test_push_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.push)

    def test_push_same_tag(self):
        self._test_push(PREFIX, self.d.push)

    def test_chunk_deterministic(self):
        self._test_push(PREFIX + str(uuid1()), self.d.chunk)

    def test_chunk_same_tag(self):
        self._test_push(PREFIX, self.d.chunk)

    def test_chunk_delayed(self):
        self._test_push(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_delayed(self):
        self._test_push(PREFIX, partial(self.d.push, delayed=True))

    def test_chunk_none_replicas(self):
        self._test_push(PREFIX, partial(self.d.chunk, replicas=None))

    def _test_func_tag(self, prefix, func):
        """Like _test_push, but also tags each result's blob set under
        '<name>tag' and verifies those derived tags survive GC too."""
        def chunk_tag(name, inputs):
            _, blob_set = func(name, inputs)
            self.d.tag(name + "tag", blob_set)

        # BUG FIX: the original pushed under the module-level PREFIX while
        # the verification loop below reads `prefix`; the two only agreed
        # because every caller happens to pass PREFIX.
        self._test_push(prefix, chunk_tag)

        for i in range(COUNT):
            blobs = list(self.d.blobs(prefix + str(i) + "tag"))
            self.assertEquals(len(blobs), 1)
            self.assertGreater(len(blobs[0]), 0)

    def test_chunk_tag(self):
        self._test_func_tag(PREFIX, self.d.chunk)

    def test_chunk_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.chunk, delayed=True))

    def test_push_tag(self):
        self._test_func_tag(PREFIX, self.d.push)

    def test_push_tag_delayed(self):
        self._test_func_tag(PREFIX, partial(self.d.push, delayed=True))

    def tearDown(self):
        # Drop every tag created under the test prefix.
        tags = self.d.list(PREFIX)
        for tag in tags:
            self.d.delete(tag)
Ejemplo n.º 7
0
class Docset(object):
    """
    A `Docset` represents a set of documents, contained in dump files stored on
    DDFS. Class instantiation alone doesn't do anything to DDFS; the DDFS tag
    for this docset won't exist until a dump is added.
    """

    def __init__(self, docset_name):
        """Bind the DDFS tag names for this docset; no DDFS I/O happens here."""
        # Tag holding the dump blobs themselves.
        self.ddfs_tag = docset_name
        # Tag holding pickled, versioned index blobs (see `index` / `save`).
        self.ddfs_index_tag = docset_name + ':index'
        # Tag for link files derived from this docset (used by callers).
        self.ddfs_link_file_tag = docset_name + ':links'
        self.ddfs = DDFS()
        # Cached index mapping; populated lazily on first `index` access.
        self.__index = None
        # True when in-memory index changes have not been save()d yet.
        self.dirty = False

    def exists(self):
        """Returns True if this Docset exists in DDFS."""
        return self.ddfs.exists(self.ddfs_tag)

    def delete(self):
        """
        Deletes this tag from DDFS. DDFS garbage collection will soon take care
        of dumps in this docset with no other tags. If other docsets link to
        this docset's dumps, then those dumps will remain.
        """
        self.ddfs.delete(self.ddfs_index_tag)
        self.ddfs.delete(self.ddfs_tag)

    # Zero-pad width for index version strings; padding keeps lexicographic
    # order equal to numeric order when selecting the newest index blob.
    INDEX_VERSION_PAD = 4
    @property
    def index(self):
        # Lazily load index data from DDFS.
        if self.__index is None:
            blobs = [uri for (uri,) in self.ddfs.blobs(self.ddfs_index_tag)]
            if len(blobs) == 0:
                # No index stored yet: start empty at version 0.
                self.__index = {}
                self.__index_version = 0
            else:
                # Find blob with highest version number.
                # Index blobs are pushed under their zero-padded version
                # string (see save()), so the "dump name" extracted here is
                # the version, and string sort order matches numeric order.
                ver, discouri = sorted([(self.__blob_uri_to_dump_name(uri), uri)
                                        for uri in blobs], reverse=True)[0]
                uri = urlresolve(discouri)
                data = urllib2.urlopen(uri).read()
                try:
                    self.__index = pickle.loads(data)
                    self.__index_version = int(ver)
                except EOFError:
                    raise EOFError("EOF reading docset index at %s in tag %s" % \
                                       (uri, self.ddfs_index_tag))
        return self.__index

    def save(self):
        """Pickle the current index and push it to DDFS as the next version."""
        self.index # load if hasn't been loaded yet
        self.__index_version += 1
        ver = "%0*d" % (self.INDEX_VERSION_PAD, self.__index_version)
        # Stage the pickle in a temp file so it can be pushed from disk.
        tmp_fname = os.path.join("/tmp/", "%s%s" % (self.ddfs_index_tag, ver))
        with open(tmp_fname, 'w+b') as f:
            pickle.dump(self.__index, f)
            f.flush()
            f.seek(0)
            # Blob is named by its padded version; `index` relies on this.
            self.ddfs.push(self.ddfs_index_tag, [(f, ver)])
        self.dirty = False

    def add_dump(self, dumpname, dump):
        """
        Adds a dump to this docset and indexes its documents by position,
        uploading the dump to DDFS with the tag for this docset.
        """
        # index positions
        startpos = 0
        endpos = None
        with open(dump, 'rb') as f:
            dociter = WARCParser(f)
            for doc in dociter:
                # NOTE(review): assumes dociter.tell() is the byte offset just
                # past the current document -- confirm against WARCParser.
                endpos = dociter.tell()
                self.index[doc.uri] = (dumpname, startpos, endpos - startpos)
                startpos = endpos
        self.ddfs.push(self.ddfs_tag, [(dump, dumpname)])
        self.dirty = True

    @property
    def doc_count(self):
        """
        Returns the total number of documents contained in all dumps in this
        docset.
        """
        return len(self.index)

    def doc_uris(self):
        """Returns all URIs of documents contained in all dumps in this
        docset."""
        return self.index.keys()

    def dump_uris(self):
        """
        Returns disco:// URIs for each dump in the docset. Use
        disco.util.urlresolve to convert the disco:// URIs to http:// URIs.
        """
        return (uri for (uri,) in self.ddfs.blobs(self.ddfs_tag))

    def __blob_uri_to_dump_name(self, bloburi):
        """
        Takes a blob URI like
           disco://host/ddfs/vol0/blob/b4/dumpname$4fd-ea750-6d4e1
        and returns "dumpname".
        """
        return re.search(r'/([\w0-9_\-@:]+)\$', bloburi).group(1)

    def __dump_name_to_blob_uri(self, dumpname):
        """
        Takes a dump name like "dumpname" and returns the blob URI like
           disco://host/ddfs/vol0/blob/b4/dumpname$000-11111-fffff

        Raises KeyError if no dump with that name exists.
        """
        # Linear scan over all dump uris; fine for small dump counts.
        for uri in self.dump_uris():
            if dumpname == self.__blob_uri_to_dump_name(uri):
                return uri
        raise KeyError

    def dump_names(self):
        """Returns the names of dumps in the docset."""
        return [self.__blob_uri_to_dump_name(uri) for uri in self.dump_uris()]

    def get_pos(self, uri):
        """Returns the tuple `(dump_name, start byte pos, size)` locating the
        document `uri` in the docset; raises DocumentNotFound if unknown."""
        if uri in self.index:
            return self.index[uri]
        else:
            raise DocumentNotFound()

    def get(self, uri):
        """Returns the `Document` with the specified `uri`."""
        name, startpos, size = self.get_pos(uri)
        try:
            dump_uri = urlresolve(self.__dump_name_to_blob_uri(name))
        except KeyError:
            raise DocumentNotFound("couldn't find doc with dump name '%s'" % name)

        # Fetch only the document's byte range from the dump blob.
        req = urllib2.Request(dump_uri)
        req.add_header("Range", "bytes=%d-%d" % (startpos, startpos + size - 1))
        res = urllib2.urlopen(req)
        return WARCParser(res).next()
Ejemplo n.º 8
0
 def runTest(self):
     # One blob per input must land under the job's results tag, and the
     # sorted result records must match the expected answers.
     sorted_records = sorted(list(self.results))
     ddfs = DDFS(self.disco_master_url)
     results_tag = self.disco.results(self.job.name)[1][0]
     self.assertEquals(len(list(ddfs.blobs(results_tag))), len(self.inputs))
     self.assertEquals(self.answers, sorted_records)