Example #1
    def __init__(self, *args, **kwargs):
        # load the defaults
        super(Settings, self).update(defaults)

        # override with the settings file
        path = kwargs.get('settings_file') or self['settings_file']
        if path and os.path.exists(path):
            try:
                import yaml
                self.update(yaml.load(open(path)))
            except:
                pass  # if ya can't ya can't

        # final overrides
        super(Settings, self).update(overrides)
        super(Settings, self).__init__(*args, **kwargs)

        # set up ddfs and disco
        if not self['server'].startswith('disco://'):
            self['server'] = 'disco://' + self['server']

        if 'ddfs' not in self:
            self['ddfs'] = DDFS(self['server'])
        self['server'] = Disco(self['server'])

        # set up worker
        if 'worker' not in self:
            worker_mod, _, worker_class = self['worker_class'].rpartition('.')
            mod = __import__(worker_mod, {}, {}, worker_mod)
            self['worker'] = getattr(mod, worker_class)()
Example #2
def load_oob(host, name, key):
    from disco.ddfs import DDFS
    ddfs = DDFS(host)
    # NB: this assumes that blobs are listed in LIFO order.
    # We want to return the latest version
    for fd in ddfs.pull(ddfs.job_oob(name), blobfilter=lambda x: x == key):
        return fd.read()
Example #3
def delete_all():
    '''
    Deletes all tags in DDFS, thus orphaning all blobs and making them subject
    to eventual removal by the garbage collector.
    '''
    ddfs = DDFS()
    for tag in ddfs.list():
        ddfs.delete(tag)
Example #4
 def setUp(self):
     tag = 'disco:test:authjob'
     self.ddfs = DDFS(self.disco_master_url)
     pushed = self.ddfs.push(tag, [(StringIO('blobdata'), 'blob')])
     self.ddfs.setattr(tag, 'ddfs:read-token', 'r')
     self.input = ['tag://*****:*****@/' + tag]
     super(AuthJobTestCase, self).setUp()
Example #5
 def setUp(self):
     self.ddfs = DDFS(self.disco_master_url)
     self.ddfs.push('disco:test:authrd', [(StringIO('datablob'), 'blobdata')])
     self.ddfs.push('disco:test:authwr', [(StringIO('datablob'), 'blobdata')])
     self.ddfs.setattr('disco:test:authrd', 'a', 'v')
     self.ddfs.setattr('disco:test:authwr', 'a', 'v')
     self.ddfs.setattr('disco:test:authrd', 'ddfs:read-token', 'rdr')
     self.ddfs.setattr('disco:test:authwr', 'ddfs:write-token', 'wtr')
Example #6
 def setUp(self):
     self.ddfs = DDFS(self.disco_master_url)
     self.ddfs.push('disco:test:blobs', [(StringIO('datablob'), 'blobdata')])
     self.ddfs.push('disco:test:blobs', [(StringIO('datablob2'), 'blobdata2')])
     self.ddfs.push('disco:test:emptyblob', [(StringIO(''), 'empty')])
     self.ddfs.tag('disco:test:tag', [['urls']])
     self.ddfs.tag('disco:test:metatag',
                   [['tag://disco:test:tag'], ['tag://disco:test:metatag']])
Example #7
def ddfs_save(blobs, name, master):
    from disco.ddfs import DDFS
    ddfs = DDFS(master)
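    # Label each blob as 'discoblob:<name>:<basename>' before pushing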
    blobs = [(blob, ('discoblob:%s:%s' % (name, os.path.basename(blob))))
             for blob in blobs]
    tag = ddfs_name(name)
    ddfs.push(tag, blobs, retries=600, delayed=True, update=True)
    return "tag://%s" % tag
Example #8
def get_disco_handle(server):
    from disco.core import Disco
    from disco.ddfs import DDFS

    if server and not server.startswith('disco://'):
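        # prepend the disco:// scheme when it is missing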
        server = 'disco://' + server

    return Disco(server), DDFS(server)
Example #9
def main(file_in="iris.csv", file_out="centers.csv", n_clusters=3):
    # TODO: Rename tag data:kcluster1 if tag exists.
    # Disco v0.4.4 requires the './' prefix on the file to identify it as a local file.
    # http://disco.readthedocs.org/en/0.4.4/howto/chunk.html#chunking
    tag = "data:sort"
    DDFS().chunk(tag=tag, urls=['./'+file_in])
    try:
        # Import since slave nodes do not have same namespace as master
        from kcluster_map_reduce import KCluster
        job = KCluster().run(input=[tag], map_reader=chain_reader)
        with open(file_out, 'w') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_NONNUMERIC)
            for center in result_iterator(job.wait(show=True)):
                writer.writerow([center])
    finally:
        DDFS().delete(tag=tag)
    return None
Example #10
    def save_outputs(self, jobname, master=None):
        from disco.ddfs import DDFS

        def paths():
            for output in self.outputs.values():
                output.file.close()
                yield output.path

        self.send('OUTPUT', [DDFS(master).save(jobname, paths()), 'tag'])
Example #11
def push_by_tag(file_paths, tag=None):
    '''
    Push files into DDFS. If no tag is given, derive a tag for each file from its base name.
    '''
    ddfs = DDFS()
    if tag is None:
        for file_path in file_paths:
            tag = file_path.split("/")[-1].split(".")[0]
            ddfs.push(tag, [file_path])
    else:
        ddfs.push(tag, file_paths)
Example #12
def inputexpand(input, label=None, settings=DiscoSettings()):
    from disco.ddfs import DDFS, istag
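    # Partitioned inputs expand per label; tag inputs expand to their reachable blobs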
    if ispartitioned(input) and label is not False:
        return zip(*(parse_dir(i, label=label) for i in iterify(input)))
    if isiterable(input):
        return [inputlist(input, label=label, settings=settings)]
    if istag(input):
        ddfs = DDFS(settings=settings)
        return chainify(blobs for name, tags, blobs in ddfs.findtags(input))
    return [input]
Example #13
def load(file_in, tag):
    """
    Load file into Disco.
    """

    # If Disco tag exists, delete it.
    # Don't add all-new data to an already existing tag.
    if DDFS().exists(tag=tag):
        print("WARNING: Overwriting Disco tag {tag}.".format(tag=tag),
              file=sys.stderr)
        DDFS().delete(tag=tag)

    # Load data into Disco Distributed File System.
    print("Loading into Disco:\n{file_in}\nunder tag\n{tag}".format(
        file_in=file_in, tag=tag))
    try:
        DDFS().chunk(tag=tag, urls=[os.path.join('./', file_in)])
    except ValueError as err:
        print("ValueError: " + err.message, file=sys.stderr)
        print("File: {file_in}".format(file_in=file_in), file=sys.stderr)

    return None
Example #14
def urllist(url, partid=None, listdirs=True, ddfs=None):
    from disco.ddfs import DDFS, istag
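    # A tag url expands to all blobs reachable from that tag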
    if istag(url):
        token = auth_token(url)
        ret = []
        for name, tags, blobs in DDFS(ddfs).findtags(url, token=token):
            ret += blobs
        return ret
    if isiterable(url):
        return [list(url)]
    scheme, netloc, path = urlsplit(url)
    if scheme == 'dir' and listdirs:
        return parse_dir(url, partid=partid)
    return [url]
Example #15
def main_load(args):
    """
    Stage of main function for loading individual files:
    - Download bz2 file if it doesn't exist.
    - Decompress and partition bz2 file if it doesn't exist.
    - Load data into Disco Distributed File System if it doesn't exist.
    """
    df_bz2urls_filetags = args.df_concat.dropna(subset=['bz2url', 'filetag'])
    # Download bz2 file if it doesn't exist.
    # TODO: parallelize, see "programming python" on threads
    # quick hack: use Popen with wget to download
    for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag'
                                                       ]].itertuples():
        fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
        if os.path.isfile(fbz2):
            if args.verbose >= 2:
                print(("INFO: Skipping download. File already exists:\n {fbz2}"
                       ).format(fbz2=fbz2))
        else:
            if args.verbose >= 1:
                print(("INFO: Downloading:\n {url}\n to:\n {fout}").format(
                    url=bz2url, fout=fbz2))

            try:
                download(url=bz2url, fout=fbz2)
            except:
                ErrMsg().eprint(err=sys.exc_info())
    # Decompress and partition bz2 file if it doesn't exist.
    # TODO: parallelize, see "programming python" on threads
    # quick hack: use Popen with "bunzip2 --keep" and "grep -oE '.{1,1000}' fname" to partition
    for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag'
                                                       ]].itertuples():
        fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
        fdecom = os.path.splitext(fbz2)[0]
        if os.path.isfile(fdecom):
            if args.verbose >= 2:
                print(
                    ("INFO: Skipping decompress and partition." +
                     " File already exists:\n {fdecom}").format(fdecom=fdecom))
        else:
            if args.verbose >= 1:
                print(("INFO: Decompressing and partitioning:\n" +
                       " {fbz2}\n to:\n {fout}").format(fbz2=fbz2,
                                                        fout=fdecom))
            try:
                decom_part(fbz2=fbz2, fout=fdecom)
            except:
                ErrMsg().eprint(err=sys.exc_info())
    # Load data into Disco Distributed File System if it doesn't exist.
    cmds = []
    for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag'
                                                       ]].itertuples():
        fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
        fdecom = os.path.splitext(fbz2)[0]
        if DDFS().exists(tag=filetag):
            if args.verbose >= 2:
                print(("INFO: Skipping Disco upload." +
                       " Tag already exists:\n {tag}.").format(tag=filetag))
        else:
            if args.verbose >= 1:
                print(("INFO: Loading into Disco:\n" +
                       " {fdecom}\n under tag:\n {tag}").format(fdecom=fdecom,
                                                                tag=filetag))
            cmd = ("ddfs chunk {tag} {url}").format(tag=filetag,
                                                    url=os.path.join(
                                                        './', fdecom))
            cmds.append(cmd)
            # TODO: parallelize using Python API rather than system, see "programming python" on threads
            # try: DDFS().chunk(tag=filetag, urls=[os.path.join('./', fdecom)])
    try:
        processes = [Popen(cmd, shell=True) for cmd in cmds]
        for proc in processes:
            proc.wait()
    except:
        ErrMsg().eprint(err=sys.exc_info())
    return None
Example #16
 def ddfs(self):
     from disco.ddfs import DDFS
     return DDFS(settings=self.settings)
Example #17
    else:
        # print url, rest
        fle = util.localize(rest,
                            disco_data=worker.Task.disco_data,
                            ddfs_data=worker.Task.ddfs_data)

        yield url, fle


def copy_tags_map((url, local_file), params):
    from disco.ddfs import DDFS
    from disco.comm import request
    from tempfile import NamedTemporaryFile
    from socket import gethostname
    try:
        ddfs = DDFS(params.target_disco_master)
        if params.chunk:
            ddfs.chunk(params.target_tag, [local_file])
        else:
            ddfs.push(params.target_tag, [local_file])
        print "pushed local: %s" % local_file
    except Exception as e:
        # we couldn't push the local file for whatever reason, let's try downloading the URL, then pushing
        try:
            blob_req = request('GET', url)
            with NamedTemporaryFile("w", delete=True) as fd:
                fd.write(blob_req.read())
                fd.flush()
                ddfs = DDFS(params.target_disco_master)
                if params.chunk:
                    ddfs.chunk(params.target_tag, [fd.name])
Example #18
def save_oob(host, name, key, value, ddfs_token=None):
    from disco.ddfs import DDFS
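    # Store the value as a blob named <key> under the job's out-of-band tag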
    DDFS(host).push(ddfs_oobname(name), [(StringIO(value), key)], delayed=True)
Example #19
 def ddfs(self):
     from disco.ddfs import DDFS
     return DDFS(master=self.settings['DISCO_MASTER'])
Example #20
def writetoken(program, tag, tok):
    """Usage: [-t token] tag token

    Set the write token of a tag.
    """
    program.ddfs.setattr(tag, 'ddfs:write-token', tok, token=program.options.token)

@DDFS.command
def xcat(program, *urls):
    """Usage: [-i] [-p] [-R reader] [-t token] [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from disco.core import RecordIter
    from disco.util import iterify, reify

    tags, urls = program.separate_tags(*urls)
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)

    for record in RecordIter(chain(urls, program.blobs(*tags)),
                             input_stream=stream,
                             reader=reader):
        print '\t'.join('%s' % (e,) for e in iterify(record)).rstrip()

if __name__ == '__main__':
    DDFS(option_parser=DDFSOptionParser()).main()
Example #21
 def setUp(self):
     self.ddfs = DDFS(self.disco_master_url)
     self.ddfs.push('disco:test:attrs', [(StringIO('datablob'), 'blobdata')])
     self.ddfs.setattr('disco:test:attrs', 'a1', 'v1')
     self.ddfs.setattr('disco:test:attrs', 'a2', 'v2')
Example #22
 def setUp(self):
     self.ddfs = DDFS(self.disco_master_url)
Example #23
from discodex import settings
from discodex.mapreduce import (Indexer, DiscoDBIterator)
from discodex.objects import (DataSet, IChunks, Indices, Index, Results, Dict)

from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import flatten, parse_dir

discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file = discodex_settings['DISCODEX_PURGE_FILE']
disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'


class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)
Example #24
 def ddfs(self):
     return DDFS(settings=self.settings)
Example #25
 def setUp(self):
     self.d = DDFS()
     wait_for_gc_to_finish(self.d)
     with open(FILE, 'w') as f:
         print >> f, "hello world!"
Example #26
def main_sets(args):
    """
    Stage of main function for packing individual files into data sets.
    - Sort filetags by size in descending order.
    - Add filetags to a dataset as long as they can fit.
    - Label the dataset with the actual dataset size.
    - Append data to settag from filetags in DDFS.
    - Note: Must have all 'filetag' loaded.
    """
    df_bz2urls_filetags = args.df_concat.dropna(subset=['bz2url', 'filetag'])
    bytes_per_gb = 10**9
    filetag_sizegb_map = {}
    # If it exists, use checked, downloaded data from Disco to verify dataset sizes,
    # otherwise use decompressed files prior to Disco upload.
    if args.check_filetags:
        # idx variables are unused.
        for (idx, bz2url,
             filetag) in df_bz2urls_filetags[['bz2url',
                                              'filetag']].itertuples():
            ftag = os.path.join(args.data_dir, filetag + '.txt')
            ftag_sizegb = os.path.getsize(ftag) / bytes_per_gb
            filetag_sizegb_map[filetag] = ftag_sizegb
    else:
        # idx variables are unused.
        for (idx, bz2url,
             filetag) in df_bz2urls_filetags[['bz2url',
                                              'filetag']].itertuples():
            fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
            fdecom = os.path.splitext(fbz2)[0]
            fdecom_sizegb = os.path.getsize(fdecom) / bytes_per_gb
            filetag_sizegb_map[filetag] = fdecom_sizegb
    # Sort filetags by size in descending order.
    # Add filetags to a dataset as long as they can fit. Nest the data sets.
    filetag_sizegb_sorted = sorted(filetag_sizegb_map.iteritems(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
    settag_filetags_map = {}
    is_first = True
    for size in sorted(args.sets_gb):
        filetags = []
        tot = 0.
        res = size
        # Include smaller data sets in the next larger dataset.
        if not is_first:
            filetags.extend(settag_filetags_map[prev_settag])
            tot += prev_tot
            res -= prev_tot
        for (filetag, sizegb) in filetag_sizegb_sorted:
            if (sizegb <= res) and (filetag not in filetags):
                filetags.append(filetag)
                tot += sizegb
                res -= sizegb
        # Label the dataset with the actual dataset size.
        # Note: Disco tags must have character class [A-Za-z0-9_\-@:]+ else get CommError.
        settag = ("{tot:.2f}GB".format(tot=tot)).replace('.', '-')
        settag_filetags_map[settag] = filetags
        # Include the smaller data set in the next larger dataset.
        prev_tot = tot
        prev_settag = settag
        is_first = False
    # Append data to settag from filetags in DDFS.
    # TODO: use logging.
    for settag in sorted(settag_filetags_map):
        if DDFS().exists(tag=settag):
            if args.verbose >= 2:
                print(("INFO: Skipping Disco upload." +
                       " Tag already exists:\n {tag}.").format(tag=settag))
        else:
            if args.verbose >= 1:
                print(
                    ("INFO: Appending data to settag from filetags:\n" +
                     " {settag}\n" + " {filetags}").format(
                         settag=settag, filetags=settag_filetags_map[settag]))
            for filetag in settag_filetags_map[settag]:
                try:
                    filetag_urls = DDFS().urls(filetag)
                    DDFS().tag(settag, filetag_urls)
                except:
                    ErrMsg().eprint(err=sys.exc_info())
    return None
Example #27
def save_oob(host, name, key, value, ddfs_token=None):
    from disco.ddfs import DDFS
    DDFS(host).push(DDFS.job_oob(name), [(BytesIO(value), key)], delayed=True)
Example #28
 def _map_input_stream(fd, size, url, params):
     from disco.ddfs import DDFS
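     # Chunk the input url into DDFS under the given tag (or a per-job default)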
     tag = params or 'disco:chunks:%s' % Task.jobname
     yield url, DDFS(Task.master).chunk(tag, [url])
Example #29
 def ddfs(self):
     from disco.ddfs import DDFS
     return DDFS(self.master)
Example #30
 def _map_input_stream(fd, size, url, params):
     from disco.ddfs import DDFS
     from disco.func import gzip_line_reader
     tag = params or 'disco:chunks:%s' % Task.jobname
     yield url, DDFS(Task.master).chunk(tag, [url], reader=gzip_line_reader)