def __init__(self, *args, **kwargs):
    # load the defaults
    super(Settings, self).update(defaults)
    # override with the settings file
    path = kwargs.get('settings_file') or self['settings_file']
    if path and os.path.exists(path):
        try:
            import yaml
            self.update(yaml.load(open(path)))
        except Exception:
            pass  # fall back to the defaults if the settings file can't be read
    # final overrides
    super(Settings, self).update(overrides)
    super(Settings, self).__init__(*args, **kwargs)
    # set up ddfs and disco
    if not self['server'].startswith('disco://'):
        self['server'] = 'disco://' + self['server']
    if 'ddfs' not in self:
        self['ddfs'] = DDFS(self['server'])
    self['server'] = Disco(self['server'])
    # set up worker
    if 'worker' not in self:
        worker_mod, _, worker_class = self['worker_class'].rpartition('.')
        mod = __import__(worker_mod, {}, {}, worker_mod)
        self['worker'] = getattr(mod, worker_class)()
def load_oob(host, name, key):
    from disco.ddfs import DDFS
    ddfs = DDFS(host)
    # NB: this assumes that blobs are listed in LIFO order.
    # We want to return the latest version.
    for fd in ddfs.pull(ddfs.job_oob(name), blobfilter=lambda x: x == key):
        return fd.read()
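# Hedged usage sketch, not from the original sources: reading an out-of-band value
# written under a job's OOB tag. The master address and job name below are
# placeholder assumptions; load_oob returns the newest matching blob's contents,
# or None if no blob named 'stats' was ever saved.
def load_oob_example():
    host = 'disco://localhost'     # assumed master address
    jobname = 'ExampleJob'         # assumed job name
    return load_oob(host, jobname, 'stats')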
def delete_all():
    '''
    Deletes all tags in DDFS, thus orphaning all blobs and making them
    subject to eventual removal by the garbage collector.
    '''
    ddfs = DDFS()
    for tag in ddfs.list():
        ddfs.delete(tag)
def setUp(self):
    tag = 'disco:test:authjob'
    self.ddfs = DDFS(self.disco_master_url)
    pushed = self.ddfs.push(tag, [(StringIO('blobdata'), 'blob')])
    self.ddfs.setattr(tag, 'ddfs:read-token', 'r')
    self.input = ['tag://*****:*****@/' + tag]
    super(AuthJobTestCase, self).setUp()
def setUp(self):
    self.ddfs = DDFS(self.disco_master_url)
    self.ddfs.push('disco:test:authrd', [(StringIO('datablob'), 'blobdata')])
    self.ddfs.push('disco:test:authwr', [(StringIO('datablob'), 'blobdata')])
    self.ddfs.setattr('disco:test:authrd', 'a', 'v')
    self.ddfs.setattr('disco:test:authwr', 'a', 'v')
    self.ddfs.setattr('disco:test:authrd', 'ddfs:read-token', 'rdr')
    self.ddfs.setattr('disco:test:authwr', 'ddfs:write-token', 'wtr')
def setUp(self):
    self.ddfs = DDFS(self.disco_master_url)
    self.ddfs.push('disco:test:blobs', [(StringIO('datablob'), 'blobdata')])
    self.ddfs.push('disco:test:blobs', [(StringIO('datablob2'), 'blobdata2')])
    self.ddfs.push('disco:test:emptyblob', [(StringIO(''), 'empty')])
    self.ddfs.tag('disco:test:tag', [['urls']])
    self.ddfs.tag('disco:test:metatag',
                  [['tag://disco:test:tag'], ['tag://disco:test:metatag']])
def ddfs_save(blobs, name, master):
    from disco.ddfs import DDFS
    ddfs = DDFS(master)
    blobs = [(blob, 'discoblob:%s:%s' % (name, os.path.basename(blob)))
             for blob in blobs]
    tag = ddfs_name(name)
    ddfs.push(tag, blobs, retries=600, delayed=True, update=True)
    return "tag://%s" % tag
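# Hedged usage sketch, not from the original sources: pushing two local result
# files under a job-derived tag with ddfs_save. The file paths, job name, and
# master address are placeholder assumptions; ddfs_name() is assumed to be
# defined alongside ddfs_save. The return value is a 'tag://...' URL.
def ddfs_save_example():
    blobs = ['/tmp/results/part-0', '/tmp/results/part-1']   # assumed local files
    return ddfs_save(blobs, name='ExampleJob', master='disco://localhost')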
def get_disco_handle(server):
    from disco.core import Disco
    from disco.ddfs import DDFS
    if server and not server.startswith('disco://'):
        server = 'disco://' + server
    return Disco(server), DDFS(server)
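# Hedged usage sketch, not from the original sources: get_disco_handle accepts a
# bare host name, normalizes it to a 'disco://' URL, and returns paired job and
# storage handles for the same master. 'localhost' is a placeholder assumption.
def get_disco_handle_example():
    disco, ddfs = get_disco_handle('localhost')   # equivalent to 'disco://localhost'
    return ddfs.list()                            # e.g. enumerate all DDFS tags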
def main(file_in="iris.csv", file_out="centers.csv", n_clusters=3):
    # TODO: Rename tag data:kcluster1 if tag exists.
    # Disco v0.4.4 requires the './' prefix to identify the file as local.
    # http://disco.readthedocs.org/en/0.4.4/howto/chunk.html#chunking
    tag = "data:sort"
    DDFS().chunk(tag=tag, urls=['./' + file_in])
    try:
        # Import here since slave nodes do not share the master's namespace.
        from kcluster_map_reduce import KCluster
        job = KCluster().run(input=[tag], map_reader=chain_reader)
        with open(file_out, 'w') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_NONNUMERIC)
            for center in result_iterator(job.wait(show=True)):
                writer.writerow([center])
    finally:
        DDFS().delete(tag=tag)
    return None
def save_outputs(self, jobname, master=None):
    from disco.ddfs import DDFS

    def paths():
        for output in self.outputs.values():
            output.file.close()
            yield output.path

    self.send('OUTPUT', [DDFS(master).save(jobname, paths()), 'tag'])
def push_by_tag(file_paths, tag=None):
    '''
    Push files to DDFS. If no tag is given, each file is pushed under a tag
    derived from its base name; otherwise all files share the given tag.
    '''
    ddfs = DDFS()
    if tag is None:
        for file_path in file_paths:
            tag = file_path.split("/")[-1].split(".")[0]
            ddfs.push(tag, [file_path])
    else:
        ddfs.push(tag, file_paths)
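# Hedged usage sketch, not from the original sources: the file paths below are
# placeholder assumptions. Without an explicit tag, each file is pushed under a
# tag derived from its base name ('events' and 'users' here); with tag= given,
# both files are pushed under the single shared tag.
def push_by_tag_example():
    push_by_tag(['/data/events.csv', '/data/users.csv'])                  # tags 'events', 'users'
    push_by_tag(['/data/events.csv', '/data/users.csv'], tag='data:all')  # one shared tag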
def inputexpand(input, label=None, settings=DiscoSettings()):
    from disco.ddfs import DDFS, istag
    if ispartitioned(input) and label is not False:
        return zip(*(parse_dir(i, label=label) for i in iterify(input)))
    if isiterable(input):
        return [inputlist(input, label=label, settings=settings)]
    if istag(input):
        ddfs = DDFS(settings=settings)
        return chainify(blobs for name, tags, blobs in ddfs.findtags(input))
    return [input]
def load(file_in, tag):
    """
    Load a file into Disco.
    """
    # If the Disco tag already exists, delete it so that new data
    # isn't appended to an existing tag.
    if DDFS().exists(tag=tag):
        print("WARNING: Overwriting Disco tag {tag}.".format(tag=tag), file=sys.stderr)
        DDFS().delete(tag=tag)
    # Load data into the Disco Distributed File System.
    print("Loading into Disco:\n{file_in}\nunder tag\n{tag}".format(
        file_in=file_in, tag=tag))
    try:
        DDFS().chunk(tag=tag, urls=[os.path.join('./', file_in)])
    except ValueError as err:
        print("ValueError: " + str(err), file=sys.stderr)
        print("File: {file_in}".format(file_in=file_in), file=sys.stderr)
    return None
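# Hedged usage sketch, not from the original sources: (re)loading a local file
# under a Disco tag with load() above. The file name and tag are placeholder
# assumptions; any existing tag is deleted first so the tag refers only to the
# newly chunked data.
def load_example():
    load(file_in='iris.csv', tag='data:iris')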
def urllist(url, partid=None, listdirs=True, ddfs=None):
    from disco.ddfs import DDFS, istag
    if istag(url):
        token = auth_token(url)
        ret = []
        for name, tags, blobs in DDFS(ddfs).findtags(url, token=token):
            ret += blobs
        return ret
    if isiterable(url):
        return [list(url)]
    scheme, netloc, path = urlsplit(url)
    if scheme == 'dir' and listdirs:
        return parse_dir(url, partid=partid)
    return [url]
def main_load(args):
    """
    Stage of main function for loading individual files:
    - Download bz2 file if it doesn't exist.
    - Decompress and partition bz2 file if it doesn't exist.
    - Load data into Disco Distributed File System if it doesn't exist.
    """
    df_bz2urls_filetags = args.df_concat.dropna(subset=['bz2url', 'filetag'])
    # Download bz2 file if it doesn't exist.
    # TODO: parallelize, see "Programming Python" on threads.
    # Quick hack: use Popen with wget to download.
    for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag']].itertuples():
        fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
        if os.path.isfile(fbz2):
            if args.verbose >= 2:
                print(("INFO: Skipping download. File already exists:\n"
                       + " {fbz2}").format(fbz2=fbz2))
        else:
            if args.verbose >= 1:
                print(("INFO: Downloading:\n {url}\n to:\n {fout}").format(
                    url=bz2url, fout=fbz2))
            try:
                download(url=bz2url, fout=fbz2)
            except Exception:
                ErrMsg().eprint(err=sys.exc_info())
    # Decompress and partition bz2 file if it doesn't exist.
    # TODO: parallelize, see "Programming Python" on threads.
    # Quick hack: use Popen with "bunzip2 --keep" and "grep -oE '.{1,1000}' fname" to partition.
    for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag']].itertuples():
        fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
        fdecom = os.path.splitext(fbz2)[0]
        if os.path.isfile(fdecom):
            if args.verbose >= 2:
                print(("INFO: Skipping decompress and partition."
                       + " File already exists:\n {fdecom}").format(fdecom=fdecom))
        else:
            if args.verbose >= 1:
                print(("INFO: Decompressing and partitioning:\n"
                       + " {fbz2}\n to:\n {fout}").format(fbz2=fbz2, fout=fdecom))
            try:
                decom_part(fbz2=fbz2, fout=fdecom)
            except Exception:
                ErrMsg().eprint(err=sys.exc_info())
    # Load data into Disco Distributed File System if it doesn't exist.
    cmds = []
    for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag']].itertuples():
        fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
        fdecom = os.path.splitext(fbz2)[0]
        if DDFS().exists(tag=filetag):
            if args.verbose >= 2:
                print(("INFO: Skipping Disco upload."
                       + " Tag already exists:\n {tag}.").format(tag=filetag))
        else:
            if args.verbose >= 1:
                print(("INFO: Loading into Disco:\n"
                       + " {fdecom}\n under tag:\n {tag}").format(fdecom=fdecom,
                                                                  tag=filetag))
            cmd = ("ddfs chunk {tag} {url}").format(tag=filetag,
                                                    url=os.path.join('./', fdecom))
            cmds.append(cmd)
    # TODO: parallelize using the Python API rather than the shell,
    # see "Programming Python" on threads:
    # try: DDFS().chunk(tag=filetag, urls=[os.path.join('./', fdecom)])
    try:
        processes = [Popen(cmd, shell=True) for cmd in cmds]
        for proc in processes:
            proc.wait()
    except Exception:
        ErrMsg().eprint(err=sys.exc_info())
    return None
def ddfs(self):
    from disco.ddfs import DDFS
    return DDFS(settings=self.settings)
    else:
        # print url, rest
        fle = util.localize(rest,
                            disco_data=worker.Task.disco_data,
                            ddfs_data=worker.Task.ddfs_data)
        yield url, fle

def copy_tags_map((url, local_file), params):
    from disco.ddfs import DDFS
    from disco.comm import request
    from tempfile import NamedTemporaryFile
    from socket import gethostname
    try:
        ddfs = DDFS(params.target_disco_master)
        if params.chunk:
            ddfs.chunk(params.target_tag, [local_file])
        else:
            ddfs.push(params.target_tag, [local_file])
        print "pushed local: %s" % local_file
    except Exception as e:
        # We couldn't push the local file for whatever reason;
        # try downloading the URL, then pushing.
        try:
            blob_req = request('GET', url)
            with NamedTemporaryFile("w", delete=True) as fd:
                fd.write(blob_req.read())
                fd.flush()
                ddfs = DDFS(params.target_disco_master)
                if params.chunk:
                    ddfs.chunk(params.target_tag, [fd.name])
def save_oob(host, name, key, value, ddfs_token=None):
    from disco.ddfs import DDFS
    DDFS(host).push(ddfs_oobname(name), [(StringIO(value), key)], delayed=True)
def ddfs(self):
    from disco.ddfs import DDFS
    return DDFS(master=self.settings['DISCO_MASTER'])
def writetoken(program, tag, tok):
    """Usage: [-t token] tag token

    Set the write token of a tag.
    """
    program.ddfs.setattr(tag, 'ddfs:write-token', tok,
                         token=program.options.token)

@DDFS.command
def xcat(program, *urls):
    """Usage: [-i] [-p] [-R reader] [-t token] [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags, the blobs reachable from the tags
    will be printed after any non-tag url[s].
    """
    from disco.core import RecordIter
    from disco.util import iterify, reify
    tags, urls = program.separate_tags(*urls)
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    for record in RecordIter(chain(urls, program.blobs(*tags)),
                             input_stream=stream,
                             reader=reader):
        print '\t'.join('%s' % (e,) for e in iterify(record)).rstrip()

if __name__ == '__main__':
    DDFS(option_parser=DDFSOptionParser()).main()
def setUp(self):
    self.ddfs = DDFS(self.disco_master_url)
    self.ddfs.push('disco:test:attrs', [(StringIO('datablob'), 'blobdata')])
    self.ddfs.setattr('disco:test:attrs', 'a1', 'v1')
    self.ddfs.setattr('disco:test:attrs', 'a2', 'v2')
def setUp(self):
    self.ddfs = DDFS(self.disco_master_url)
from discodex import settings
from discodex.mapreduce import (Indexer, DiscoDBIterator)
from discodex.objects import (DataSet, IChunks, Indices, Index, Results, Dict)

from disco.core import Disco
from disco.ddfs import DDFS
from disco.error import DiscoError
from disco.util import flatten, parse_dir

discodex_settings = settings.DiscodexSettings()
disco_master_url = discodex_settings['DISCODEX_DISCO_MASTER']
disco_prefix = discodex_settings['DISCODEX_DISCO_PREFIX']
index_prefix = discodex_settings['DISCODEX_INDEX_PREFIX']
purge_file = discodex_settings['DISCODEX_PURGE_FILE']
disco_master = Disco(disco_master_url)
ddfs = DDFS(disco_master_url)

NOT_FOUND, OK, ACTIVE, DEAD = 'unknown job', 'ready', 'active', 'dead'

class IndexCollection(Collection):
    allowed_methods = ('GET', 'POST')

    def delegate(self, request, *args, **kwargs):
        name = str(kwargs.pop('name'))
        return IndexResource(name)(request, *args, **kwargs)

    @property
    def names(self):
        return ddfs.list(index_prefix)
def ddfs(self):
    return DDFS(settings=self.settings)
def setUp(self):
    self.d = DDFS()
    wait_for_gc_to_finish(self.d)
    with open(FILE, 'w') as f:
        print >> f, "hello world!"
def main_sets(args):
    """
    Stage of main function for packing individual files into data sets.
    - Sort filetags by size in descending order.
    - Add filetags to a dataset as long as they can fit.
    - Label the dataset with the actual dataset size.
    - Append data to settag from filetags in DDFS.
    - Note: All 'filetag' tags must already be loaded.
    """
    df_bz2urls_filetags = args.df_concat.dropna(subset=['bz2url', 'filetag'])
    bytes_per_gb = 10**9
    filetag_sizegb_map = {}
    # If available, use the checked, downloaded data from Disco to verify dataset
    # sizes; otherwise use the decompressed files from before the Disco upload.
    if args.check_filetags:
        # idx variables are unused.
        for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag']].itertuples():
            ftag = os.path.join(args.data_dir, filetag + '.txt')
            ftag_sizegb = os.path.getsize(ftag) / bytes_per_gb
            filetag_sizegb_map[filetag] = ftag_sizegb
    else:
        # idx variables are unused.
        for (idx, bz2url, filetag) in df_bz2urls_filetags[['bz2url', 'filetag']].itertuples():
            fbz2 = os.path.join(args.data_dir, os.path.basename(bz2url))
            fdecom = os.path.splitext(fbz2)[0]
            fdecom_sizegb = os.path.getsize(fdecom) / bytes_per_gb
            filetag_sizegb_map[filetag] = fdecom_sizegb
    # Sort filetags by size in descending order.
    # Add filetags to a dataset as long as they can fit. Nest the data sets.
    filetag_sizegb_sorted = sorted(filetag_sizegb_map.iteritems(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
    settag_filetags_map = {}
    is_first = True
    for size in sorted(args.sets_gb):
        filetags = []
        tot = 0.
        res = size
        # Include smaller data sets in the next larger dataset.
        if not is_first:
            filetags.extend(settag_filetags_map[prev_settag])
            tot += prev_tot
            res -= prev_tot
        for (filetag, sizegb) in filetag_sizegb_sorted:
            if (sizegb <= res) and (filetag not in filetags):
                filetags.append(filetag)
                tot += sizegb
                res -= sizegb
        # Label the dataset with the actual dataset size.
        # Note: Disco tags must match the character class [A-Za-z0-9_\-@:]+,
        # otherwise DDFS raises a CommError.
        settag = ("{tot:.2f}GB".format(tot=tot)).replace('.', '-')
        settag_filetags_map[settag] = filetags
        # Include the smaller data set in the next larger dataset.
        prev_tot = tot
        prev_settag = settag
        is_first = False
    # Append data to settag from filetags in DDFS.
    # TODO: use logging.
    for settag in sorted(settag_filetags_map):
        if DDFS().exists(tag=settag):
            if args.verbose >= 2:
                print(("INFO: Skipping Disco upload."
                       + " Tag already exists:\n {tag}.").format(tag=settag))
        else:
            if args.verbose >= 1:
                print(("INFO: Appending data to settag from filetags:\n"
                       + " {settag}\n"
                       + " {filetags}").format(settag=settag,
                                               filetags=settag_filetags_map[settag]))
            for filetag in settag_filetags_map[settag]:
                try:
                    filetag_urls = DDFS().urls(filetag)
                    DDFS().tag(settag, filetag_urls)
                except Exception:
                    ErrMsg().eprint(err=sys.exc_info())
    return None
def save_oob(host, name, key, value, ddfs_token=None):
    from disco.ddfs import DDFS
    DDFS(host).push(DDFS.job_oob(name), [(BytesIO(value), key)], delayed=True)
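# Hedged usage note, not from the original sources: unlike the StringIO variant of
# save_oob earlier in this collection, this one wraps the value in BytesIO, so the
# value must be bytes. The master address and job name are placeholder assumptions.
def save_oob_bytes_example():
    save_oob('disco://localhost', 'ExampleJob', 'stats',
             '{"records": 42}'.encode('utf-8'))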
def _map_input_stream(fd, size, url, params):
    from disco.ddfs import DDFS
    tag = params or 'disco:chunks:%s' % Task.jobname
    yield url, DDFS(Task.master).chunk(tag, [url])
def ddfs(self):
    from disco.ddfs import DDFS
    return DDFS(self.master)
def _map_input_stream(fd, size, url, params):
    from disco.ddfs import DDFS
    from disco.func import gzip_line_reader
    tag = params or 'disco:chunks:%s' % Task.jobname
    yield url, DDFS(Task.master).chunk(tag, [url], reader=gzip_line_reader)