def chunk_testdata():
    """Chunk the test datasets shipped with discomll into DDFS.

    Each dataset file found under the ``discomll/datasets/`` directory of the
    installed ``discomll`` package is chunked into DDFS under its ``test:*``
    tag. The path of every file is printed before it is chunked.

    The DDFS client is created with no arguments, so it presumably talks to
    the default Disco master configured for the environment -- confirm
    against the Disco settings in use.

    Raises:
        IOError: if one of the dataset files cannot be opened for reading.
    """
    import discomll
    from disco import ddfs

    # Directory holding the bundled datasets: strip the last two components
    # of the package's __file__ and descend into discomll/datasets/ (the
    # trailing empty component yields a trailing "/").
    path = "/".join(
        discomll.__file__.split("/")[:-2] + ["discomll", "datasets", ""])

    # (DDFS tag, dataset file name) pairs; keeping each tag next to its file
    # prevents the two from drifting out of sync.
    datasets = [
        ("test:breast_cancer_cont", "breast_cancer_wisconsin_cont.txt"),
        ("test:breast_cancer_cont_test",
         "breast_cancer_wisconsin_cont_test.txt"),
        ("test:breast_cancer_disc", "breast_cancer_wisconsin_disc.txt"),
        ("test:breast_cancer_disc_test",
         "breast_cancer_wisconsin_disc_test.txt"),
        ("test:ex3", "ex3.txt"),
        ("test:ex3_test", "ex3_test.txt"),
        ("test:ex4", "ex4.txt"),
        ("test:iris", "iris.txt"),
        ("test:iris_test", "iris_test.txt"),
        ("test:regression_data1", "regression_data1.txt"),
        ("test:regression_data2", "regression_data2.txt"),
        ("test:regression_data_test1", "regression_data_test1.txt"),
        ("test:regression_data_test2", "regression_data_test2.txt"),
    ]

    # Use a distinct name so the imported ``ddfs`` module is not shadowed.
    client = ddfs.DDFS()

    for tag, filename in datasets:
        # Open the file only to fail early if it is missing or unreadable;
        # the ``with`` block guarantees the handle is closed again (the
        # previous code leaked one open file per dataset).
        with open(path + filename, "r") as dataset_file:
            dataset_path = dataset_file.name

        # Single-argument print(...) behaves the same as the print statement
        # under Python 2 and also parses under Python 3.
        print(dataset_path)
        client.chunk(tag, [dataset_path])
fle = util.localize(rest, disco_data=worker.Task.disco_data, ddfs_data=worker.Task.ddfs_data) yield url, fle def copy_tags_map((url, local_file), params): from disco.ddfs import DDFS from disco.comm import request from tempfile import NamedTemporaryFile from socket import gethostname try: ddfs = DDFS(params.target_disco_master) if params.chunk: ddfs.chunk(params.target_tag, [local_file]) else: ddfs.push(params.target_tag, [local_file]) print "pushed local: %s" % local_file except Exception as e: # we couldn't push the local file for whatever reason, let's try downloading the URL, then pushing try: blob_req = request('GET', url) with NamedTemporaryFile("w", delete=True) as fd: fd.write(blob_req.read()) fd.flush() ddfs = DDFS(params.target_disco_master) if params.chunk: ddfs.chunk(params.target_tag, [fd.name]) else: ddfs.push(params.target_tag, [fd.name])