def test__read_text(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())
        with hdfs.open('%s/text.2.txt' % basedir, 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())
        with hdfs.open('%s/other.txt' % basedir, 'wb') as f:
            f.write('a b\nc d'.encode())

        b = db.read_text('hdfs://%s/text.*.txt' % basedir)
        yield gen.sleep(0.5)
        assert not s.tasks

        import dask
        b.compute(get=dask.get)

        coll = b.str.strip().str.split().map(len)
        future = c.compute(coll)
        yield gen.sleep(0.5)
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = db.read_text('hdfs://%s/other.txt' % basedir)
        b = c.persist(b)
        future = c.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']

def test__read_text(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())
        with hdfs.open('/tmp/test/text.2.txt', 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())
        with hdfs.open('/tmp/test/other.txt', 'wb') as f:
            f.write('a b\nc d'.encode())

        b = db.read_text('hdfs:///tmp/test/text.*.txt', collection=True)
        yield gen.sleep(0.5)
        assert not s.tasks

        import dask
        b.compute(get=dask.get)

        future = e.compute(b.str.strip().str.split().map(len))
        yield gen.sleep(0.5)
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = db.read_text('hdfs:///tmp/test/other.txt', collection=True)
        b = e.persist(b)
        future = e.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']

        L = db.read_text('hdfs:///tmp/test/text.*.txt', collection=False)
        assert all(isinstance(x, Delayed) for x in L)

def test_read_text():
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert set(line.strip() for line in db.read_text(fns)) == set('ABCD')
        assert set(line.strip() for line in db.read_text('a*.log')) == set('ABCD')

        assert raises(ValueError, lambda: db.read_text('non-existent-*-path'))

def test_read_text_large_gzip():
    with tmpfile('gz') as fn:
        f = GzipFile(fn, 'wb')
        f.write(b'Hello, world!\n' * 100)
        f.close()

        with pytest.raises(ValueError):
            b = db.read_text(fn, blocksize=100, linedelimiter='\n')
        c = db.read_text(fn)
        assert c.npartitions == 1

def test_read_text_encoding():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write((u'你好!' + os.linesep).encode('gb18030') * 100)
        b = db.read_text(fn, blocksize=100, encoding='gb18030')
        c = db.read_text(fn, encoding='gb18030')
        assert len(b.dask) > 5
        assert (list(map(lambda x: x.encode('utf-8'), b)) ==
                list(map(lambda x: x.encode('utf-8'), c)))

        d = db.read_text([fn], blocksize=100, encoding='gb18030')
        assert list(b) == list(d)

def test_read_text_large():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(('Hello, world!' + os.linesep).encode() * 100)
        b = db.read_text(fn, blocksize=100)
        c = db.read_text(fn)
        assert len(b.dask) > 5
        assert list(map(str, b)) == list(map(str, c))

        d = db.read_text([fn], blocksize=100)
        assert list(b) == list(d)

def test_read_text_large():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(('Hello, world!' + os.linesep).encode() * 100)
        b = db.read_text(fn, blocksize=100)
        c = db.read_text(fn)
        assert len(b.dask) > 5
        assert list(map(str, b.str.strip())) == list(map(str, c.str.strip()))

        d = db.read_text([fn], blocksize=100)
        assert list(b) == list(d)

def partition_files(fn="map.osm", force_refresh=False): """ parameters ---------- fn: (str) is the name of file that should exist returns: ------- a dask bag of python dictionaries side-effect: ------- create a single file eg. map_slxml.osm in which every line is a single xml element """ assert (os.path.exists(fn)) fn_prefix = fn.split('.')[0] mod_time_map = os.path.getmtime(fn) mod_time_p1 = os.path.getmtime(fn_prefix + '_partition_0.osm') if os.path.exists( fn_prefix + '_partition_0.osm') else 0 if mod_time_map < mod_time_p1 and not force_refresh: print("partition files still fresh") b = db.read_text(fn_prefix + '_partition_*.osm') else: print("making fresh partition files 9") out_fn = fn_prefix + '_slxml.osm' # create file with one element per line # IMPORTANT: must write text with 'utf-8' otherwise it will choke # trying to write ascii with open(out_fn, 'w', encoding='utf-8') as outputf: for i, e in enumerate(get_element(fn)): # from docs: if encoding is _not_ 'unicode' a byte string will be generated(!) s = ET.tostring(e, encoding='unicode') s1 = ' '.join(s.strip().split()) outputf.write(s1 + '\n') line_count = i num_cores = multiprocessing.cpu_count() partition_length = line_count // num_cores + 1 fsize_bytes = os.path.getsize(out_fn) # create multi-partitioned dask bag from file with 1 element per line # it seems like we are obliged to strip the line ending... b = db.read_text(out_fn, fsize_bytes // num_cores + 1).map(str.strip) # create partitioned files from dask bag b.to_textfiles(fn_prefix + '_partition_*.osm') return b.map(ET.XML).map(element2dict)
def test_read_text_large_gzip():
    with tmpfile('gz') as fn:
        f = GzipFile(fn, 'wb')
        f.write(b'Hello, world!\n' * 100)
        f.close()

        with pytest.raises(ValueError):
            db.read_text(fn, blocksize=100, linedelimiter='\n')
        c = db.read_text(fn)
        assert c.npartitions == 1

def test_read_text_large_gzip():
    with tmpfile("gz") as fn:
        f = GzipFile(fn, "wb")
        f.write(b"Hello, world!\n" * 100)
        f.close()

        with pytest.raises(ValueError):
            db.read_text(fn, blocksize=50, linedelimiter="\n")
        c = db.read_text(fn)
        assert c.npartitions == 1

def test_read_text_encoding():
    with tmpfile() as fn:
        with open(fn, "wb") as f:
            f.write((u"你好!" + os.linesep).encode("gb18030") * 100)
        b = db.read_text(fn, blocksize=100, encoding="gb18030")
        c = db.read_text(fn, encoding="gb18030")
        assert len(b.dask) > 5
        assert list(b.str.strip().map(lambda x: x.encode("utf-8"))) == list(
            c.str.strip().map(lambda x: x.encode("utf-8")))

        d = db.read_text([fn], blocksize=100, encoding="gb18030")
        assert list(b) == list(d)

def test_read_text_large_gzip():
    with tmpfile("gz") as fn:
        data = b"Hello, world!\n" * 100
        f = GzipFile(fn, "wb")
        f.write(data)
        f.close()

        with pytest.raises(ValueError):
            # not allowed blocks when compressed
            db.read_text(fn, blocksize=50, linedelimiter="\n")
        c = db.read_text(fn, blocksize=None)
        assert c.npartitions == 1
        assert "".join(c.compute()) == data.decode()

def execute_queries(settings, query_parameters, output_path):
    # TOPOLISH: test and doc
    # Create dask delayed object, make rdkit mol object if possible.
    if settings['reactant_db']:
        input_bag = db.read_text(settings['reactant_db'],
                                 blocksize=settings["partition_size"])
        input_bag = input_bag.map(lambda x: Chem.MolFromSmiles(x)).remove(
            lambda x: x is None)
    else:
        print('Warning no reactant database defined, this only works if all '
              'reactants are loaded from sequence or file.')

    # Creating a list of product bags and the compute section.
    reactant_bags = []
    for i, query in enumerate(query_parameters):
        # Query and repartition mol database
        if query["from_file"]:
            bag = db.read_text(query['from_file'])
            bag = bag.map(lambda x: Chem.MolFromSmiles(x)).remove(
                lambda x: x is None)
        elif query["from_sequence"]:
            bag = db.from_sequence(query["from_sequence"])
            bag = bag.map(lambda x: Chem.MolFromSmiles(x)).remove(
                lambda x: x is None)
        else:
            bag = construct_query(query, input_bag)
        reactant_bags.append(bag)

    # Compute and enumerate.
    print('Querying database and/or reading input files...\n')
    with ProgressBar():
        reactants_lists = dask.compute(*reactant_bags)
    # reactants_lists = [common.zfill_enum(i, 6) for i in reactants_lists]
    reactants_lists = [helpers.enum(i) for i in reactants_lists]

    # Write reactants to file.
    print('Writing reactants to file...')
    written = []
    for i, reactants_list in enumerate(reactants_lists):
        output_file = output_path + f"reactant{str(i).zfill(2)}.smi"
        written.append(output_file)
        with open(output_file, 'w') as f:
            f.write("canonical_smiles\tindex")
            for line in reactants_list:
                f.write(f"\n{Chem.MolToSmiles(line[0])}\t{line[1]}")

    return reactants_lists

def load_data(path, chunks):
    raw_bag = (read_text(path)
               .str.strip()
               .map(row_to_numpy))
    return da.stack(raw_bag, axis=0)

def combine_files(prefix):
    """Takes a prefix and finds all files of the form {prefix}[*1].[*2].txt and
    combines all the files with [*1] into one .txt file with name {prefix}[*1].txt.

    update 2-03-2020: now also enumerates compounds.

    Parameters
    ----------
    prefix: str

    Returns
    -------
    None
    """
    files = glob.glob(f"{prefix}*_*.txt")
    cluster_prefixes = set(['.'.join(x.split('.')[:-2]) for x in files])

    prefix_dict = {}
    for p in cluster_prefixes:
        prefix_dict[p] = glob.glob(f"{p}.*.txt")

    for k, v in prefix_dict.items():
        bag = db.read_text(v)
        with gzip.open(f'{k}.txt.gz', 'wt') as f:
            lines = [line.strip('\n') for line in bag.compute()]
            content = ''.join([f'{b}\t{a}\n' for a, b in enumerate(lines)])
            f.write(content)
        [os.remove(x) for x in v]

def test_washing_with_dask(self):
    """Bit more elaborate of a test to see if rdkit handles a set of molecules
    in a consistent way in coming versions and to see how dask handles the
    used chem_functions functions.
    """
    expected = ['CC(C)=CCC/C(C)=C\\CC/C(C)=C\\CO',
                'CC12CC(O)C(CC1=O)C2(C)C',
                'Oc1cc(C2CCNCC2)on1',
                'Cn1ncc2cc(CN)ccc21',
                'O=C(O)c1cc(Cl)cs1',
                'Cc1cc(CN)ncc1Br',
                'CO[C@@H](C)[C@@H](N)C(=O)O',
                'Nc1ccc(Br)c(F)c1[N+](=O)[O-]',
                'Cc1ccc(F)c(C#N)n1',
                'Cc1ccc(F)c(CN)n1']

    from rdkit import Chem
    from MCR import chem_functions
    import dask.bag as db
    from rdkit import RDLogger, rdBase

    rdBase.DisableLog('rdApp.error')
    RDLogger.DisableLog('rdApp.info')

    bag = db.read_text("../tests/test_data/test_db.smi", blocksize=16e6)
    bag = bag.map(lambda x: Chem.MolFromSmiles(x)).filter(lambda x: x is not None)
    bag = bag.map(chem_functions.remove_salts_mol)
    bag = bag.map(chem_functions.decharge_mol)
    bag = bag.map(chem_functions.get_largest_fragment_mol)
    bag = bag.map(chem_functions.standardize_mol)

    self.assertEqual([Chem.MolToSmiles(x) for x in bag.take(10)], expected)

def test_bag():
    b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=2)
    b.compute()

    # b = db.read_text('myfile.txt')
    # b = db.read_text('myfile.*.txt.gz')
    b = db.read_text('myfile.*.csv').str.strip().str.split(',')

    # to_textfiles
    # Write the dask Bag to disk: one file per partition, one line per element.
    def name(i):
        return str(date(2015, 1, 1) + i * timedelta(days=1))

    b.to_textfiles('/path/to/data/*.json.gz', name_function=name)

    b = db.from_sequence([{
        'name': 'Alice',
        'balance': 100
    }, {
        'name': 'Bob',
        'balance': 200
    }, {
        'name': 'Charlie',
        'balance': 300
    }], npartitions=2)
    df = b.to_dataframe()
    df.head()

def get_files(src):
    try:
        b = db.read_text(src).map(json.loads)
    except json.decoder.JSONDecodeError:
        print("bork")
    print(b.take(2))

def extract(indir, outdir, nb_vizdir, context_len):
    '''Get all dataset records from solution cells.'''
    logger.info('')

    def nb_to_rec_format(nb):
        return {'contents': json.dumps(nb),
                'repo': nb['metadata']['repo'],
                'path': nb['metadata']['path']}

    nbs = (db.read_text(indir + '/*.jsonl', blocksize='120mib')
           .map(json.loads)
           # for compatibility with extraction code above change back to rec format, so
           # we can group by repo
           .map(nb_to_rec_format)
           .to_dataframe())

    with ProgressBar():
        nb_cells = nbs.groupby('repo').apply(
            (lambda group: get_solution_cells(group, nb_vizdir, context_len)),
            meta=object).compute()

    logger.info('num repos %s', len(nb_cells))
    logger.info('num repos at least 1 solution extracted %s',
                len([c for c in nb_cells if c]))

    nb_cells = nb_cells.tolist()
    cells = [c for n in nb_cells for c in n if n and c]
    logger.info('num extracted solution cells %s', len(cells))
    jdumpl(cells, outdir + '/cells.jsonl')

def test_gh715():
    bin_data = u'\u20ac'.encode('utf-8')
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(bin_data)
        a = db.read_text(fn)
        assert a.compute()[0] == bin_data.decode('utf-8')

def read_text(path, encoding='utf-8', errors='strict', lineterminator='\n',
              executor=None, hdfs=None, lazy=True, collection=True):
    """ Read text lines from HDFS

    Parameters
    ----------
    path: string
        filename or globstring of files on HDFS
    collection: boolean, optional
        Whether or not to return a high level collection
    lazy: boolean, optional
        Whether or not to start reading immediately

    Returns
    -------
    Dask bag (if collection=True) or Futures or dask values
    """
    warn("hdfs.read_text moved to dask.bag.read_text('hdfs://...')")
    import dask.bag as db
    result = db.read_text('hdfs://' + path, encoding=encoding, errors=errors,
                          linedelimiter=lineterminator, hdfs=hdfs,
                          collection=collection)
    executor = default_executor(executor)
    ensure_default_get(executor)
    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result

def read_text(path, encoding='utf-8', errors='strict', lineterminator='\n',
              executor=None, hdfs=None, lazy=True, collection=True):
    warn("hdfs.read_text moved to dask.bag.read_text('hdfs://...')")
    import dask.bag as db
    result = db.read_text('hdfs://' + path, encoding=encoding, errors=errors,
                          linedelimiter=lineterminator, hdfs=hdfs,
                          collection=collection)
    executor = default_executor(executor)
    ensure_default_get(executor)
    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result

def load(self):
    if self.ext == "json":
        return db.read_text(self.loc, blocksize=1e7).map(json.loads)
    elif self.ext == "pickle":
        return db.from_sequence(glob(self.loc))
    else:
        raise Exception(self.ext + " ChunkedFile not supported")

def test_gh715():
    bin_data = u"\u20ac".encode("utf-8")
    with tmpfile() as fn:
        with open(fn, "wb") as f:
            f.write(bin_data)
        a = db.read_text(fn)
        assert a.compute()[0] == bin_data.decode("utf-8")

def preprocess(jsonl):
    """Ingest data and preprocess comment text"""
    bag = (
        db.read_text(jsonl, blocksize="10MiB")
        .map(json.loads)
        .map(
            lambda r: {
                "created_utc": r["created_utc"],
                "subreddit": r["subreddit"],
                "text": regex_replace(r["body"]),
            }
        )
    )
    df = bag.to_dataframe()
    df = df[df["text"].str.len() > 30]
    df["created_utc"] = dd.to_datetime(df["created_utc"], unit="s")

    ## dask and spacy multiprocessing don't play nicely
    ## nlp.pipe might not be the fastest way to preprocess
    df = df.compute()
    df["tokens"] = tokenize(df["text"].astype("unicode"))
    df = df[df["tokens"] != ""]
    df = df.drop("text", axis=1)
    return df

def test_to_textfiles_name_function_preserves_order():
    seq = ["a", "b", "c", "d", "e", "f", "g", "h",
           "i", "j", "k", "l", "m", "n", "o", "p"]
    b = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        b.to_textfiles(dn)
        out = (db.read_text(os.path.join(dn, "*"), encoding="ascii")
               .map(str).map(str.strip).compute())
        assert seq == out

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--from_dir_prefix')
    parser.add_argument('-t', '--to_dir_prefix')
    parser.add_argument('-u', '--path2udp_model',
                        default='russian-syntagrus-ud-2.0-170801.udpipe')
    parser.add_argument('-n', '--cpu_n', default=5, type=int)
    args = parser.parse_args()

    # dfunc.MODELFILE4UDPIPE = args.path2udp_model
    # dfunc.set_model(args.path2udp_model)
    with dask.config.set(pool=ThreadPool(args.cpu_n)):
        bag = db.read_text(args.from_dir_prefix)
        pbar = ProgressBar()
        pbar.register()
        ddf = bag.to_dataframe(columns=['text'])
        ddf['text'] = ddf['text'].apply(dfunc.skip_empty, meta=('x', 'f8'))
        ddf = ddf.dropna()
        ddf['rec'] = ddf['text'].apply(dfunc.get_rec_info, meta=('x', 'f8'))
        ddf['text'] = ddf['text'].apply(dfunc.spec_tok_add, meta=('x', 'f8'))
        ddf['norm_text'] = ddf['text'].apply(dfunc.normalization1, meta=('x', 'f8'))
        # udpipe_sent_and_tok = dfunc.get_udpipe_sent_and_tok(args.path2udp_model)
        # udpipe_sent_and_tok = dfunc.get_udpipe_sent_and_tok('/home/den/Documents/elmo/data_preparing/rutwitter/russian-syntagrus-ud-2.0-170801.udpipe')
        ddf['norm_text'] = ddf['text'].apply(dfunc.udpipe_sent_and_tok, meta=('x', 'f8'))
        # ddf['norm_text'] = ddf['text'].apply(dfunc.nltk_sent_and_tok, meta=('x', 'f8'))
        ddf['norm_text'] = ddf['norm_text'].apply(dfunc.normalization2, meta=('x', 'f8'))
        ddf['rec_text'] = ddf.apply(dfunc.recovery, meta=('x', 'f8'), axis=1)
        ddf['cleaned_text'] = ddf['norm_text'].apply(dfunc.lower_case, meta=('x', 'f8'))
        ddf[['rec_text', 'cleaned_text']].to_csv(args.to_dir_prefix)

def filter_cells_nl_above_dataframe(cells_indir, cells_outdir, nbs_indir, max_dist):
    '''Filter cells if markdown is more than max_dist away.'''
    shutil.rmtree(cells_outdir, ignore_errors=True)
    Path(cells_outdir).mkdir(exist_ok=True)
    logging.info(f'Num cells before nl dist %s filter %s', max_dist,
                 db.read_text(cells_indir + '/*.jsonl').count().compute())

    # this is required by dask
    cells_meta = {'cell_type': str, 'execution_count': int, 'metadata': object,
                  'outputs': object, 'source': str, 'og_cell': object,
                  'nb_index': int}
    # nb_index is added by the add_key function so important to have it
    nb_meta = {'cells': object, 'metadata': object, 'nbformat': int,
               'nbformat_minor': int, 'nb_index': int}

    cells_df = db.read_text(cells_indir + '/*.jsonl').map(json.loads).map(
        lambda js: add_key(js, cell=True)).to_dataframe(meta=cells_meta)
    nbs_df = db.read_text(nbs_indir + '/*.jsonl').map(json.loads).map(
        lambda js: add_key(js)).to_dataframe(meta=nb_meta)

    def get_cell(row):
        '''if not nl above return high int representing infinite distance'''
        cells = row.cells
        cell_index = row.metadata_cell['cell_index']
        reversed_cells_before = reversed(cells[:cell_index])
        # iterate over all cells above starting with one directly above
        for i, c in enumerate(reversed_cells_before):
            if i + 1 <= max_dist and is_markdown(c):
                return row.og_cell
                # print(row.og_cell['metadata']['nb_orig_url'])
        return {}

    with ProgressBar(minimum=15):
        (cells_df.merge(nbs_df, on='nb_index', suffixes=['_cell', '_nb'])
         .apply(get_cell, meta=object, axis=1).to_bag()
         .filter(lambda js: 'source' in js)
         .map(json.dumps).to_textfiles(cells_outdir + '/*.jsonl'))

    logging.info(f'Num cells after nl dist %s filter %s', max_dist,
                 db.read_text(cells_outdir + '/*.jsonl').count().compute())

def get_code_context_records(cells_indir, cells_outdir, nbs_indir, context_len, max_tokens):
    '''Convert cells into the dataset format where each record will store the context/code pairs.'''
    logger.info('')
    shutil.rmtree(cells_outdir, ignore_errors=True)
    Path(cells_outdir).mkdir(exist_ok=True)

    # this is required by dask
    cells_meta = {
        'cell_type': str,
        'execution_count': int,
        'metadata': object,
        'outputs': object,
        'source': str,
        'og_cell': object,
        'nb_index': int
    }
    # nb_index is added by the add_key function so important to have it
    nb_meta = {
        'cells': object,
        'metadata': object,
        'nbformat': int,
        'nbformat_minor': int,
        'nb_index': int
    }

    # add nb_index key first to be able to join
    cells_df = db.read_text(cells_indir + '/*.jsonl').map(
        json.loads).map(lambda js: add_key(js, cell=True)).to_dataframe(
            meta=cells_meta)
    nbs_df = db.read_text(nbs_indir + '/*.jsonl').map(
        json.loads).map(lambda js: add_key(js)).to_dataframe(meta=nb_meta)

    with ProgressBar(minimum=15):
        # join each cell with nb to compute dataset record
        (cells_df.merge(nbs_df, on='nb_index', suffixes=['_cell', '_nb'])
         .apply(compute_dataset_record, context_len=context_len,
                max_tokens=max_tokens, meta=object, axis=1).to_bag()
         # records with len greater than max tokens will be none so we filter for valid records
         .filter(lambda js: js and js['code_tokens'])
         .map(json.dumps).to_textfiles(cells_outdir + '/*.jsonl'))

def test_to_textfiles_name_function_preserves_order():
    seq = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
           'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p']
    b = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        b.to_textfiles(dn)
        out = db.read_text(os.path.join(dn, "*"),
                           encoding='ascii').map(str).map(str.strip).compute()
        assert seq == out

def test_read_text_compression(e, s, a, b):
    import dask.bag as db
    b = db.read_text('s3://distributed-test/csv/gzip/*', compression='gzip',
                     blocksize=None, storage_options=dict(anon=True))
    result = yield e.compute(b)._result()
    assert result == [line + '\n'
                      for k in sorted(csv_files)
                      for line in csv_files[k].decode().split('\n')
                      if line]

def filter_boilerplate(input_bucket, output_bucket):
    boilerplate_dataframe_path = '/home/romanell/Downloads/bp.pkl'
    t = Timer()
    bp_df = from_pandas(pd.read_pickle(boilerplate_dataframe_path),
                        chunksize=10000).reset_index().persist()
    nps = list_newspapers(input_bucket)

    for np in nps:
        passim_rebuilt_files = fixed_s3fs_glob(f'{os.path.join(input_bucket, np)}/*.bz2')
        # we want to keep the number of resulting files as the one of input files
        n_partitions = len(passim_rebuilt_files)
        print(f'Crunching {np}: {len(passim_rebuilt_files)} files')

        # detect whether the current item has already been processed
        existing_files = fixed_s3fs_glob(f'{output_bucket}{np}*.bz2')

        # skip newspapers that don't need to be processed
        if np == 'NZZ':
            print('NZZ, skipping')
            continue
        elif len(existing_files) > 0:
            print(f'{np} already done, move on')
            continue

        passim_data_df = (
            db.read_text(passim_rebuilt_files, storage_options=IMPRESSO_STORAGEOPT)
            .map(json.loads)
            .map(lambda d: {'id': d['id'], 'document': d})
            .to_dataframe()
            .set_index('id')
            .persist()
        )

        np_bp_df = bp_df[bp_df.id.str.contains(np)].set_index('id').compute()
        tmp_df = passim_data_df.join(np_bp_df, how='outer')
        filtered_df = tmp_df[tmp_df.is_boilerplate.isnull()]

        output_files = [
            f'{output_bucket}{np}-{str(n+1).zfill(4)}.jsonl.bz2'
            for n, f in enumerate(passim_rebuilt_files)
        ]

        future = (
            filtered_df.reset_index()
            .to_bag()
            .map(lambda i: i[1])
            .map(json.dumps)
            .repartition(n_partitions)
            .to_textfiles(output_files, storage_options=IMPRESSO_STORAGEOPT)
        )

        print(f'Written {len(output_files)} output files; first five: {output_files[:5]}')
        print(f'Done with {np}. It took: {t.tick()}')
        print('------------------------------------')

def test_read_text_sync(loop):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/data.txt', 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop):
                b = db.read_text('hdfs:///tmp/test/*.txt')
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']

def test_from_s3():
    # note we don't test connection modes with aws_access_key and
    # aws_secret_key because these are not on travis-ci
    pytest.importorskip('s3fs')

    five_tips = (u'total_bill,tip,sex,smoker,day,time,size\n',
                 u'16.99,1.01,Female,No,Sun,Dinner,2\n',
                 u'10.34,1.66,Male,No,Sun,Dinner,3\n',
                 u'21.01,3.5,Male,No,Sun,Dinner,3\n',
                 u'23.68,3.31,Male,No,Sun,Dinner,2\n')

    # test compressed data
    e = db.read_text('s3://tip-data/t*.gz', storage_options=dict(anon=True))
    assert e.take(5) == five_tips

    # test all keys in bucket
    c = db.read_text('s3://tip-data/*', storage_options=dict(anon=True))
    assert c.npartitions == 4

def one_func_max_api_seq(cells_indir, cells_outdir, max_api_seq_len, min_api_seq_len, code_key):
    # assert '/scratch/jupyter-pipeline' in cells_outdir or cells_outdir.startswith('/tmp')
    shutil.rmtree(cells_outdir, ignore_errors=True)
    Path(cells_outdir).mkdir(exist_ok=True)

    bag = db.read_text(cells_indir + '/*.jsonl').map(json.loads)
    kernel_type = \
        bag.map(lambda cell: logic_type(cell, max_api_seq_len, min_api_seq_len, code_key)) \
           .frequencies() \
           .topk(k=50, key=lambda tup: tup[1])
    logger.info('Counts of function/class type %s', kernel_type.compute())

    with ProgressBar(minimum=15):
        db.read_text(cells_indir + '/*.jsonl').map(json.loads) \
            .filter(lambda cell: logic_type(cell, max_api_seq_len, min_api_seq_len, code_key)
                    in ['1 function', 'pure logic', 'boilerplate']) \
            .map(json.dumps).to_textfiles(cells_outdir + '/*.jsonl')

def test_read_text_sync(loop):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/data.txt' % basedir, 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Client(s['address'], loop=loop) as e:
                b = db.read_text('hdfs://%s/*.txt' % basedir)
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']

def test_from_s3():
    # note we don't test connection modes with aws_access_key and
    # aws_secret_key because these are not on travis-ci
    s3fs = pytest.importorskip('s3fs')

    five_tips = (u'total_bill,tip,sex,smoker,day,time,size\n',
                 u'16.99,1.01,Female,No,Sun,Dinner,2\n',
                 u'10.34,1.66,Male,No,Sun,Dinner,3\n',
                 u'21.01,3.5,Male,No,Sun,Dinner,3\n',
                 u'23.68,3.31,Male,No,Sun,Dinner,2\n')

    # test compressed data
    e = db.read_text('s3://tip-data/t*.gz', storage_options=dict(anon=True))
    assert e.take(5) == five_tips

    # test all keys in bucket
    c = db.read_text('s3://tip-data/*', storage_options=dict(anon=True))
    assert c.npartitions == 4

def test_read_text_sync(loop):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/data.txt' % basedir, 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Client(('127.0.0.1', s['port']), loop=loop):
                b = db.read_text('hdfs://%s/*.txt' % basedir)
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']

def test__read_text_json_endline(e, s, a):
    import json
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write(b'{"x": 1}\n{"x": 2}\n')

        b = db.read_text('hdfs://%s/text.1.txt' % basedir).map(json.loads)
        result = yield e.compute(b)._result()

        assert result == [{"x": 1}, {"x": 2}]

def test__read_text_unicode(e, s, a, b):
    data = b'abcd\xc3\xa9'
    with make_hdfs() as (hdfs, basedir):
        fn = '%s/data.txt' % basedir
        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = db.read_text('hdfs://' + fn, collection=False)
        result = yield e.compute(f[0])._result()

        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0].strip()) == 5

def read_text(fn, keyname=None, encoding='utf-8', errors='strict',
              lineterminator='\n', executor=None, fs=None, lazy=True,
              collection=True, blocksize=2**27, compression=None):
    warn("distributed.s3.read_text(...) Moved to "
         "dask.bag.read_text('s3://...')")
    if keyname is not None:
        if not keyname.startswith('/'):
            keyname = '/' + keyname
        fn = fn + keyname
    import dask.bag as db
    result = db.read_text('s3://' + fn, encoding=encoding, errors=errors,
                          linedelimiter=lineterminator, collection=collection,
                          blocksize=blocksize, compression=compression, s3=fs)
    executor = default_executor(executor)
    ensure_default_get(executor)
    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result

import json
import os

import dask.bag as db

bag = db.read_text(os.path.join('..', 'data', 'accounts.*.json.gz'))
js = bag.map(json.loads)


def sum_amount(total, user):
    transactions = user['transactions']
    return total + sum(transaction['amount'] for transaction in transactions)


js.foldby(key='name', binop=sum_amount, initial=0,
          combine=lambda x, y: x + y, combine_initial=0).compute()

def bag_to_iterator(x, **kwargs):
    return db.read_text([tf.path for tf in x])