def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]
    assert c.name == db.concat([a, b]).name
    assert b.concat().name != a.concat().name
    assert b.concat().name == b.concat().name

    b = db.from_sequence([1, 2, 3]).map(lambda x: x * [1, 2, 3])
    assert list(b.concat()) == [1, 2, 3] * sum([1, 2, 3])
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]

    b = db.from_sequence([1, 2, 3]).map(lambda x: x * [1, 2, 3])
    assert list(b.concat()) == [1, 2, 3] * sum([1, 2, 3])
def get_IDF(words_by_document):
    unique_words = []
    for words_for_single_document in words_by_document:
        unique_words.append(words_for_single_document.distinct())
    large_bag = db.concat(unique_words)
    frequencies = large_bag.frequencies()
    idf = frequencies.map(
        lambda x: (x[0], round(math.log((len(words_by_document) + 1) / x[1]), 10)))
    return idf
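# A minimal, hypothetical usage sketch for get_IDF above (not from the original
# source): it assumes each document is already a dask bag of tokens, so that
# .distinct() and db.concat() behave as the function expects.
import math
import dask.bag as db

doc1 = db.from_sequence(["the", "cat", "sat", "the"])
doc2 = db.from_sequence(["the", "dog", "ran"])
words_by_document = [doc1, doc2]

# Each word counts once per document, so frequencies() yields document
# frequencies and the map computes log((N + 1) / df) rounded to 10 places.
idf = get_IDF(words_by_document)
print(sorted(idf.compute()))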
def _apply__flowly__tz__apply_map_concat(bag, transform, rules):
    return db.concat([
        bag.map_partitions(_apply_map_concat_impl, funcs=list(funcs), _flowly_id=flowly_id)
        # TODO: fix chunk_size
        for flowly_id, funcs in zip(id_sequence, partition_all(10, transform.funcs))
    ])
def _ddfs_to_bag(data, cube):
    if not isinstance(data, dict):
        data = {cube.seed_dataset: data}
    ktk_cube_dataset_ids = sorted(data.keys())
    bags = []
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
        bags.append(
            db.from_delayed(data[ktk_cube_dataset_id].to_delayed()).map_partitions(
                _convert_write_bag, ktk_cube_dataset_id=ktk_cube_dataset_id
            )
        )
    return (db.concat(bags), ktk_cube_dataset_ids)
def build_vocab(data_file, output_dir, size=50000, lang='en'):
    """Builds vocab of <size> from <data_file> and stores it in <output_dir>."""
    b = db.read_text(data_file).str.strip().str.lower()
    if lang == 'fr':  # 'is' compares identity, not equality; use '==' for strings
        b = b.str.replace(u'\u2019', u"'")
    b = b.map(lambda s: _TOKENIZER.findall(s)).concat().frequencies().topk(
        size - 4, lambda x: x[1]).pluck(0)
    a = db.from_sequence(_START_VOCAB)
    c = db.concat([a, b]).repartition(1)
    save_path = '%s/vocab%d.%s' % (output_dir, size, lang)
    with ProgressBar():
        c.to_textfiles([save_path])
    return save_path
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]
import dask.bag as db
from pyspark import SparkContext

if __name__ == '__main__':
    collection1 = [n for n in range(0, 6)]
    collection2 = [n for n in range(4, 10)]

    sc = SparkContext()
    rdd1 = sc.parallelize(collection1, 3)
    rdd2 = sc.parallelize(collection2, 3)
    res = rdd1.union(rdd2).collect()
    print(res)

    rdd1 = db.from_sequence(collection1, npartitions=3)
    rdd2 = db.from_sequence(collection2, npartitions=3)
    res = db.concat([rdd1, rdd2]).compute()
    print(res)
def _apply__flowly__tz__apply_concat(bag, transform, rules):
    return db.concat([apply(func, bag, rules=rules) for func in transform.funcs])
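# A rough illustration of the apply-concat pattern above, without flowly
# (the `apply` and `transform` objects are flowly internals and not shown here):
# each function sees the full bag and the per-function results are concatenated.
import dask.bag as db

bag = db.from_sequence([1, 2, 3])
funcs = [
    lambda b: b.map(lambda x: x * 10),
    lambda b: b.map(lambda x: -x),
]
combined = db.concat([func(bag) for func in funcs])
print(combined.compute())  # [10, 20, 30, -1, -2, -3]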
if config.HasField("covid_json_dir"): document_pipeline.perform_document_independent_tasks( config=config, documents=document_pipeline.get_covid_documents(config), ckpt_prefix="covid", semrep_work_dir=semrep_work_dir, ) # At this point, we are going to recover text sources using the checkpoint # module ############################################################################## parsed_sentences = dbag.concat([ checkpoint.checkpoint(name, verbose=False) for name in checkpoint.get_checkpoints_like("*parsed_sentences") ]) # Perform n-gram mining, introduces a new field "ngrams" ngram_sentences = ngram_util.get_frequent_ngrams( analyzed_sentences=parsed_sentences, max_ngram_length=config.phrases.max_ngram_length, min_ngram_support=config.phrases.min_ngram_support, min_ngram_support_per_partition=\ config.phrases.min_ngram_support_per_partition, ngram_sample_rate=config.phrases.ngram_sample_rate, ) ckpt("ngram_sentences") ngram_edges = graph_util.record_to_bipartite_edges( records=ngram_sentences,
from dask.distributed import Client
from utils import timer
import dask
import json
import dask.bag as db
import os

client = Client("tcp://localhost:8786")

if not os.path.exists("data"):
    os.makedirs('data', exist_ok=True)  # Create data/ directory

b = dask.datasets.make_people()  # Make records of people
b.map(json.dumps).to_textfiles('data/*.json')  # Encode as JSON, write to disk

b = db.read_text('data/*.json').map(json.loads)
b = db.concat([b for _ in range(400)])

with timer("Map/filt/count"):
    m = b.map(lambda record: record['occupation'])
    filt = m.filter(lambda record: len(record) > 6)
    res = m.map(lambda record: len(record))
    res = res.count().compute()
def test_concat_after_map():
    a = db.from_sequence([1, 2])
    b = db.from_sequence([4, 5])
    result = db.concat([a.map(inc), b])
    assert list(result) == [2, 3, 4, 5]
def _get_pairs(
    wikipedia_path=None,
    books_path=None,
    common_crawl_path=None,
    wikipedia_lang='en',
    target_seq_length=128,
    short_seq_prob=0.1,
    blocksize=None,
    num_blocks=None,
    duplicate_factor=5,
    sample_ratio=0.9,
    seed=12345,
    tokenizer=None,
    masking=False,
    masked_lm_ratio=0.15,
):
    vocab_words = tuple(tokenizer.vocab.keys())

    def _to_partition_pairs(partition_documents):
        partition_documents = tuple(partition_documents)
        partition_pairs = []
        for _ in range(duplicate_factor):
            for document_index in range(len(partition_documents)):
                partition_pairs.extend(
                    create_pairs_from_document(
                        partition_documents,
                        document_index,
                        max_seq_length=target_seq_length,
                        short_seq_prob=short_seq_prob,
                        masking=masking,
                        masked_lm_ratio=masked_lm_ratio,
                        vocab_words=vocab_words,
                    ))
        random.shuffle(partition_pairs)
        return partition_pairs

    if num_blocks is not None:
        if blocksize is not None:
            raise ValueError(
                'Only one of num_blocks or blocksize needs to be set!')
        blocksize = estimate_block_size(
            (wikipedia_path, books_path, common_crawl_path),
            num_blocks,
        )

    bags = []
    if wikipedia_path is not None:
        bags.append(
            read_wikipedia(
                wikipedia_path,
                lang=wikipedia_lang,
                blocksize=blocksize,
                sample_ratio=sample_ratio,
                sample_seed=seed,
            ))
    if books_path is not None:
        bags.append(
            read_books(
                books_path,
                blocksize=blocksize,
                sample_ratio=sample_ratio,
                sample_seed=seed,
            ))
    if common_crawl_path is not None:
        bags.append(
            read_common_crawl(
                common_crawl_path,
                blocksize=blocksize,
                sample_ratio=sample_ratio,
                sample_seed=seed,
            ))

    bag_texts = db.concat(bags)
    bag_texts = _shuffle_bag_texts(bag_texts)
    bag_documents = _get_documents(bag_texts, tokenizer)
    return bag_documents.map_partitions(_to_partition_pairs)
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]
    assert c.name == db.concat([a, b]).name
def concat(cls, hashbags):
    return hashbags[0].new(db.concat([hb.bag for hb in hashbags]))
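# Hypothetical minimal context for the classmethod above (the real HashBag
# class is not shown in this snippet): it only assumes that each instance
# wraps a dask bag in `.bag` and that `.new(...)` builds an instance of the
# same class around a different bag.
import dask.bag as db

class HashBag:
    def __init__(self, bag):
        self.bag = bag

    def new(self, bag):
        return type(self)(bag)

    @classmethod
    def concat(cls, hashbags):
        return hashbags[0].new(db.concat([hb.bag for hb in hashbags]))

merged = HashBag.concat([HashBag(db.from_sequence([1])), HashBag(db.from_sequence([2]))])
print(merged.bag.compute())  # [1, 2]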
# Replace bag with result of ckpt, typically with save / load
globals()[name] = dask_checkpoint.checkpoint(
    bag, name=name, checkpoint_dir=checkpoint_dir, **ckpt_kwargs)
if config.HasField("stop_after_ckpt") and config.stop_after_ckpt == name:
    print("Stopping early.")
    exit(0)

##############################################################################
# BEGIN PIPELINE                                                             #
##############################################################################

documents = get_medline_documents(config, download_shared)

if config.HasField("covid_json_dir"):
    documents = dbag.concat([
        documents,
        get_covid_documents(config),
    ])
ckpt("documents")

# Split documents into sentences, filter out too-long and too-short sentences.
sentences = documents.map_partitions(
    text_util.split_sentences,
    # --
    min_sentence_len=config.parser.min_sentence_len,
    max_sentence_len=config.parser.max_sentence_len,
)
ckpt("sentences")

# Add POS tagging, lemmas, entities, and additional data to each sentence
sentences_with_lemmas = sentences.map_partitions(
    text_util.analyze_sentences,