def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]
    assert c.name == db.concat([a, b]).name
    assert b.concat().name != a.concat().name
    assert b.concat().name == b.concat().name

    b = db.from_sequence([1, 2, 3]).map(lambda x: x * [1, 2, 3])
    assert list(b.concat()) == [1, 2, 3] * sum([1, 2, 3])
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]

    b = db.from_sequence([1, 2, 3]).map(lambda x: x * [1, 2, 3])
    assert list(b.concat()) == [1, 2, 3] * sum([1, 2, 3])
def get_IDF(words_by_document):
    unique_words = []
    for words_for_single_document in words_by_document:
        unique_words.append(words_for_single_document.distinct())
    large_bag = db.concat(unique_words)
    frequencies = large_bag.frequencies()
    idf = frequencies.map(
        lambda x: (x[0], round(math.log((len(words_by_document) + 1) / x[1]), 10)))
    return idf
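# A minimal, hypothetical usage sketch for get_IDF above (not from the original
# source): it assumes each document is already a dask bag of tokens, so that
# .distinct() and db.concat() behave as the function expects.
import math
import dask.bag as db

doc1 = db.from_sequence(["the", "cat", "sat", "the"])
doc2 = db.from_sequence(["the", "dog", "ran"])
words_by_document = [doc1, doc2]

# Each word counts once per document, so frequencies() yields document
# frequencies and the map computes log((N + 1) / df) rounded to 10 places.
idf = get_IDF(words_by_document)
print(sorted(idf.compute()))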
def _apply__flowly__tz__apply_map_concat(bag, transform, rules):
    return db.concat([
        bag.map_partitions(_apply_map_concat_impl, funcs=list(funcs), _flowly_id=flowly_id)
        # TODO: fix chunk_size
        for flowly_id, funcs in zip(id_sequence, partition_all(10, transform.funcs))
    ])
def _ddfs_to_bag(data, cube):
    if not isinstance(data, dict):
        data = {cube.seed_dataset: data}
    ktk_cube_dataset_ids = sorted(data.keys())
    bags = []
    for ktk_cube_dataset_id in ktk_cube_dataset_ids:
        bags.append(
            db.from_delayed(data[ktk_cube_dataset_id].to_delayed()).map_partitions(
                _convert_write_bag, ktk_cube_dataset_id=ktk_cube_dataset_id
            )
        )
    return (db.concat(bags), ktk_cube_dataset_ids)
def build_vocab(data_file, output_dir, size=50000, lang='en'):
    """Builds vocab of <size> from <data_file> and stores it in <output_dir>."""
    b = db.read_text(data_file).str.strip().str.lower()
    if lang == 'fr':  # 'is' compares identity, not equality; use '==' for strings
        b = b.str.replace(u'\u2019', u"'")
    b = b.map(lambda s: _TOKENIZER.findall(s)).concat().frequencies().topk(
        size - 4, lambda x: x[1]).pluck(0)
    a = db.from_sequence(_START_VOCAB)
    c = db.concat([a, b]).repartition(1)
    save_path = '%s/vocab%d.%s' % (output_dir, size, lang)
    with ProgressBar():
        c.to_textfiles([save_path])
    return save_path
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]
import dask.bag as db
from pyspark import SparkContext

if __name__ == '__main__':
    collection1 = [n for n in range(0, 6)]
    collection2 = [n for n in range(4, 10)]

    sc = SparkContext()
    rdd1 = sc.parallelize(collection1, 3)
    rdd2 = sc.parallelize(collection2, 3)
    res = rdd1.union(rdd2).collect()
    print(res)

    rdd1 = db.from_sequence(collection1, npartitions=3)
    rdd2 = db.from_sequence(collection2, npartitions=3)
    res = db.concat([rdd1, rdd2]).compute()
    print(res)
def _apply__flowly__tz__apply_concat(bag, transform, rules):
    return db.concat([apply(func, bag, rules=rules) for func in transform.funcs])
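# A rough illustration of the apply-concat pattern above, without flowly
# (the `apply` and `transform` objects are flowly internals and not shown here):
# each function sees the full bag and the per-function results are concatenated.
import dask.bag as db

bag = db.from_sequence([1, 2, 3])
funcs = [
    lambda b: b.map(lambda x: x * 10),
    lambda b: b.map(lambda x: -x),
]
combined = db.concat([func(bag) for func in funcs])
print(combined.compute())  # [10, 20, 30, -1, -2, -3]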
if config.HasField("covid_json_dir"): document_pipeline.perform_document_independent_tasks( config=config, documents=document_pipeline.get_covid_documents(config), ckpt_prefix="covid", semrep_work_dir=semrep_work_dir, ) # At this point, we are going to recover text sources using the checkpoint # module ############################################################################## parsed_sentences = dbag.concat([ checkpoint.checkpoint(name, verbose=False) for name in checkpoint.get_checkpoints_like("*parsed_sentences") ]) # Perform n-gram mining, introduces a new field "ngrams" ngram_sentences = ngram_util.get_frequent_ngrams( analyzed_sentences=parsed_sentences, max_ngram_length=config.phrases.max_ngram_length, min_ngram_support=config.phrases.min_ngram_support, min_ngram_support_per_partition=\ config.phrases.min_ngram_support_per_partition, ngram_sample_rate=config.phrases.ngram_sample_rate, ) ckpt("ngram_sentences") ngram_edges = graph_util.record_to_bipartite_edges( records=ngram_sentences,
from dask.distributed import Client
from utils import timer
import dask
import json
import dask.bag as db
import os

client = Client("tcp://localhost:8786")

if not os.path.exists("data"):
    os.makedirs('data', exist_ok=True)  # Create data/ directory

b = dask.datasets.make_people()  # Make records of people
b.map(json.dumps).to_textfiles('data/*.json')  # Encode as JSON, write to disk

b = db.read_text('data/*.json').map(json.loads)
b = db.concat([b for _ in range(400)])

with timer("Map/filt/count"):
    m = b.map(lambda record: record['occupation'])
    filt = m.filter(lambda record: len(record) > 6)
    res = m.map(lambda record: len(record))
    res = res.count().compute()
def test_concat_after_map():
    a = db.from_sequence([1, 2])
    b = db.from_sequence([4, 5])
    result = db.concat([a.map(inc), b])
    assert list(result) == [2, 3, 4, 5]
def _get_pairs(
    wikipedia_path=None,
    books_path=None,
    common_crawl_path=None,
    wikipedia_lang='en',
    target_seq_length=128,
    short_seq_prob=0.1,
    blocksize=None,
    num_blocks=None,
    duplicate_factor=5,
    sample_ratio=0.9,
    seed=12345,
    tokenizer=None,
    masking=False,
    masked_lm_ratio=0.15,
):
    vocab_words = tuple(tokenizer.vocab.keys())

    def _to_partition_pairs(partition_documents):
        partition_documents = tuple(partition_documents)
        partition_pairs = []
        for _ in range(duplicate_factor):
            for document_index in range(len(partition_documents)):
                partition_pairs.extend(
                    create_pairs_from_document(
                        partition_documents,
                        document_index,
                        max_seq_length=target_seq_length,
                        short_seq_prob=short_seq_prob,
                        masking=masking,
                        masked_lm_ratio=masked_lm_ratio,
                        vocab_words=vocab_words,
                    ))
        random.shuffle(partition_pairs)
        return partition_pairs

    if num_blocks is not None:
        if blocksize is not None:
            raise ValueError(
                'Only one of num_blocks or blocksize needs to be set!')
        blocksize = estimate_block_size(
            (wikipedia_path, books_path, common_crawl_path),
            num_blocks,
        )

    bags = []
    if wikipedia_path is not None:
        bags.append(
            read_wikipedia(
                wikipedia_path,
                lang=wikipedia_lang,
                blocksize=blocksize,
                sample_ratio=sample_ratio,
                sample_seed=seed,
            ))
    if books_path is not None:
        bags.append(
            read_books(
                books_path,
                blocksize=blocksize,
                sample_ratio=sample_ratio,
                sample_seed=seed,
            ))
    if common_crawl_path is not None:
        bags.append(
            read_common_crawl(
                common_crawl_path,
                blocksize=blocksize,
                sample_ratio=sample_ratio,
                sample_seed=seed,
            ))

    bag_texts = db.concat(bags)
    bag_texts = _shuffle_bag_texts(bag_texts)
    bag_documents = _get_documents(bag_texts, tokenizer)
    return bag_documents.map_partitions(_to_partition_pairs)
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]
    assert c.name == db.concat([a, b]).name
def concat(cls, hashbags):
    return hashbags[0].new(db.concat([hb.bag for hb in hashbags]))
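# Hypothetical minimal context for the classmethod above (the real HashBag
# class is not shown in this snippet): it only assumes that each instance
# wraps a dask bag in `.bag` and that `.new(...)` builds an instance of the
# same class around a different bag.
import dask.bag as db

class HashBag:
    def __init__(self, bag):
        self.bag = bag

    def new(self, bag):
        return type(self)(bag)

    @classmethod
    def concat(cls, hashbags):
        return hashbags[0].new(db.concat([hb.bag for hb in hashbags]))

merged = HashBag.concat([HashBag(db.from_sequence([1])), HashBag(db.from_sequence([2]))])
print(merged.bag.compute())  # [1, 2]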
# Replace bag with result of ckpt, typically with save / load
globals()[name] = dask_checkpoint.checkpoint(
    bag, name=name, checkpoint_dir=checkpoint_dir, **ckpt_kwargs)
if config.HasField("stop_after_ckpt") and config.stop_after_ckpt == name:
    print("Stopping early.")
    exit(0)

##############################################################################
# BEGIN PIPELINE                                                             #
##############################################################################

documents = get_medline_documents(config, download_shared)

if config.HasField("covid_json_dir"):
    documents = dbag.concat([
        documents,
        get_covid_documents(config),
    ])
ckpt("documents")

# Split documents into sentences, filter out too-long and too-short sentences.
sentences = documents.map_partitions(
    text_util.split_sentences,
    # --
    min_sentence_len=config.parser.min_sentence_len,
    max_sentence_len=config.parser.max_sentence_len,
)
ckpt("sentences")

# Add POS tagging, lemmas, entities, and additional data to each sentence
sentences_with_lemmas = sentences.map_partitions(
    text_util.analyze_sentences,