Example #1
def test__read_text(c, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())

        with hdfs.open('%s/text.2.txt' % basedir, 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())

        with hdfs.open('%s/other.txt' % basedir, 'wb') as f:
            f.write('a b\nc d'.encode())

        b = db.read_text('hdfs://%s/text.*.txt' % basedir)
        yield gen.sleep(0.5)
        assert not s.tasks

        import dask
        b.compute(get=dask.get)

        coll = b.str.strip().str.split().map(len)

        future = c.compute(coll)
        yield gen.sleep(0.5)
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = db.read_text('hdfs://%s/other.txt' % basedir)
        b = c.persist(b)
        future = c.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']
Example #2
def test__read_text(e, s, a, b):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())

        with hdfs.open('/tmp/test/text.2.txt', 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())

        with hdfs.open('/tmp/test/other.txt', 'wb') as f:
            f.write('a b\nc d'.encode())

        b = db.read_text('hdfs:///tmp/test/text.*.txt',
                         collection=True)
        yield gen.sleep(0.5)
        assert not s.tasks

        import dask
        b.compute(get=dask.get)

        future = e.compute(b.str.strip().str.split().map(len))
        yield gen.sleep(0.5)
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = db.read_text('hdfs:///tmp/test/other.txt', collection=True)
        b = e.persist(b)
        future = e.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']

        L = db.read_text('hdfs:///tmp/test/text.*.txt', collection=False)
        assert all(isinstance(x, Delayed) for x in L)
Example #3
def test_read_text():
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert set(line.strip() for line in db.read_text(fns)) == \
                set('ABCD')
        assert set(line.strip() for line in db.read_text('a*.log')) == \
                set('ABCD')

    assert raises(ValueError, lambda: db.read_text('non-existent-*-path'))
Example #4
def test_read_text():
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert set(line.strip() for line in db.read_text(fns)) == \
                set('ABCD')
        assert set(line.strip() for line in db.read_text('a*.log')) == \
                set('ABCD')

    assert raises(ValueError, lambda: db.read_text('non-existent-*-path'))
Example #5
def test_read_text_large_gzip():
    with tmpfile('gz') as fn:
        f = GzipFile(fn, 'wb')
        f.write(b'Hello, world!\n' * 100)
        f.close()

        with pytest.raises(ValueError):
            b = db.read_text(fn, blocksize=100, linedelimiter='\n')

        c = db.read_text(fn)
        assert c.npartitions == 1
Example #6
def test_read_text_encoding():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write((u'你好!' + os.linesep).encode('gb18030') * 100)
        b = db.read_text(fn, blocksize=100, encoding='gb18030')
        c = db.read_text(fn, encoding='gb18030')
        assert len(b.dask) > 5
        assert list(map(lambda x: x.encode('utf-8'), b)) == list(map(lambda x: x.encode('utf-8'), c))

        d = db.read_text([fn], blocksize=100, encoding='gb18030')
        assert list(b) == list(d)
Example #7
def test_read_text_encoding():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write((u'你好!' + os.linesep).encode('gb18030') * 100)
        b = db.read_text(fn, blocksize=100, encoding='gb18030')
        c = db.read_text(fn, encoding='gb18030')
        assert len(b.dask) > 5
        assert list(map(lambda x: x.encode('utf-8'), b)) == list(map(lambda x: x.encode('utf-8'), c))

        d = db.read_text([fn], blocksize=100, encoding='gb18030')
        assert list(b) == list(d)
Example #8
def test_read_text_large():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(('Hello, world!' + os.linesep).encode() * 100)
        b = db.read_text(fn, blocksize=100)
        c = db.read_text(fn)
        assert len(b.dask) > 5
        assert list(map(str, b)) == list(map(str, c))

        d = db.read_text([fn], blocksize=100)
        assert list(b) == list(d)
Example #9
def test_read_text_large():
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(('Hello, world!' + os.linesep).encode() * 100)
        b = db.read_text(fn, blocksize=100)
        c = db.read_text(fn)
        assert len(b.dask) > 5
        assert list(map(str, b.str.strip())) == list(map(str, c.str.strip()))

        d = db.read_text([fn], blocksize=100)
        assert list(b) == list(d)
Example #10
def partition_files(fn="map.osm", force_refresh=False):
    """
    parameters
    ----------
    fn: (str) is the name of file that should exist

    returns:
    -------
    a dask bag of python dictionaries

    side-effect:
    -------
    create a single file eg. map_slxml.osm in which every line is a single xml element
    """

    assert (os.path.exists(fn))

    fn_prefix = fn.split('.')[0]

    mod_time_map = os.path.getmtime(fn)
    mod_time_p1 = os.path.getmtime(fn_prefix +
                                   '_partition_0.osm') if os.path.exists(
                                       fn_prefix + '_partition_0.osm') else 0
    if mod_time_map < mod_time_p1 and not force_refresh:
        print("partition files still fresh")
        b = db.read_text(fn_prefix + '_partition_*.osm')
    else:
        print("making fresh partition files 9")

        out_fn = fn_prefix + '_slxml.osm'

        # create file with one element per line
        # IMPORTANT: must write text with 'utf-8' otherwise it will choke
        # trying to write ascii
        with open(out_fn, 'w', encoding='utf-8') as outputf:
            for i, e in enumerate(get_element(fn)):
                # from docs: if encoding is _not_ 'unicode' a byte string will be generated(!)
                s = ET.tostring(e, encoding='unicode')
                s1 = ' '.join(s.strip().split())
                outputf.write(s1 + '\n')
        line_count = i

        num_cores = multiprocessing.cpu_count()
        partition_length = line_count // num_cores + 1
        fsize_bytes = os.path.getsize(out_fn)

        # create multi-partitioned dask bag from file with 1 element per line
        # it seems like we are obliged to strip the line ending...
        b = db.read_text(out_fn, fsize_bytes // num_cores + 1).map(str.strip)

        # create partitioned files from dask bag
        b.to_textfiles(fn_prefix + '_partition_*.osm')

    return b.map(ET.XML).map(element2dict)
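A minimal, self-contained sketch of the same partitioning pattern, under the assumption of a plain line-oriented input file (the path 'elements.txt' and the output pattern are placeholders, not part of the example above):

import os
import multiprocessing

import dask.bag as db


def repartition_by_cores(path='elements.txt'):
    # roughly one partition per CPU core, as in partition_files above
    num_cores = multiprocessing.cpu_count()
    blocksize = os.path.getsize(path) // num_cores + 1
    # read_text splits the file into blocks of about blocksize bytes;
    # strip the trailing newline from each line, as the example does
    bag = db.read_text(path, blocksize=blocksize).map(str.strip)
    # write one file per partition; '*' is replaced by the partition number
    return bag.to_textfiles('elements_partition_*.txt')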
Example #11
def test_read_text_large_gzip():
    with tmpfile('gz') as fn:
        f = GzipFile(fn, 'wb')
        f.write(b'Hello, world!\n' * 100)
        f.close()

        with pytest.raises(ValueError):
            db.read_text(fn, blocksize=100, linedelimiter='\n')

        c = db.read_text(fn)
        assert c.npartitions == 1
Example #12
def test_read_text_large_gzip():
    with tmpfile("gz") as fn:
        f = GzipFile(fn, "wb")
        f.write(b"Hello, world!\n" * 100)
        f.close()

        with pytest.raises(ValueError):
            db.read_text(fn, blocksize=50, linedelimiter="\n")

        c = db.read_text(fn)
        assert c.npartitions == 1
Example #13
def test_read_text_encoding():
    with tmpfile() as fn:
        with open(fn, "wb") as f:
            f.write((u"你好!" + os.linesep).encode("gb18030") * 100)
        b = db.read_text(fn, blocksize=100, encoding="gb18030")
        c = db.read_text(fn, encoding="gb18030")
        assert len(b.dask) > 5
        assert list(b.str.strip().map(lambda x: x.encode("utf-8"))) == list(
            c.str.strip().map(lambda x: x.encode("utf-8")))

        d = db.read_text([fn], blocksize=100, encoding="gb18030")
        assert list(b) == list(d)
Example #14
def test_read_text_large_gzip():
    with tmpfile("gz") as fn:
        data = b"Hello, world!\n" * 100
        f = GzipFile(fn, "wb")
        f.write(data)
        f.close()

        with pytest.raises(ValueError):
            # blocksize is not allowed when the file is compressed
            db.read_text(fn, blocksize=50, linedelimiter="\n")

        c = db.read_text(fn, blocksize=None)
        assert c.npartitions == 1
        assert "".join(c.compute()) == data.decode()
Example #15
def execute_queries(settings, query_parameters, output_path):
    # TOPOLISH: test and doc
    # Create dask delayed object, make rdkit mol object if possible.
    if settings['reactant_db']:
        input_bag = db.read_text(settings['reactant_db'],
                                 blocksize=settings["partition_size"])
        input_bag = input_bag.map(lambda x: Chem.MolFromSmiles(x)).remove(
            lambda x: x is None)
    else:
        print(
            'Warning no reactant database defined, this only works if all reactants are loaded from sequence or file.'
        )

    # Creating a list of product bags and the compute section.
    reactant_bags = []
    for i, query in enumerate(query_parameters):
        # Query and repartition mol database
        if query["from_file"]:
            bag = db.read_text(query['from_file'])
            bag = bag.map(lambda x: Chem.MolFromSmiles(x)).remove(
                lambda x: x is None)
        elif query["from_sequence"]:
            bag = db.from_sequence(query["from_sequence"])
            bag = bag.map(lambda x: Chem.MolFromSmiles(x)).remove(
                lambda x: x is None)
        else:
            bag = construct_query(query, input_bag)
        reactant_bags.append(bag)

    # Compute and enumerate.
    print('Querying database and/or reading input files...\n')
    with ProgressBar():
        reactants_lists = dask.compute(*reactant_bags)
        # reactants_lists = [common.zfill_enum(i, 6) for i in reactants_lists]
        reactants_lists = [helpers.enum(i) for i in reactants_lists]

    # Write reactants to file.
    print('Writing reactants to file...')
    written = []
    for i, reactants_list in enumerate(reactants_lists):
        output_file = output_path + f"reactant{str(i).zfill(2)}.smi"
        written.append(output_file)
        with open(output_file, 'w') as f:
            f.write("canonical_smiles\tindex")
            for line in reactants_list:
                f.write(f"\n{Chem.MolToSmiles(line[0])}\t{line[1]}")

    return reactants_lists
Example #16
def load_data(path, chunks):

    raw_bag = read_text(path) \
                .str.strip() \
                .map(row_to_numpy)

    return da.stack(raw_bag, axis=0)
Example #17
def combine_files(prefix):
    """Takes a prefix and finds all files of the form {prefix}[*1].[*2].txt and combines all the files with [*1] into
    one .txt file with name {prefix}[*1].txt. update 2-03-2020: now also enumerates compounds.

    Parameters
    ----------
    prefix: str

    Returns
    -------
    None
    """
    files = glob.glob(f"{prefix}*_*.txt")
    cluster_prefixes = set(['.'.join(x.split('.')[:-2]) for x in files])
    prefix_dict = {}

    for p in cluster_prefixes:
        prefix_dict[p] = glob.glob(f"{p}.*.txt")

    for k, v in prefix_dict.items():
        bag = db.read_text(v)
        with gzip.open(f'{k}.txt.gz', 'wt') as f:
            lines = [line.strip('\n') for line in bag.compute()]
            content = ''.join([f'{b}\t{a}\n' for a, b in enumerate(lines)])
            f.write(content)
        [os.remove(x) for x in v]
Example #18
    def test_washing_with_dask(self):
        """Bit more elaborate of a test to see if rdkit handles a set of molecules in a consistent way in coming versions
        and to see how dask handles the used chem_functions functions.
        """

        expected = ['CC(C)=CCC/C(C)=C\\CC/C(C)=C\\CO', 'CC12CC(O)C(CC1=O)C2(C)C', 'Oc1cc(C2CCNCC2)on1',
                    'Cn1ncc2cc(CN)ccc21', 'O=C(O)c1cc(Cl)cs1', 'Cc1cc(CN)ncc1Br', 'CO[C@@H](C)[C@@H](N)C(=O)O',
                    'Nc1ccc(Br)c(F)c1[N+](=O)[O-]', 'Cc1ccc(F)c(C#N)n1', 'Cc1ccc(F)c(CN)n1']

        from rdkit import Chem
        from MCR import chem_functions
        import dask.bag as db
        from rdkit import RDLogger, rdBase

        rdBase.DisableLog('rdApp.error')
        RDLogger.DisableLog('rdApp.info')

        bag = db.read_text("../tests/test_data/test_db.smi", blocksize=16e6)
        bag = bag.map(lambda x: Chem.MolFromSmiles(x)).filter(lambda x: x is not None)
        bag = bag.map(chem_functions.remove_salts_mol)
        bag = bag.map(chem_functions.decharge_mol)
        bag = bag.map(chem_functions.get_largest_fragment_mol)
        bag = bag.map(chem_functions.standardize_mol)

        self.assertEqual([Chem.MolToSmiles(x) for x in bag.take(10)], expected)
Example #19
def test_bag():
    b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=2)
    b.compute()
    # b = db.read_text('myfile.txt')
    # b = db.read_text('myfile.*.txt.gz')
    b = db.read_text('myfile.*.csv').str.strip().str.split(',')

    # to_textfiles
    # Write the dask Bag to disk: one file per partition, one element per line.
    def name(i):
        return str(date(2015, 1, 1) + i * timedelta(days=1))

    b.to_textfiles('/path/to/data/*.json.gz', name_function=name)

    b = db.from_sequence([{
        'name': 'Alice',
        'balance': 100
    }, {
        'name': 'Bob',
        'balance': 200
    }, {
        'name': 'Charlie',
        'balance': 300
    }],
                         npartitions=2)
    df = b.to_dataframe()
    df.head()
    pass
Example #20
def get_files(src):
    try:
        b = db.read_text(src).map(json.loads)
    except json.decoder.JSONDecodeError:
        print("bork")

    print(b.take(2))
Example #21
def extract(indir, outdir, nb_vizdir, context_len):
    '''Get all dataset records from solution cells.'''
    logger.info('')
    def nb_to_rec_format(nb):
        return {'contents': json.dumps(nb),
                'repo': nb['metadata']['repo'],
                'path': nb['metadata']['path']}

    nbs = (db.read_text(indir+'/*.jsonl', blocksize='120mib').
           map(json.loads).
           # for compatibility with extraction code above change back to rec format, so
           # we can group by repo
           map(nb_to_rec_format)
           .to_dataframe())

    with ProgressBar():
        nb_cells = nbs.groupby('repo').apply(
            (lambda group: get_solution_cells(group, nb_vizdir, context_len)), meta=object).compute()

        logger.info('num repos %s', len(nb_cells))
        logger.info('num repos at least 1 solution extracted %s', len([c for c in nb_cells if c]))

    nb_cells = nb_cells.tolist()
    cells = [c for n in nb_cells for c in n if n and c]
    logger.info('num extracted solution cells %s', len(cells))
    jdumpl(cells, outdir + '/cells.jsonl')
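A stripped-down sketch of the bag-to-dataframe-to-groupby().apply() pattern used above, with a toy in-memory bag standing in for the *.jsonl inputs (the record fields and the aggregation are placeholders):

import dask.bag as db

records = [{'repo': 'r1', 'path': 'a.ipynb'},
           {'repo': 'r1', 'path': 'b.ipynb'},
           {'repo': 'r2', 'path': 'c.ipynb'}]

ddf = db.from_sequence(records, npartitions=2).to_dataframe()
# meta tells dask the name and dtype of the result, as in the example above
counts = ddf.groupby('repo').apply(len, meta=('n', 'int64')).compute()
print(counts)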
Example #22
def test_gh715():
    bin_data = u'\u20ac'.encode('utf-8')
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(bin_data)
        a = db.read_text(fn)
        assert a.compute()[0] == bin_data.decode('utf-8')
Example #23
def read_text(path, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, hdfs=None, lazy=True, collection=True):
    """ Read text lines from HDFS

    Parameters
    ----------
    path: string
        filename or globstring of files on HDFS
    collection: boolean, optional
        Whether or not to return a high level collection
    lazy: boolean, optional
        Whether or not to start reading immediately

    Returns
    -------
    Dask bag (if collection=True) or Futures or dask values
    """
    warn("hdfs.read_text moved to dask.bag.read_text('hdfs://...')")
    import dask.bag as db
    result = db.read_text('hdfs://' + path, encoding=encoding, errors=errors,
            linedelimiter=lineterminator, hdfs=hdfs, collection=collection)

    executor = default_executor(executor)
    ensure_default_get(executor)
    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result
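As the warning in this wrapper says, the same read can be expressed directly with dask.bag.read_text and an 'hdfs://' path. A hedged sketch of that replacement usage (the scheduler address and path are placeholders, and an HDFS-capable filesystem backend is assumed to be installed):

import dask.bag as db
from distributed import Client

client = Client('127.0.0.1:8786')          # hypothetical scheduler address
bag = db.read_text('hdfs:///tmp/test/*.txt', encoding='utf-8', errors='strict')
bag = client.persist(bag)                  # start work eagerly, like lazy=False above
print(bag.str.strip().take(2))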
Example #24
def test_gh715():
    bin_data = u'\u20ac'.encode('utf-8')
    with tmpfile() as fn:
        with open(fn, 'wb') as f:
            f.write(bin_data)
        a = db.read_text(fn)
        assert a.compute()[0] == bin_data.decode('utf-8')
Example #25
def read_text(path,
              encoding='utf-8',
              errors='strict',
              lineterminator='\n',
              executor=None,
              hdfs=None,
              lazy=True,
              collection=True):
    warn("hdfs.read_text moved to dask.bag.read_text('hdfs://...')")
    import dask.bag as db
    result = db.read_text('hdfs://' + path,
                          encoding=encoding,
                          errors=errors,
                          linedelimiter=lineterminator,
                          hdfs=hdfs,
                          collection=collection)

    executor = default_executor(executor)
    ensure_default_get(executor)
    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result
Example #26
    def load(self):
        if self.ext == "json":
            return db.read_text(self.loc, blocksize=1e7).map(json.loads)
        elif self.ext == "pickle":
            return db.from_sequence(glob(self.loc))
        else:
            raise Exception(self.ext + " ChunkedFile not supported")
Example #27
def test_gh715():
    bin_data = u"\u20ac".encode("utf-8")
    with tmpfile() as fn:
        with open(fn, "wb") as f:
            f.write(bin_data)
        a = db.read_text(fn)
        assert a.compute()[0] == bin_data.decode("utf-8")
Example #28
def preprocess(jsonl):
    """Ingest data and preprocess comment text"""
    bag = (
        db.read_text(jsonl, blocksize="10MiB")
        .map(json.loads)
        .map(
            lambda r: {
                "created_utc": r["created_utc"],
                "subreddit": r["subreddit"],
                "text": regex_replace(r["body"]),
            }
        )
    )
    df = bag.to_dataframe()

    df = df[df["text"].str.len() > 30]
    df["created_utc"] = dd.to_datetime(df["created_utc"], unit="s")

    ## dask and spacy multiprocessing don't play nicely together
    ## nlp.pipe might not be the fastest way to preprocess
    df = df.compute()
    df["tokens"] = tokenize(df["text"].astype("unicode"))

    df = df[df["tokens"] != ""]
    df = df.drop("text", axis=1)

    return df
Example #29
def test_to_textfiles_name_function_preserves_order():
    seq = [
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
        "i",
        "j",
        "k",
        "l",
        "m",
        "n",
        "o",
        "p",
    ]
    b = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        b.to_textfiles(dn)

        out = (db.read_text(os.path.join(
            dn, "*"), encoding="ascii").map(str).map(str.strip).compute())
        assert seq == out
Example #30
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--from_dir_prefix')
    parser.add_argument('-t', '--to_dir_prefix')
    parser.add_argument('-u',
                        '--path2udp_model',
                        default='russian-syntagrus-ud-2.0-170801.udpipe')
    parser.add_argument('-n', '--cpu_n', default=5, type=int)
    args = parser.parse_args()
    # dfunc.MODELFILE4UDPIPE = args.path2udp_model
    # dfunc.set_model(args.path2udp_model)
    with dask.config.set(pool=ThreadPool(args.cpu_n)):
        bag = db.read_text(args.from_dir_prefix)
        pbar = ProgressBar()
        pbar.register()
        ddf = bag.to_dataframe(columns=['text'])
        ddf['text'] = ddf['text'].apply(dfunc.skip_empty, meta=('x', 'f8'))
        ddf = ddf.dropna()
        ddf['rec'] = ddf['text'].apply(dfunc.get_rec_info, meta=('x', 'f8'))
        ddf['text'] = ddf['text'].apply(dfunc.spec_tok_add, meta=('x', 'f8'))
        ddf['norm_text'] = ddf['text'].apply(dfunc.normalization1,
                                             meta=('x', 'f8'))
        # udpipe_sent_and_tok = dfunc.get_udpipe_sent_and_tok(args.path2udp_model)
        # udpipe_sent_and_tok = dfunc.get_udpipe_sent_and_tok('/home/den/Documents/elmo/data_preparing/rutwitter/russian-syntagrus-ud-2.0-170801.udpipe')
        ddf['norm_text'] = ddf['text'].apply(dfunc.udpipe_sent_and_tok,
                                             meta=('x', 'f8'))
        # ddf['norm_text'] = ddf['text'].apply(dfunc.nltk_sent_and_tok, meta=('x', 'f8'))
        ddf['norm_text'] = ddf['norm_text'].apply(dfunc.normalization2,
                                                  meta=('x', 'f8'))
        ddf['rec_text'] = ddf.apply(dfunc.recovery, meta=('x', 'f8'), axis=1)
        ddf['cleaned_text'] = ddf['norm_text'].apply(dfunc.lower_case,
                                                     meta=('x', 'f8'))
        ddf[['rec_text', 'cleaned_text']].to_csv(args.to_dir_prefix)
Example #31
def filter_cells_nl_above_dataframe(cells_indir, cells_outdir, nbs_indir, max_dist):
    '''Filter cells if markdown is more than max_dist away.'''
    shutil.rmtree(cells_outdir, ignore_errors=True)
    Path(cells_outdir).mkdir(exist_ok=True)

    logging.info(f'Num cells before nl dist %s filter %s', max_dist, db.read_text(cells_indir+'/*.jsonl').count().compute())


    # this is required by dask
    cells_meta= {'cell_type': str,
                 'execution_count': int,
                 'metadata': object,
                 'outputs': object,
                 'source': str,
                 'og_cell': object,
                 'nb_index': int}
    # nb_index is added by the add_key function so important to have it
    nb_meta= {'cells': object, 'metadata': object, 'nbformat': int, 'nbformat_minor': int,
              'nb_index': int}

    cells_df = db.read_text(cells_indir +'/*.jsonl').map(json.loads).map(lambda js: add_key(js,cell=True)).to_dataframe(meta=cells_meta)
    nbs_df = db.read_text(nbs_indir +'/*.jsonl').map(json.loads).map(lambda js: add_key(js)).to_dataframe(meta=nb_meta)

    def get_cell(row):
        '''Return the original cell if a markdown (nl) cell lies within max_dist above it, otherwise an empty dict.'''
        cells = row.cells
        cell_index = row.metadata_cell['cell_index']
        reversed_cells_before = reversed(cells[:cell_index])
        # iterate over all cells above starting with one directly above
        for i, c in enumerate(reversed_cells_before):
            if i+1 <= max_dist and is_markdown(c):
                return row.og_cell
        # print(row.og_cell['metadata']['nb_orig_url'])
        return {}



    with ProgressBar(minimum=15):
        (cells_df.merge(nbs_df, on='nb_index', suffixes=['_cell', '_nb'])
     .apply(get_cell, meta=object, axis=1).to_bag()
     .filter(lambda js: 'source' in js)
     .map(json.dumps).to_textfiles(cells_outdir + '/*.jsonl'))



    logging.info(f'Num cells after nl dist %s filter %s', max_dist, db.read_text(cells_outdir+'/*.jsonl').count().compute())
Example #32
def get_code_context_records(cells_indir, cells_outdir, nbs_indir, context_len,
                             max_tokens):
    '''Convert cells into the dataset format where each record will store the
    context/code pairs.'''
    logger.info('')
    shutil.rmtree(cells_outdir, ignore_errors=True)
    Path(cells_outdir).mkdir(exist_ok=True)

    # this is required by dask
    cells_meta = {
        'cell_type': str,
        'execution_count': int,
        'metadata': object,
        'outputs': object,
        'source': str,
        'og_cell': object,
        'nb_index': int
    }
    # nb_index is added by the add_key function so important to have it
    nb_meta = {
        'cells': object,
        'metadata': object,
        'nbformat': int,
        'nbformat_minor': int,
        'nb_index': int
    }

    # add nb_index key first to be able to join
    cells_df = db.read_text(cells_indir + '/*.jsonl').map(
        json.loads).map(lambda js: add_key(js, cell=True)).to_dataframe(
            meta=cells_meta)
    nbs_df = db.read_text(nbs_indir + '/*.jsonl').map(
        json.loads).map(lambda js: add_key(js)).to_dataframe(meta=nb_meta)

    with ProgressBar(minimum=15):
        # join each cell with nb to compute dataset record
        (cells_df.merge(nbs_df, on='nb_index',
                        suffixes=['_cell',
                                  '_nb']).apply(compute_dataset_record,
                                                context_len=context_len,
                                                max_tokens=max_tokens,
                                                meta=object,
                                                axis=1).to_bag()
         # records with len greater than max tokens will be none so we filter for valid records
         .filter(lambda js: js and js['code_tokens']).map(
             json.dumps).to_textfiles(cells_outdir + '/*.jsonl'))
Example #33
def test_to_textfiles_name_function_preserves_order():
    seq = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p']
    b = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        b.to_textfiles(dn)

        out = db.read_text(os.path.join(dn, "*"), encoding='ascii').map(str).map(str.strip).compute()
        assert seq == out
Example #34
def test_to_textfiles_name_function_preserves_order():
    seq = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p']
    b = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        b.to_textfiles(dn)

        out = db.read_text(os.path.join(dn, "*"), encoding='ascii').map(str).map(str.strip).compute()
        assert seq == out
Example #35
def test_read_text_compression(e, s, a, b):
    import dask.bag as db
    b = db.read_text('s3://distributed-test/csv/gzip/*', compression='gzip',
                     blocksize=None, storage_options=dict(anon=True))
    result = yield e.compute(b)._result()
    assert result == [line + '\n' for k in sorted(csv_files)
                                  for line in csv_files[k].decode().split('\n')
                                  if line]
Example #36
def filter_boilerplate(input_bucket, output_bucket):

    boilerplate_dataframe_path = '/home/romanell/Downloads/bp.pkl'
    t = Timer()
    bp_df = from_pandas(pd.read_pickle(boilerplate_dataframe_path), chunksize=10000).reset_index().persist()

    nps = list_newspapers(input_bucket)

    for np in nps:
        passim_rebuilt_files = fixed_s3fs_glob(f'{os.path.join(input_bucket, np)}/*.bz2')

        # keep the number of output files equal to the number of input files
        n_partitions = len(passim_rebuilt_files)
        print(f'Crunching {np}: {len(passim_rebuilt_files)} files')

        # detect whether the current item has already been processed
        existing_files = fixed_s3fs_glob(f'{output_bucket}{np}*.bz2')

        # skip newspapers that don't need to be processed
        if np == 'NZZ':
            print('NZZ, skipping')
            continue
        elif len(existing_files) > 0:
            print(f'{np} already done, move on')
            continue

        passim_data_df = (
            db.read_text(passim_rebuilt_files, storage_options=IMPRESSO_STORAGEOPT)
            .map(json.loads)
            .map(lambda d: {'id': d['id'], 'document': d})
            .to_dataframe()
            .set_index('id')
            .persist()
        )

        np_bp_df = bp_df[bp_df.id.str.contains(np)].set_index('id').compute()

        tmp_df = passim_data_df.join(np_bp_df, how='outer')

        filtered_df = tmp_df[tmp_df.is_boilerplate.isnull()]

        output_files = [
            f'{output_bucket}{np}-{str(n+1).zfill(4)}.jsonl.bz2' for n, f in enumerate(passim_rebuilt_files)
        ]

        future = (
            filtered_df.reset_index()
            .to_bag()
            .map(lambda i: i[1])
            .map(json.dumps)
            .repartition(n_partitions)
            .to_textfiles(output_files, storage_options=IMPRESSO_STORAGEOPT)
        )

        print(f'Written {len(output_files)} output files; first five: {output_files[:5]}')

        print(f'Done with {np}. It took: {t.tick()}')
        print('------------------------------------')
Example #37
def test_read_text_sync(loop):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/data.txt', 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop):
                b = db.read_text('hdfs:///tmp/test/*.txt')
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']
Example #38
def test_from_s3():
    # note we don't test connection modes with aws_access_key and
    # aws_secret_key because these are not on travis-ci
    pytest.importorskip('s3fs')

    five_tips = (u'total_bill,tip,sex,smoker,day,time,size\n',
                 u'16.99,1.01,Female,No,Sun,Dinner,2\n',
                 u'10.34,1.66,Male,No,Sun,Dinner,3\n',
                 u'21.01,3.5,Male,No,Sun,Dinner,3\n',
                 u'23.68,3.31,Male,No,Sun,Dinner,2\n')

    # test compressed data
    e = db.read_text('s3://tip-data/t*.gz', storage_options=dict(anon=True))
    assert e.take(5) == five_tips

    # test all keys in bucket
    c = db.read_text('s3://tip-data/*', storage_options=dict(anon=True))
    assert c.npartitions == 4
Example #39
def one_func_max_api_seq(cells_indir, cells_outdir, max_api_seq_len,
                         min_api_seq_len, code_key):
    # assert '/scratch/jupyter-pipeline' in cells_outdir or cells_outdir.startswith('/tmp')
    shutil.rmtree(cells_outdir, ignore_errors=True)
    Path(cells_outdir).mkdir(exist_ok=True)

    bag = db.read_text(cells_indir + '/*.jsonl').map(json.loads)
    kernel_type = \
        bag.map(lambda cell: logic_type(cell, max_api_seq_len, min_api_seq_len, code_key)) \
            .frequencies() \
            .topk(k=50, key=lambda tup: tup[1])

    logger.info('Counts of function/class type %s', kernel_type.compute())

    with ProgressBar(minimum=15):
        db.read_text(cells_indir +'/*.jsonl').map(json.loads) \
        .filter(lambda cell: logic_type(cell, max_api_seq_len, min_api_seq_len, code_key) in ['1 function', 'pure logic', 'boilerplate']) \
        .map(json.dumps).to_textfiles(cells_outdir+'/*.jsonl')
Example #40
def test_read_text_sync(loop):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/data.txt' % basedir, 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Client(s['address'], loop=loop) as e:
                b = db.read_text('hdfs://%s/*.txt' % basedir)
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']
Example #41
def test_from_s3():
    # note we don't test connection modes with aws_access_key and
    # aws_secret_key because these are not on travis-ci
    s3fs = pytest.importorskip('s3fs')

    five_tips = (u'total_bill,tip,sex,smoker,day,time,size\n',
                 u'16.99,1.01,Female,No,Sun,Dinner,2\n',
                 u'10.34,1.66,Male,No,Sun,Dinner,3\n',
                 u'21.01,3.5,Male,No,Sun,Dinner,3\n',
                 u'23.68,3.31,Male,No,Sun,Dinner,2\n')

    # test compressed data
    e = db.read_text('s3://tip-data/t*.gz', storage_options=dict(anon=True))
    assert e.take(5) == five_tips

    # test all keys in bucket
    c = db.read_text('s3://tip-data/*', storage_options=dict(anon=True))
    assert c.npartitions == 4
Example #42
def test_read_text_sync(loop):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/data.txt', 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop):
                b = db.read_text('hdfs:///tmp/test/*.txt')
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']
Example #43
def test_read_text_sync(loop):
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/data.txt' % basedir, 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Client(('127.0.0.1', s['port']), loop=loop):
                b = db.read_text('hdfs://%s/*.txt' % basedir)
                assert list(b.str.strip().str.upper()) == ['HELLO', 'WORLD']
Example #44
def test__read_text_json_endline(e, s, a):
    import json
    with make_hdfs() as (hdfs, basedir):
        with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
            f.write(b'{"x": 1}\n{"x": 2}\n')

        b = db.read_text('hdfs://%s/text.1.txt' % basedir).map(json.loads)
        result = yield e.compute(b)._result()

        assert result == [{"x": 1}, {"x": 2}]
Example #45
def test__read_text_unicode(e, s, a, b):
    data = b'abcd\xc3\xa9'
    with make_hdfs() as (hdfs, basedir):
        fn = '%s/data.txt' % basedir
        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = db.read_text('hdfs://' + fn, collection=False)
        result = yield e.compute(f[0])._result()
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0].strip()) == 5
Example #46
def read_text(path, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, hdfs=None, lazy=True, collection=True):
    warn("hdfs.read_text moved to dask.bag.read_text('hdfs://...')")
    import dask.bag as db
    result = db.read_text('hdfs://' + path, encoding=encoding, errors=errors,
            linedelimiter=lineterminator, hdfs=hdfs, collection=collection)

    executor = default_executor(executor)
    ensure_default_get(executor)
    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)

    return result
Example #47
def read_text(fn, keyname=None, encoding='utf-8', errors='strict',
        lineterminator='\n', executor=None, fs=None, lazy=True,
        collection=True, blocksize=2**27, compression=None):
    warn("distributed.s3.read_text(...) Moved to "
         "dask.bag.read_text('s3://...')")
    if keyname is not None:
        if not keyname.startswith('/'):
            keyname = '/' + keyname
        fn = fn + keyname
    import dask.bag as db
    result = db.read_text('s3://' + fn, encoding=encoding, errors=errors,
            linedelimiter=lineterminator, collection=collection,
            blocksize=blocksize, compression=compression, s3=fs)
    executor = default_executor(executor)
    ensure_default_get(executor)
    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)
    return result
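The S3 wrapper above likewise just forwards to dask.bag.read_text with an 's3://' path. A hedged sketch of the direct call, using the anonymous-access storage_options seen in the S3 test examples (the bucket path is a placeholder):

import dask.bag as db

bag = db.read_text('s3://some-public-bucket/logs/*.gz',
                   blocksize=None,                  # required for compressed keys
                   storage_options=dict(anon=True))
print(bag.take(3))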
Example #48
import os
import json

import dask.bag as db


bag = db.read_text(os.path.join('..', 'data', 'accounts.*.json.gz'))
js = bag.map(json.loads)


def sum_amount(total, user):
    transactions = user['transactions']
    return total + sum(transaction['amount'] for transaction in transactions)


js.foldby(key='name',
          binop=sum_amount,
          initial=0,
          combine=lambda x, y: x + y,
          combine_initial=0).compute()
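A runnable toy version of the same foldby pattern, using an in-memory sequence so it does not depend on the accounts.*.json.gz files (the records below are made up):

import dask.bag as db

users = [
    {'name': 'Alice', 'transactions': [{'amount': 1}, {'amount': 2}]},
    {'name': 'Bob',   'transactions': [{'amount': 5}]},
    {'name': 'Alice', 'transactions': [{'amount': 3}]},
]


def sum_amount(total, user):
    return total + sum(t['amount'] for t in user['transactions'])


bag = db.from_sequence(users, npartitions=2)
totals = bag.foldby(key='name',
                    binop=sum_amount,
                    initial=0,
                    combine=lambda x, y: x + y,
                    combine_initial=0).compute()
# e.g. [('Alice', 6), ('Bob', 5)] (ordering of keys may vary)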
Example #49
def bag_to_iterator(x, **kwargs):
    return db.read_text([tf.path for tf in x])