def test_parses_figures_to_superpositions():
    """Check that parsing the figures yields the expected superpositions.

    Both the valid and the flawed fixtures are concatenated and compared
    as tuples.
    """
    fixture_superpositions = (
        Superpositions.of_valid_figures(),
        Superpositions.of_flawed_figures(),
    )
    expected_superpositions = tuple(concat(fixture_superpositions))

    fixture_figures = (Figures.valid(), Figures.flawed())
    parsed = superpositions_from_figures(concat(fixture_figures))
    found_superpositions = tuple(parsed)

    assert expected_superpositions == found_superpositions
Beispiel #2
0
def parser(filename, *args, **kwargs):
    """Build a directed graph from ``filename``.

    ``filename`` is passed through ``c_open(mode='r')``; every item of the
    result is stripped, split on ``','`` and the resulting sequences are
    handed to ``DiGraph.add_edges_from``. Extra positional and keyword
    arguments are accepted but not used.

    NOTE(review): presumably ``c_open`` is a curried ``open`` and the items
    are text lines of the form ``source,target`` - confirm against its
    definition.
    """
    graph = nx.DiGraph()
    stages = (
        c_open(mode='r'),
        c.map(str.strip),
        c.map(c_split(sep=',')),
        graph.add_edges_from,
    )
    tz.pipe(filename, *stages)
    return graph
Beispiel #3
0
def process(paths, load_, transform_, filter_, sink_):
    """ Generic pipeline applied to every input path

    Each path is threaded through the four stages in order: the output of
    one stage is the input of the next.

    :param paths: iterable of input paths
    :param load_: data loading function, receives the path
    :param transform_: transformation function, receives the loaded data
    :param filter_: filter function, receives the transformed data
    :param sink_: output function, receives the filtered data
    :return: None; the result of ``sink_`` is discarded
    """
    stages = (load_, transform_, filter_, sink_)
    for path in paths:
        pipe(path, *stages)
def do_localizer_block(event_listener, target):
    """Run one localizer block and return its metadata and trial records.

    Stimuli are drawn from the faces or the houses resource depending on
    ``target``; one stimulus is taken per orientation returned by
    ``get_stim_1_orientations``. The non-target list passed to
    ``do_trials`` is filled with ``None``.

    Returns a pair of dicts: block-level info (start/end time and target
    name) and per-trial data (onsets, decisions, RTs, ITIs, stimulus
    orientation and id).
    """
    start = time.time()
    target_is_face = target == BlockTarget.FACE
    stim_orientations = get_stim_1_orientations()

    if target_is_face:
        source = res.faces()
    else:
        source = res.houses()

    n_stims = len(stim_orientations)
    stim_list = pipe(random_elem(source), take(n_stims), list)

    # Index each stimulus by its orientation; the other category gets
    # placeholders of the same length.
    oriented_stims = lmap(lambda ori, stim: stim[ori],
                          stim_orientations,
                          stim_list)
    placeholders = [None] * len(stim_list)
    face_list, house_list = flip_if(not target_is_face,
                                    (oriented_stims, placeholders))

    trial_results = do_trials(event_listener, face_list, house_list)
    display_onsets, decisions, decision_onsets, RTs, ITIs = trial_results

    block_info = {
        "time": (start, time.time()),
        "target": target.name
    }
    trial_data = {
        "presentations_onset": display_onsets,
        "decision_onset": decision_onsets,
        "decision": [ori.name if ori else "None" for ori in decisions],
        "RT": RTs,
        "following_ITI": ITIs,
        "stim_orientation": [ori.name for ori in stim_orientations],
        "stim_id": [stim.name for stim in stim_list]
    }
    return block_info, trial_data
Beispiel #5
0
def main(input_path: str, do_delete: bool, raw_exts: Tuple[str],
         processed_exts: Tuple[str]) -> None:
    """Find raw images without a processed counterpart and remove them.

    The directory at ``input_path`` is walked for files with any of the raw
    or processed extensions. A raw image whose numeric index cannot be found
    among the processed images is marked for removal.

    Any sequence of 3 to 6 (inclusive) digits in the image filename is
    treated as its index, which associates processed and raw images.

    A processed image may carry a range of indexes, e.g. IMG_01234-1236.jpg.
    That file is associated with IMG_01234.cr3, IMG_01235.cr3 and
    IMG_01236.cr3, so none of those raw images is deleted. This is useful for
    HDR or panoramic processed images.

    When ``do_delete`` is false the ``fake_deleter`` stage is used instead of
    ``deleter``.
    """
    all_exts = list(raw_exts + processed_exts)
    walk = directory_walker(all_exts)
    select_orphans = purge(list(raw_exts), indexer)
    remove = deleter if do_delete else fake_deleter

    pipe(input_path, walk, select_orphans, remove)
Beispiel #6
0
    def process_node(node):
        nonlocal current_base
        children = node.get('children')
        if children:
            # Sub-total bar
            new_children = pipe(children, map(process_node), filter(None),
                                list)
            if new_children:
                return merge(
                    node, {
                        'bar_type': 'sub_total',
                        'base': new_children[0]["base"],
                        'children': new_children,
                        'value': sum(child["value"] for child in new_children),
                    })
        elif node["value"]:
            # Value bar
            value_bar = merge(node, {
                'bar_type': "value",
                'base': current_base,
            })
            current_base = current_base + node["value"]
            return value_bar

        # Leaf node with value of 0, skip
        return None
Beispiel #7
0
def entity_cooccurrence(entities, tweets, vocab):
    """Creates a cooccurrence matrix of entities in tweets

    ``None`` tweets are skipped. For each remaining tweet the selected
    entities are flattened and de-duplicated, restricted to terms present in
    ``vocab``, and expanded into all ordered pairs; the pairs of all tweets
    are then encoded with ``vocab`` and passed to ``co_matrix``.

    :param entities: List of entity types to include
    :param tweets: Iterator of tweets
    :param vocab: Dictionary mapping terms to index
    :returns: A sparse matrix with the occurrences

    >>> # requires a vocab dictionary, which can be
    >>> # created from the `entity_counts`.
    >>>
    >>> tweets = prep_tweets(tweets)
    >>>
    >>> counts = entity_counts(['urls', 'hashtags'], tweets)
    >>> terms = {k for k, v in counts.items() if v > 5}
    >>> vocab = {k:i for i,k in enumerate(terms)}
    >>>
    >>> entity_cooccurrence(['urls', 'hashtags'], tweets, vocab)

    """

    def is_present(tweet):
        return tweet is not None

    def known_terms(terms):
        return [term for term in terms if term in vocab]

    def ordered_pairs(terms):
        return permutations(terms, 2)

    return pipe(
        tweets,
        filter(is_present),
        map(select_ents(entities)),
        map(flatten),
        map(uniq),
        map(known_terms),
        map(ordered_pairs),
        flatten,
        partial(encode_tuples, vocab),
        co_matrix,
    )
Beispiel #8
0
def prep_tweets(tweets):
    """Prepares tweets for entity analysis

    This function treats retweets as tweets and
    removes all duplicates. Thus, if tweet A
    is retweeted 10 times in the corpus,
    it will only show up once in the
    tweets returned by prep_tweets.

    After de-duplication each tweet goes through ``handle_truncated`` and
    ``simplify_entities`` and is reduced to its ``'entities'`` value.

    :param tweets: iterator of tweets
    :returns: generator of entities

    >>> raw_tweets = [{'id': 2345, 'entities': [], ...}
    >>>               {'id': 9874, 'entities': [], ...}]
    >>>
    >>> tweets = prep_tweets(raw_tweets)
    """

    return pipe(
        tweets,
        map(replace_retweets),
        deduplicate,
        map(handle_truncated),
        map(simplify_entities),
        map(get_in(['entities'])),
    )
Beispiel #9
0
def compute_down(expr, data, **kwargs):
    """ Compile a blaze expression to a sparksql expression

    The expression must have exactly one leaf and reference at least one
    table of it. Each table subterm is replaced by a fresh symbol, the
    rewritten expression is computed against sqlalchemy tables, and the
    resulting query is rendered with literal parameters for the Hive dialect
    and executed through ``data.sql``.

    Raises ``ValueError`` when there is not exactly one leaf or when no
    table is referenced.
    """
    leaves = expr._leaves()
    if len(leaves) != 1:
        raise ValueError('Must compile from exactly one root database')
    (leaf,) = leaves

    # field expressions on the database are Field instances with a record
    # measure whose immediate child is the database leaf
    subterms = expr._subterms()
    tables = list(filter(istable(leaf))(subterms))
    if not tables:
        raise ValueError('Expressions not referencing a table cannot be '
                         'compiled')

    # one new symbol per table, substituted into the expression
    new_leaves = [symbol(table._name, table.dshape) for table in tables]
    substitutions = dict(zip(tables, new_leaves))
    expr = expr._subs(substitutions)

    # compute using sqlalchemy
    sa_tables = map(make_sqlalchemy_table, tables)
    scope = dict(zip(new_leaves, sa_tables))
    query = compute(expr, scope)

    # interpolate params
    compiled = literalquery(query, dialect=HiveDialect())
    sql_text = str(compiled)
    return data.sql(sql_text)
Beispiel #10
0
def gender_from_bam(bam_path, prefix=''):
    """Predict the gender from a BAM alignment file.

    The averaged output of the BAM reader over one fixed interval on X and
    one on Y is fed to ``predict_gender``.

    Args:
      bam_path (path): path to a BAM alignment file
      prefix (str, optional): string to prefix to 'X', 'Y'

    Returns:
      Gender: tuple of X coverage, Y coverage, and sex prediction

    Examples:
      >>> gender_from_bam('alignment.bam', prefix='chr')
      Gender(x_coverage=123.31, y_coverage=0.13, sex='female')
    """
    # setup: connect to a BAM file
    bam = BamFile(bam_path)

    # step 0: fake some BED interval rows (already 1,1-based!)
    x_interval = ("%sX" % prefix, 1, 59373566)
    y_interval = ("%sY" % prefix, 69362, 11375310)
    fake_bed_rows = [x_interval, y_interval]

    # step 1: read each interval and average it
    read_interval = map(lambda interval: bam(*interval))
    averages = pipe(fake_bed_rows, read_interval, map(average))

    # step 2: make the prediction
    x_coverage, y_coverage = list(averages)
    sex = predict_gender(x_coverage, y_coverage)
    return Gender(x_coverage, y_coverage, sex)
Beispiel #11
0
def md_link_to_html(txt, other_browser=True):
    """
    In txt, change Markdown formated links to HTML.
    (Only links are touched)

    Every link found by ``parse_links`` is rebuilt both as Markdown and as
    HTML, and each Markdown form is replaced by its HTML form in ``txt``.

    >>> md_link_to_html("derp ferp (f) [t](l).")
    'derp ferp (f) <a target="_blank" href="l">t</a>.'

    >>> md_link_to_html("derp ferp (f) [t](l).", False)
    'derp ferp (f) <a  href="l">t</a>.'
    """

    # Because it's unlikely that links are duplicated, don't worry about
    # duplicates. Also, md_link_to_html is idempotent. And, this is only intended
    # for small local jobs so it will be plenty fast.

    def both_forms(tandl):
        # tandl is indexed as (text, link)
        return {
            "html": build_html_link(tandl[0], tandl[1], other_browser),
            "md": build_md_link(tandl[0], tandl[1])
        }

    def replacer_for(link):
        return lambda text: text.replace(link['md'], link['html'])

    def combined_replacer(links):
        # One function that applies every single-link replacement.
        return tlz.compose(*map(replacer_for, links))

    return tlz.pipe(
        txt,
        parse_links,
        ctlz.map(both_forms),
        combined_replacer,
        lambda replace_all: replace_all(txt),
    )
Beispiel #12
0
def test_dict_to_json():
    """Test to_json
    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_data_with_values(10)
    # Start unbound-safe: if to_json itself fails there is no file to remove,
    # and the cleanup must not mask the real error with a NameError.
    filename = None
    try:
        result1 = pipe(data, to_json)
        filename = result1["url"]
        result2 = pipe(data, to_json)
        output = pd.read_json(filename).to_dict(orient="records")
    finally:
        if filename is not None:
            os.remove(filename)

    assert result1 == result2
    assert data == {"values": output}
Beispiel #13
0
def test_dataframe_to_json():
    """Test to_json
    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_dataframe(10)
    # Start unbound-safe: if to_json itself fails there is no file to remove,
    # and the cleanup must not mask the real error with a NameError.
    filename = None
    try:
        result1 = pipe(data, to_json)
        filename = result1["url"]
        result2 = pipe(data, to_json)
        output = pd.read_json(filename)
    finally:
        if filename is not None:
            os.remove(filename)

    assert result1 == result2
    assert output.equals(data)
Beispiel #14
0
def get_comment(identifier):
    """Return the exported post for ``identifier`` after key/metadata cleanup.

    The export is passed through ``strip_dot_from_keys`` and then
    ``safe_json_metadata``. ``PostDoesNotExist`` is suppressed, in which
    case the function falls through and returns ``None``.
    """
    with suppress(PostDoesNotExist):
        exported = Post(identifier).export()
        return pipe(exported, strip_dot_from_keys, safe_json_metadata)
Beispiel #15
0
    def plot(self,
             gpu_measurement='sm',
             num_gpus=1,
             plot_width=600,
             plot_height=400,
             y_range=(0, 110)):
        """ Plot the specified GPU measurement

        The log file is parsed, the requested measurement is extracted and
        the result is handed to the module-level ``plot`` function (inside
        this body the name ``plot`` does not refer to this method).

        Parameters
        ----------
        gpu_measurement: GPU measurement to plot, possible values
            ['pwr', 'temp', 'sm', 'mem', 'enc', 'dec', 'mclk', 'pclk']
        num_gpus: Number of GPUs to plot
        plot_width: forwarded unchanged to ``plot``
        plot_height: forwarded unchanged to ``plot``
        y_range: forwarded unchanged to ``plot``

        Returns
        -------
        Bokeh Figure
        """
        figure_options = dict(num_gpus=num_gpus,
                              plot_width=plot_width,
                              plot_height=plot_height,
                              y_range=y_range)
        df = pipe(self._log_file, parse_log, extract(gpu_measurement))
        return plot(df, **figure_options)
Beispiel #16
0
def overlap_internal(x, axes):
    """ Share boundaries between neighboring blocks

    Parameters
    ----------

    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes input informs how many cells to overlap between neighboring blocks
    {0: 2, 2: 5} means share two cells in 0 axis, 5 cells in 2 axis.
    A depth may also be a tuple, read as (left depth, right depth).

    Returns a new Array whose graph holds one ``getitem-`` task per expanded
    key and one ``overlap-`` concatenation task for every key whose
    fractional slice equals the key itself.
    """
    dims = [len(bds) for bds in x.chunks]
    expand_key2 = partial(expand_key, dims=dims, axes=axes)

    # Make keys for each of the surrounding sub-arrays
    interior_keys = pipe(x.__dask_keys__(), flatten, map(expand_key2),
                         map(flatten), concat, list)

    name = 'overlap-' + tokenize(x, axes)
    getitem_name = 'getitem-' + tokenize(x, axes)
    interior_slices = {}
    overlap_blocks = {}
    for k in interior_keys:
        source_key = (x.name, ) + k
        getitem_key = (getitem_name, ) + k
        frac_slice = fractional_slice(source_key, axes)
        if source_key != frac_slice:
            interior_slices[getitem_key] = frac_slice
            continue
        # The key maps to itself: alias it and build the concatenation of
        # its expanded neighbourhood.
        interior_slices[getitem_key] = source_key
        neighbourhood = expand_key2((None, ) + k, name=getitem_name)
        overlap_blocks[(name, ) + k] = (
            concatenate3,
            (concrete, neighbourhood),
        )

    chunks = []
    for i, bds in enumerate(x.chunks):
        depth = axes.get(i, 0)
        if isinstance(depth, tuple):
            left_depth, right_depth = depth[0], depth[1]
        else:
            left_depth = right_depth = depth

        if len(bds) == 1:
            # A single block along this axis keeps its size.
            chunks.append(bds)
            continue

        grown = [bds[0] + left_depth]
        grown.extend(bd + left_depth + right_depth for bd in bds[1:-1])
        grown.append(bds[-1] + right_depth)
        chunks.append(grown)

    dsk = merge(interior_slices, overlap_blocks)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])

    return Array(graph, name, chunks, meta=x)
Beispiel #17
0
    def get_discrete_split_value(arr: np.ndarray, y: np.ndarray,
                                 eval_func: Callable):
        """
        Function to get the value of making a discrete split.

        Parameter:
        ----------
        arr : np.ndarray
            The feature array

        y : np.ndarray
            The target array

        eval_func : Callable
            The function to evaluate the splits.

        Returns:
        --------
        A 2-tuple: the average of ``eval_func`` over the groups of ``y``
        defined by each distinct value of ``arr``, weighted by group share,
        and the intrinsic value ``-sum(p * log(p))`` over those shares,
        which penalizes many splits.
        """
        total = len(y)

        # First element: weighted average eval_func of the split
        weighted_eval = sum([
            eval_func(y[arr == value]) * np.sum(arr == value) / total
            for value in set(arr)
        ])

        # Second element: intrinsic value to penalize many splits
        shares = [np.sum(arr == value) / total for value in set(arr)]
        intrinsic_value = -1 * sum([share * np.log(share) for share in shares])

        return (weighted_eval, intrinsic_value)
Beispiel #18
0
def cli(board_source, key, token, to, output, board):
    """Hi, I'm TrelloScribe. I take Trello boards and turn them into documents!"""
    # Two lookup tables select the reading and the converting step from the
    # chosen options. Note toolz.compose() works right to left.
    read_phase = dict(
        id=download_board(key, token),
        name=toolz.compose(download_board(key, token),
                           search_boards(key, token)),
        file=read_board,
    )
    convert_phase = dict(
        raw=partial(json.dumps, indent=2),
        md=ast_to_md,
        html=toolz.compose(md_to_html, ast_to_md),
    )

    read = read_phase[board_source]
    convert = convert_phase[to]
    write = partial(click.echo, file=output)

    # board -> raw board -> AST -> document text -> output
    toolz.pipe(board, read, trello_to_ast, convert, write)
Beispiel #19
0
def ccds_to_bed(ccds_stream):
  """Convert CCDS dump to Chanjo-style BED stream.

  Main entry point for default Chanjo converter (ccds). It converts
  a sorted (start, chrom) CCDS database to the Chanjo BED-format.

  Args:
    ccds_stream (file): file handle to read CCDS lines from

  Yields:
    Interval: interval with merged block and superblock ids
  """
  group_by_contig = partial(lazy_groupby, key=attrgetter('contig'))
  sort_by_start = partial(sorted, key=attrgetter('start'))

  stages = (
    filter(grep('Public')),               # select lines via grep('Public')
    map(text_type.rstrip),                # strip \n and spaces
    map(split(sep='\t')),                 # split into list
    map(extract_intervals),               # convert to Interval
    concat,                               # flatten
    map(rename_sex_interval),             # rename sex contigs
    group_by_contig,                      # lazy group by contig
    pluck(1),                             # extract second item
    map(groupby(attrgetter('name'))),     # non-lazy group by id
    map(valmap(merge_related_elements)),  # group intervals
    map(itervalues),                      # extract values
    map(sort_by_start),                   # sort by start pos
    concat,                               # flatten
  )
  return pipe(ccds_stream, *stages)
Beispiel #20
0
def streaming_pca(samples, n_components=2, batch_size=50):
    """Fit an ``IncrementalPCA`` on ``samples`` one batch at a time.

    ``samples`` is cut into groups of ``batch_size``, each group is turned
    into an array and passed to ``partial_fit``. The fitted estimator is
    returned.

    NOTE(review): toolz ``partition`` drops a trailing group shorter than
    ``batch_size``, so leftover samples are presumably never fitted -
    confirm this is intended.
    """
    ipca = decomposition.IncrementalPCA(n_components=n_components,
                                        batch_size=batch_size)
    batches = tz.pipe(samples,
                      curried.partition(batch_size),
                      curried.map(np.array))
    for batch in batches:
        ipca.partial_fit(batch)
    return ipca
def __get_rows(data, max_length_per_column):
    """Render one row per key of ``data`` and join the rows with ``+``.

    Each row comes from ``__get_row``; the rows are folded together without
    an initial value.
    """
    def render_row(key):
        return __get_row(data, key, max_length_per_column)

    def join(rendered, row):
        return rendered + row

    return pipe(data, iterkeys, map(render_row), reduce(join))
 def __get_all_metrics_for_each_class(self):
     """Collect every metric of every confusion table into one mapping.

     For each confusion table a ``pmap`` of the form
     ``{class name: {metric label: value}}`` is built; the per-class maps
     are then combined with ``+``.
     """
     # (label in the result, attribute read from the confusion table)
     metric_attributes = (
         ("Accuracy", "accuracy"),
         ("Precision", "precision"),
         ("Recall", "recall"),
         ("Specificity", "specificity"),
         ("F1score", "f1score"),
         ("Fall Out", "fall_out"),
         ("Miss Rate", "miss_rate"),
         ("False Discovery Rate", "FDR"),
         ("False Omission Rate", "FOR"),
         ("Negative Predictive Value", "NPV"),
         ("Positive Likelihood Ratio", "PLR"),
         ("Negative Likelihood Ratio", "NLR"),
         ("Diagnostic Odds Ratio", "DOR"),
     )

     def metrics_for(confusion_table):
         class_name = str(confusion_table.get_class_name())
         metrics = {
             label: getattr(confusion_table, attribute)
             for label, attribute in metric_attributes
         }
         return pmap({class_name: pmap(metrics)})

     return pipe(
         self.__confusion_tables,
         itervalues,
         map(metrics_for),
         reduce(lambda combined, per_class: combined + per_class),
     )
Beispiel #23
0
def freq(tokenset):
    """
    Find number of occurrences of each value in 'tokenset'.

    Returns the items view of the frequency mapping, i.e. (value, count)
    pairs.
    """
    counts = tlz.frequencies(tokenset)
    return dict.items(counts)
Beispiel #24
0
def compute_down(expr, data, **kwargs):
    """ Compile a blaze expression to a sparksql expression

    Requires exactly one leaf in ``expr`` and at least one table subterm of
    that leaf; otherwise ``ValueError`` is raised. Tables are swapped for
    new symbols, the expression is computed over sqlalchemy tables, and the
    literal Hive query string is run with ``data.sql``.
    """
    leaves = expr._leaves()

    # exactly one leaf node is supported
    if len(leaves) != 1:
        raise ValueError('Must compile from exactly one root database')
    leaf = next(iter(leaves))

    # field expressions on the database are Field instances with a record
    # measure whose immediate child is the database leaf
    tables = pipe(expr._subterms(), filter(istable(leaf)), list)
    if not tables:
        raise ValueError('Expressions not referencing a table cannot be '
                         'compiled')

    # make a new symbol for each table and sub them into the expression
    new_leaves = []
    for table in tables:
        new_leaves.append(symbol(table._name, table.dshape))
    expr = expr._subs(dict(zip(tables, new_leaves)))

    # compute using sqlalchemy
    scope = dict(zip(new_leaves, map(make_sqlalchemy_table, tables)))
    query = compute(expr, scope)

    # interpolate params
    compiled = literalquery(query, dialect=HiveDialect())
    return data.sql(str(compiled))
 def count_predictions(filtered_predictions_list, target_label):
     """Count the predictions whose second element equals ``target_label``.

     ``filtered_predictions_list`` is an iterable of 2-item sequences; only
     the second item of each is compared. Returns the number of matches as
     an int (0 for an empty input).

     The predicate no longer uses a tuple parameter (``lambda (_, x): ...``),
     which is Python 2 only syntax and a SyntaxError on Python 3.
     """
     return sum(
         1
         for _, label in filtered_predictions_list
         if label == target_label
     )
def load_all_users():
    ''' Returns a pd.DataFrame with the information of all the users

    Every entry of ``users`` is parsed with ``parse_exp03_filename``,
    processed with ``user_pipe`` and the results are combined by
    ``accumulate_users``. A ``user`` column is then inserted at position 0,
    holding the sorted user list repeated three times.
    '''
    # Use a distinct local name: assigning to `map` inside the function makes
    # `map` local for the whole body, so `tlz.curry(map)` would read the
    # not-yet-bound local and raise UnboundLocalError.
    curried_map = tlz.curry(map)
    dataset = tlz.pipe(users,
                       curried_map(parse_exp03_filename),
                       curried_map(user_pipe),
                       accumulate_users)
    dataset.insert(0, 'user', sorted(users * 3))
    return dataset
Beispiel #27
0
def get(dsk, keys, optimizations=[fuse], num_workers=cpu_count):
    """ Multiprocessed get function appropriate for Bags

    The graph is culled to ``keys``, run through ``optimizations`` in order
    and executed with ``get_async`` on a process pool. The pool comes from
    ``_globals['pool']``; when that is ``None`` a temporary pool is created
    and closed again afterwards.

    NOTE(review): ``num_workers`` is never read - the temporary pool is
    sized with ``psutil.cpu_count()`` and ``get_async`` receives the
    module-level ``cpu_count``. Confirm whether the parameter should be
    honoured.
    """
    pool = _globals['pool']
    cleanup = pool is None
    if cleanup:
        pool = multiprocessing.Pool(psutil.cpu_count())

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = dill_apply_async(pool.apply_async)

    # Optimize Dask: cull first, then the caller-supplied passes
    passes = [partial(cull, keys=keys)]
    passes.extend(optimizations)
    dsk2 = pipe(dsk, *passes)

    try:
        # Run
        result = get_async(apply_async, cpu_count, dsk2, keys,
                           queue=queue)
    finally:
        if cleanup:
            pool.close()
    return result
Beispiel #28
0
def ghost_internal(x, axes):
    """ Share boundaries between neighboring blocks

    Parameters
    ----------

    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes dict informs how many cells to overlap between neighboring blocks
    {0: 2, 2: 5} means share two cells in 0 axis, 5 cells in 2 axis

    Returns a new Array whose graph merges the fractional slices, the
    concatenation tasks and the graph of ``x``.

    NOTE(review): for an axis with a single block the block dimensions
    computed below contain two entries (first and last are the same block) -
    confirm that such inputs cannot reach this function.
    """
    dims = list(map(len, x.blockdims))
    expand_key2 = partial(expand_key, dims=dims)
    interior_keys = pipe(x._keys(), flatten,
                         map(expand_key2), map(flatten),
                         concat, list)
    interior_slices = {k: fractional_slice(k, axes) for k in interior_keys}

    name = next(ghost_names)
    # Each output block concatenates the expanded neighbourhood of its key;
    # the leading array name of the key is replaced by the new name.
    ghost_blocks = {
        (name,) + k[1:]: (rec_concatenate, (concrete, expand_key2(k)))
        for k in interior_keys
    }

    blockdims = []
    for i, bds in enumerate(x.blockdims):
        depth = axes.get(i, 0)
        # Edge blocks grow on one side only, inner blocks on both sides.
        blockdims.append([bds[0] + depth]
                         + [bd + depth * 2 for bd in bds[1:-1]]
                         + [bds[-1] + depth])

    return Array(merge(interior_slices, ghost_blocks, x.dask),
                 name, blockdims=blockdims)
Beispiel #29
0
def ngram_tuples(n, string, minlen=3, maxlen=25):
    """
    Creates ngram tuples of size 'n' from 'string'.
    Also, changes string to lowercase, removes generic stopwords and splits on all non alphanumeric.

    Ex:
        In [2]: list(ngram_tuples(n=1, string='Just another example text.'))
        Out[2]: [('another',), ('example',), ('text',)]

        In [2]: list(ngram_tuples(n=2, string='Just another example text.'))
        Out[2]: [('another', 'example'), ('example', 'text')]

        In [11]: list(ngram_tuples(3, 'I needed a longer example text for this example.'))
        Out[11]:
            [('needed', 'longer', 'example'),
             ('longer', 'example', 'text'),
             ('example', 'text', 'example')]


    minlen - filter out words that have fewer characters than 'minlen'.
    maxlen - filter out words that have more characters than 'maxlen'.
    """
    # Split every remaining chunk into words and flatten the result.
    split_into_words = tlz.compose(tlz.concat, map_c(splitter_of_words))

    stages = (
        lower,
        simple_split,
        filter_longer_than(maxlen),
        split_into_words,
        filter_shorter_than(minlen),
        filter_stopwords,
        sliding_window_c(n),
    )
    return tlz.pipe(string, *stages)
Beispiel #30
0
def compute_up(expr, data, **kwargs):
    """Build the grouped SELECT for a by-expression.

    The grouper and the reduction are compiled against an alias of ``data``;
    their inner columns (unique, grouper first) become the selected columns
    and the grouper columns the GROUP BY clause.

    Raises ``TypeError`` when the grouper is not valid or when ``expr.apply``
    is not a valid reducer.
    """
    if not valid_grouper(expr):
        raise TypeError("Grouper must have a non-nested record or one "
                        "dimensional collection datashape, "
                        "got %s of type %r with dshape %s" %
                        (expr.grouper, type(expr.grouper).__name__, expr.dshape))

    s = alias_it(data)

    if not valid_reducer(expr.apply):
        raise TypeError('apply must be a Summary expression')
    reduction = compute(expr.apply, s, post_compute=False)

    compiled_grouper = compute(expr.grouper, s, post_compute=False)
    grouper = get_inner_columns(compiled_grouper)
    reduction_columns = pipe(reduction.inner_columns,
                             map(get_inner_columns),
                             concat)
    columns = list(unique(chain(grouper, reduction_columns)))

    # An explicit FROM object is only taken when `s` is not an Alias, or
    # when its first FROM is a Join.
    from_obj = None
    if (not isinstance(s, sa.sql.selectable.Alias) or
            (hasattr(s, 'froms') and isinstance(s.froms[0],
                                                sa.sql.selectable.Join))):
        assert len(s.froms) == 1, 'only a single FROM clause supported for now'
        (from_obj,) = s.froms

    selectable = getattr(s, 'element', s)
    return reconstruct_select(columns,
                              selectable,
                              from_obj=from_obj,
                              group_by=grouper)
Beispiel #31
0
def gender_from_bam(bam_path, prefix=''):
  """Predict the gender from a BAM alignment file.

  One fixed interval on X and one on Y are read from the BAM file and
  averaged; the two averages are passed to ``predict_gender``.

  Args:
    bam_path (path): path to a BAM alignment file
    prefix (str, optional): string to prefix to 'X', 'Y'

  Returns:
    Gender: tuple of X coverage, Y coverage, and sex prediction

  Examples:
    >>> gender_from_bam('alignment.bam', prefix='chr')
    Gender(x_coverage=123.31, y_coverage=0.13, sex='female')
  """
  # setup: connect to a BAM file
  bam = BamFile(bam_path)

  # step 0: fake some BED interval rows (already 1,1-based!)
  fake_bed_rows = [
    ("%sX" % prefix, 1, 59373566),
    ("%sY" % prefix, 69362, 11375310),
  ]

  def read_interval(interval):
    return bam(*interval)

  # step 1: run the pipeline
  sequence = pipe(fake_bed_rows, map(read_interval), map(average))

  # step 2: make the prediction
  x_coverage, y_coverage = list(sequence)
  sex = predict_gender(x_coverage, y_coverage)
  return Gender(x_coverage, y_coverage, sex)
Beispiel #32
0
def alpino(doc, output="raw"):
    """Wrapper around the Alpino (dependency) parser for Dutch.

    Expects an environment variable ALPINO_HOME to point at
    the Alpino installation dir.

    The script uses the 'dependencies' end_hook to generate lemmata and
    the dependency structure.

    Parameters
    ----------
    doc
        The document to parse; it is passed through ``fetch`` before
        tokenization.
    output : string
        If 'raw', returns the raw output from Alpino itself.
        If 'saf', returns a SAF dictionary.

    Raises
    ------
    ValueError
        If ``output`` is not one of the formats above.

    References
    ----------
    `Alpino homepage <http://www.let.rug.nl/vannoord/alp/Alpino/>`_
    """
    from ._alpino import tokenize, parse_raw, interpret_parse

    output_transforms = {"raw": identity, "saf": interpret_parse}
    try:
        transf = output_transforms[output]
    except KeyError:
        raise ValueError("Unknown output format %r" % output)

    stages = (fetch, tokenize, parse_raw, transf)
    return pipe(doc, *stages)
Beispiel #33
0
def main():
    """Print the first command-line argument after three path transforms.

    The transforms are applied in order: ``parentdir_expand``,
    ``unambiguous_path``, ``physical_path``.
    """
    transforms = (
        t.parentdir_expand,
        t.unambiguous_path,
        t.physical_path,
    )
    path = sys.argv[1]
    for transform in transforms:
        path = transform(path)
    print(path)
def test__filter_stopwords(tokenset, count):
    """Check that ``count`` tokens remain after stopword filtering."""
    remaining = list(utils.filter_stopwords(tokenset))
    assert len(remaining) == count
Beispiel #35
0
    def get_min_across_splits_continuous(arr: np.ndarray, y: np.ndarray,
                                         splits: np.ndarray,
                                         eval_func: Callable):
        """
        Function to get the best split across many proposed
        splits.


        Parameters:
        -----------
        arr : np.ndarray
            The feature array to split on

        y : np.ndarray
            The target array

        splits : np.ndarray
            The proposed set of split values.

        eval_func : Callable
            The function to evaluate the split on the target

        Returns:
        --------
        A tuple holding the chosen split value followed by the scores that
        ``BaseTree.get_split_goodness_fit_continuous`` produced for it. The
        chosen split minimizes first score / second score.
        """
        n = len(splits)
        score_split = BaseTree.get_split_goodness_fit_continuous
        call_args = [(arr, y, split, eval_func) for split in splits]

        if n > 500:
            # Many split points: spread the evaluations over a process pool
            with multiprocessing.Pool(processes=8) as p:
                scores = p.starmap(score_split, call_args)
                post_split_evals = dict(enumerate(scores))
                p.close()
        else:
            # Not too many split points: evaluate them in this process
            post_split_evals = {
                index: score_split(*args)
                for index, args in enumerate(call_args)
            }

        def ratio(index):
            results = post_split_evals.get(index)
            return results[0] / results[1]  # entropy / intrinsic value

        # Get the minimum split based on gain ratio
        min_eval = min(post_split_evals, key=ratio)

        # Return the best split and the splits scores
        return (splits[min_eval], *post_split_evals.get(min_eval))
def get(dsk,
        keys,
        optimizations=[],
        num_workers=None,
        func_loads=None,
        func_dumps=None,
        **kwargs):
    """ Multiprocessed get function appropriate for Bags

    The graph is culled to ``keys``, fused, run through ``optimizations`` in
    order and then executed with ``get_async`` on a process pool. The pool is
    taken from ``_globals['pool']``; when that is ``None`` a temporary pool
    is created and closed again afterwards.

    Parameters
    ----------

    dsk: dict
        dask graph
    keys: object or list
        Desired results from graph
    optimizations: list of functions
        optimizations to perform on graph before execution
    num_workers: int
        Number of worker processes (defaults to number of cores)
    func_dumps: function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads: function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    **kwargs:
        Forwarded unchanged to ``get_async``
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = pickle_apply_async(pool.apply_async,
                                     func_dumps=func_dumps,
                                     func_loads=func_loads)

    # Optimize Dask
    dsk2, dependencies = cull(dsk, keys)
    dsk3, dependencies = fuse(dsk2, keys, dependencies)
    dsk4 = pipe(dsk3, *optimizations)

    try:
        # Run the fully optimized graph. Executing dsk3 here would silently
        # throw away the result of the caller-supplied optimizations.
        result = get_async(apply_async,
                           len(pool._pool),
                           dsk4,
                           keys,
                           queue=queue,
                           get_id=_process_get_id,
                           **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
Beispiel #37
0
def extract_repo_name_from_origin(origin):
    """Extract the last two path components of a git origin as a pair.

    Two patterns are tried in order: ``:<a>/<b>.git`` at the end of the
    string, then ``/<a>/<b>`` at the end of the string. Non-matches are
    dropped by ``filterempty`` and the captured ``(a, b)`` groups of the
    first remaining match are returned via ``first(None)``.

    NOTE(review): presumably ``first(None)`` yields ``None`` when nothing
    matched - confirm against its definition.
    """
    patterns = [r':([^/]*?)/([^/]*?)\.git$', r'/([^/]*?)/([^/]*?)$']

    def search_origin(pattern):
        return re.search(pattern, origin)

    def captured_pair(match):
        return (match.group(1), match.group(2))

    return pipe(
        patterns,
        map(search_origin),
        filterempty,
        map(captured_pair),
        first(None),
    )
Beispiel #38
0
def read_cv_image_from(url):
    """Read an image from a url or a local file as an OpenCV image.

    The raw bytes are fetched with ``urllib.request.urlopen`` when
    ``is_url(url)`` is true and read from disk otherwise, then decoded with
    ``cv.imdecode`` using ``cv.IMREAD_COLOR``. The result is therefore a
    colour image, not a grayscale one.

    The file or HTTP handle is closed as soon as the bytes have been read.
    Returns whatever ``cv.imdecode`` returns for the data.
    """
    if is_url(url):
        stream = urllib.request.urlopen(url)
    else:
        stream = open(url, 'rb')

    # Close the handle deterministically instead of leaking it.
    with stream:
        payload = stream.read()

    buffer = np.asarray(bytearray(payload), dtype="uint8")
    return cv.imdecode(buffer, cv.IMREAD_COLOR)
def format_dict_as_grid(data):
    """Format ``data`` as a text grid wrapped in a ``StringValueObject``.

    The grid is: separator, header, separator, rows. The header is built
    from the keys of the first value of ``data``.
    """
    max_length_per_column = __calculate_max_str_length_per_column(data)
    separator = __get_row_separator(max_length_per_column)

    column_names = pipe(data, itervalues, __first, iterkeys)
    headers = __get_header(column_names, max_length_per_column)

    rows = __get_rows(data, max_length_per_column)

    grid = separator + headers + separator + rows
    return StringValueObject(grid)
def preprocessing(text: str) -> str:
    """
        Preprocess text by removing symbols and stopwords, then stemming (ID).

        The three ``PreprocessUtil`` steps run in that order, each on the
        output of the previous one.

        :parameter text: str
        :return: str, preprocessed text
    """
    steps = (
        PreprocessUtil.symbol_remover,
        PreprocessUtil.stopword_remover,
        PreprocessUtil.stemmer,
    )
    return pipe(text, *steps)
Beispiel #41
0
def ngram_tuples(n, string, minlen=3, maxlen=25):
    """Produce sliding windows of size ``n`` over the words of ``string``.

    The text is lowered, split into words, stripped of whitespace entries,
    and filtered with the ``minlen`` / ``maxlen`` helpers before windowing.

    :param n: window size passed to ``sliding_window_c``
    :param string: input text
    :param minlen: argument for ``utils.filter_shorter_than``
    :param maxlen: argument for ``utils.filter_longer_than``
    :return: the output of the final ``sliding_window_c(n)`` stage
    """
    stages = (
        utils.lower,
        utils.splitter_of_words,
        utils.filter_whitespace,
        utils.filter_shorter_than(minlen),
        utils.filter_longer_than(maxlen),
        sliding_window_c(n),
    )
    return tlz.pipe(string, *stages)
Beispiel #42
0
 def cluster_similar_git_authors(self, authors):
     """Find similar authors and run the query that joins them.

     ``authors`` is paired, scored for similarity and filtered by the
     instance's helper methods; the selection is turned into a query by
     ``self.query_factory`` and executed through ``self.invoker``. The call
     waits on the result using ``self.query_timeout``; nothing is returned.

     :param authors: authors to compare
     """
     author_pairs = self._pair_authors(authors)
     scored_pairs = self._compute_author_similarity(author_pairs)
     matches = self._select_similar_authors(scored_pairs)
     query = self.query_factory.join_similar_authors(matches)
     self.invoker.run(query).result(self.query_timeout)
Beispiel #43
0
 def getter(endpoint: IdResourceEndpoint):
     """Fetch one resource through ``endpoint`` and unpack the response.

     ``name``, ``id``, ``form_key``, ``id_key`` and ``unpack_f`` come from
     the enclosing scope.

     :param endpoint: callable endpoint; ``endpoint(name, id).get()`` is
         issued to obtain the response
     :return: the response processed by
         ``IdResourceEndpoint.from_single_response(...)``
     """
     response = endpoint(name, id).get()
     unpack_response = IdResourceEndpoint.from_single_response(
         form_key=form_key,
         id_key=id_key,
         unpack_f=unpack_f,
     )
     return unpack_response(response)
Beispiel #44
0
def pfilter(f, it):
    """Filter ``it`` by ``f``, with ``f`` evaluated through ``pmap``.

    ``it`` is fed to a ``bifurcate`` stage built from ``pmap(f, None)`` and
    an identity map; the outcome is zipped, entries whose first element is
    falsy are dropped, and the second element of each survivor is returned.

    NOTE(review): ``zip`` receives the ``bifurcate`` result as a single
    argument -- confirm that ``bifurcate`` (defined elsewhere) returns a
    shape for which this pairs each flag with its item.

    :param f: predicate
    :param it: items to filter
    :return: mapped iterable of the retained second elements
    """
    split = bifurcate(pmap(f, None), curried.map(toolz.identity))
    keep_flagged = curried.filter(toolz.first)
    take_item = curried.map(toolz.second)
    return take_item(keep_flagged(zip(split(it))))
Beispiel #45
0
 def _process_config(self, config: Mapping) -> Mapping:
     """Add the standard tags to ``config`` and defer to the parent class.

     :param config: configuration mapping; its optional ``'Tags'`` entry is
         merged with ``standard_tags(self)``
     :return: the value returned by the parent ``_process_config`` for the
         configuration with the merged ``'Tags'`` entry
     """
     # Tags already present in the config are merged last so that, per the
     # original note, they take precedence when there is a conflict.
     tags = merge(standard_tags(self), config.get('Tags', {}))
     with_tags = assoc(key='Tags', value=tags)
     parent_process = super()._process_config
     return parent_process(with_tags(config))
Beispiel #46
0
def last_split_action_in_pull_request(pull_request):
    """Pick the latest timeline item carrying the split-test label.

    Timeline items are read from ``node -> timelineItems -> nodes`` (each
    level defaults to an empty container when absent), restricted to those
    whose label name equals ``split_test_label()``, and ranked by
    ``createdAt`` when that key is present, otherwise by ``removedAt``.

    :param pull_request: mapping describing the pull request
    :return: result of ``max_`` over the matching items, with ``{}`` given
        as its default (``max_`` is defined outside this block)
    """
    def has_split_label(item):
        return item.get('label', {}).get('name', None) == split_test_label()

    def event_time(item):
        if 'createdAt' in item:
            return item.get('createdAt')
        return item.get('removedAt')

    timeline = (pull_request.get('node', {})
                .get('timelineItems', {})
                .get('nodes', []))
    return pipe(timeline,
                filter(has_split_label),
                max_(event_time, default={}))
Beispiel #47
0
 def simplified(self) -> 'Transform':
     """Return the composite of the transforms inside the transform chain.

     An empty chain gives ``None`` and a single-element chain gives that
     element. Otherwise the chain is folded from the left: each subsequent
     transform's ``compose`` is called with the composite built so far.
     """
     size = len(self)
     if size == 0:
         return None
     if size == 1:
         return self[0]
     composite = self[0]
     for transform in self[1:]:
         composite = transform.compose(composite)
     return composite
Beispiel #48
0
def known_iam_actions(prefix):
    """Return known IAM actions for a prefix, e.g. all ec2 actions.

    Every known permission is parsed with ``_parse_action`` and grouped by
    the parsed ``prefix`` attribute; an unknown prefix yields ``[]``.
    """
    # The grouping is rebuilt on every call; it could be memoized for
    # performance improvements.
    permissions = all_known_iam_permissions()
    parse_all = mapz(_parse_action)
    group_by_prefix = groupbyz(lambda action: action.prefix)
    actions_by_prefix = group_by_prefix(parse_all(permissions))
    return actions_by_prefix.get(prefix, [])
Beispiel #49
0
 def _cnn_forward(self, document):
     """Weight the token embeddings of ``document`` and run them through the CNN.

     Token weights from ``self.weights`` are multiplied by the mask,
     soft-maxed over dimension 1, and used to scale the token embeddings.
     The scaled embeddings are transposed (dims 1 and 2) and passed through
     ``self.cnn``, ``self.relu``, ``self.pool``, ``torch.squeeze`` and
     ``self.projection`` in that order.

     NOTE(review): both ``squeeze`` calls are made without a dimension, so
     they drop every size-1 axis -- presumably inputs are batched with
     batch size > 1; confirm.

     :param document: input accepted by ``self.document_token_embeds_do``
         and ``self.weights``
     :return: output of ``self.projection``
     """
     token_embeddings, mask = self.document_token_embeds_do(document)
     masked_scores = self.weights(document).squeeze() * mask.float()
     attention = F.softmax(masked_scores, 1)
     features = attention.unsqueeze(2) * token_embeddings
     features = torch.transpose(features, 1, 2)
     layers = (self.cnn, self.relu, self.pool, torch.squeeze,
               self.projection)
     for layer in layers:
         features = layer(features)
     return features
Beispiel #50
0
    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return the sample at ``idx`` with per-modality transforms applied.

        For every modality in ``self.modalities`` the raw value is taken
        from ``self.data[idx]``. Modalities with a non-empty transform list
        get a deep copy of their value piped through those transforms;
        the others keep the raw value as is.

        :param idx: index into ``self.data``
        :return: mapping from modality name to its (possibly transformed) value
        """
        sample = {}
        for modality in self.modalities:
            sample[modality] = self.data[idx][modality]

        for modality in self.modalities:
            steps = self.transforms[modality]
            if len(steps) > 0:
                # Deep-copy first so transforms never touch the stored data.
                sample[modality] = pipe(copy.deepcopy(sample[modality]),
                                        *steps)

        return sample
Beispiel #51
0
def timeframes_to_seconds(timeframe, timeframes=1):
    """Convert a number of timeframes into seconds.

    The first number found in ``timeframe`` is converted to ``int`` and
    multiplied by ``timeframes``; the product is then converted according
    to whether ``timeframe`` is minutely, hourly or daily.

    :param timeframe: timeframe descriptor containing a number
    :param timeframes: how many such timeframes to convert (default 1)
    :return: seconds, or ``None`` when the frequency is none of the three
    """
    scale = math_utils.mul(timeframes)
    amount = scale(int(strings.first_number(timeframe)))

    if is_minutely_frequency(timeframe):
        return minutes_to_seconds(amount)
    if is_hourly_frequency(timeframe):
        return hours_to_seconds(amount)
    if is_daily_frequency(timeframe):
        return days_to_seconds(amount)
    return None
Beispiel #52
0
def test_to_tree_slice(serial):
    """Check that a sliced expression survives a tree serialization round trip."""
    table = symbol('t', 'var * {name: string, amount: int32}')
    expr = table[:5]
    tree = to_tree(expr, names={table: 't'})
    # Dump and reload through the serializer under test.
    reloaded_tree = serial.loads(serial.dumps(tree))
    expr2 = from_tree(reloaded_tree, namespace={'t': table})
    assert expr.isidentical(expr2)
Beispiel #53
0
 def _iter(self, usecols=None):
     """Iterate over the CSV contents chunk by chunk.

     The file is read via ``self.pandas_read_csv`` in chunks of
     ``self.chunksize`` with object dtype and no date parsing. Missing
     values in each chunk are filled with ``''``, each chunk is converted
     with ``into(list, ...)``, and the chunks are concatenated.

     :param usecols: forwarded to ``self.pandas_read_csv``
     :return: the concatenation of all converted chunks
     """
     from blaze.api.into import into
     frames = self.pandas_read_csv(usecols=usecols,
                                   chunksize=self.chunksize,
                                   dtype='O',
                                   parse_dates=[])
     fill_missing = map(partial(pd.DataFrame.fillna, value=''))
     to_lists = map(partial(into, list))
     return concat(to_lists(fill_missing(frames)))
Beispiel #54
0
 def _iter(self, usecols=None, chunksize=None):
     """Iterate over the CSV contents chunk by chunk.

     Chunks are read via ``self.pandas_read_csv`` with object dtype and no
     date parsing. Missing values in each chunk are filled with ``''``,
     each chunk is converted with ``into(list, ...)``, and the chunks are
     concatenated.

     :param usecols: forwarded to ``self.pandas_read_csv``
     :param chunksize: rows per chunk; a falsy value falls back to
         ``self.chunksize``
     :return: the concatenation of all converted chunks
     """
     from blaze.api.into import into
     rows_per_chunk = chunksize or self.chunksize
     frames = self.pandas_read_csv(usecols=usecols,
                                   chunksize=rows_per_chunk,
                                   dtype='O',
                                   parse_dates=[])
     blank_missing = map(partial(pd.DataFrame.fillna, value=''))
     as_lists = map(partial(into, list))
     return concat(as_lists(blank_missing(frames)))
def text_to_bow(parser, string):
    """
    Build a bag of words representation of 'string'.
    The supplied 'parser' parses the string; the parsed output is passed
    through 'utils.freq' and then 'bag_of_words'.
    """
    parsed = parser(string)
    frequencies = utils.freq(parsed)
    return bag_of_words(frequencies)
Beispiel #56
0
def row_count(width, margin_function, radius_scale, node_count):
    """Return the smallest row count whose capacity fits ``node_count`` nodes.

    Candidate row counts 1, 2, 3, ... are tried in order and the first one
    for which the capacity function reports at least ``node_count`` is
    returned. The search does not terminate if no candidate ever fits.

    :param width: forwarded to the capacity function
    :param margin_function: forwarded to the capacity function
    :param radius_scale: forwarded to the capacity function
    :param node_count: number of nodes that must fit
    :return: the first fitting row count (an ``int`` >= 1)
    :raises KeyError: if the module defines no ``node_count`` function
    """
    # The ``node_count`` parameter shadows the module-level ``node_count``
    # function that the filter needs to call, so calling the bare name would
    # invoke the numeric argument and raise TypeError. The parameter name is
    # part of the public signature, so fetch the function from the module
    # globals instead (assumes it takes
    # (width, margin_function, radius_scale, row_count), as the original
    # call implies).
    capacity_for = globals()["node_count"]
    return next(
        rows
        for rows in count(1)  # possible row counts
        if node_count <= capacity_for(width, margin_function,
                                      radius_scale, rows)
    )
 def __calculate_max_column_length(column_key):
     """Return the display width needed for the column ``column_key``.

     The width is the longest ``str`` form among the column's values across
     all entries of ``data`` (taken from the enclosing scope), or the length
     of the column key's own ``str`` form if that is longer.

     :param column_key: key identifying the column in each entry of ``data``
     :return: the maximum string length as an ``int``
     """
     widest_value = max(
         len(str(data[key][column_key])) for key in iterkeys(data)
     )
     return max(widest_value, len(str(column_key)))
Beispiel #58
0
def uni_and_bigram_tuples(string, minlen=3, maxlen=25):
    """Produce interleaved unigrams and bigrams for ``string``.

    The text is lowered and split with ``simple_split``; pieces are filtered
    with ``filter_longer_than(maxlen)``, split further into words, filtered
    with ``filter_shorter_than(minlen)`` and ``filter_stopwords``, and frozen
    into a tuple. Sliding windows of size 1 and 2 over that tuple are
    interleaved and each window is joined with ``"_"``.

    :param string: input text
    :param minlen: argument for ``filter_shorter_than``
    :param maxlen: argument for ``filter_longer_than``
    :return: the mapped output of the final join stage
    """
    split_into_words = tlz.compose(tlz.concat, map_c(splitter_of_words))
    unigrams_and_bigrams = tlz.juxt(sliding_window_c(1), sliding_window_c(2))
    stages = [
        lower,
        simple_split,
        filter_longer_than(maxlen),
        split_into_words,
        filter_shorter_than(minlen),
        filter_stopwords,
        tuple,  # materialize: both window sizes walk the same tokens
        unigrams_and_bigrams,
        tlz.interleave,
        map_c(join_strings("_")),
    ]
    return tlz.pipe(string, *stages)
def __calculate_confusion_tables(predictions_dict, confusion_table_generator, formatter):
    """Build one confusion table per key of ``predictions_dict``.

    :param predictions_dict: mapping whose keys are the labels to process
    :param confusion_table_generator: callable invoked as
        ``confusion_table_generator(predictions_dict, label, formatter)``
    :param formatter: passed through to ``confusion_table_generator``
    :return: ``pmap`` from each label to its generated confusion table
    """
    def table_for(label):
        table = confusion_table_generator(predictions_dict, label, formatter)
        return pmap({label: table})

    per_label_tables = map(table_for)(iterkeys(predictions_dict))
    return pmap(merge(per_label_tables))
 def get_label_predictions(predictions_list, all_labels, label):
     """Count, per candidate label, the predictions recorded for ``label``.

     ``predictions_list`` holds 2-tuples (presumably ``(actual, predicted)``
     -- confirm against the caller). Only tuples whose first element equals
     ``label`` are considered; for every entry of ``all_labels`` the result
     records how many of those tuples have that entry as second element.

     :param predictions_list: iterable of 2-tuples
     :param all_labels: iterable of labels to report counts for
     :param label: value the first tuple element must equal
     :return: ``pmap`` mapping each entry of ``all_labels`` to its count
     """
     # Tuple-parameter lambdas (``lambda (x, _): ...``) are a SyntaxError on
     # Python 3 (PEP 3113); unpacking in a comprehension works on 2 and 3.
     # The list is materialized once because it is scanned per target label.
     predicted_for_label = [
         predicted
         for actual, predicted in predictions_list
         if actual == label
     ]

     def count_predictions(target_label):
         return sum(1 for predicted in predicted_for_label
                    if predicted == target_label)

     return pmap({
         target: count_predictions(target) for target in all_labels
     })