Example #1
    def succeeded(self, event):
        command = self.started_cmds.get(event.request_id)
        if not command:
            return

        self.started_cmds.pop(event.request_id)

        duration = event.duration_micros
        if self.is_below_lwm(duration):
            return

        [cmd, q, meta] = take(3, command.items())
        self.render_cmd(cmd, duration, q)

        ents = pipe(
            traceback.extract_stack(),
            self.config.stack_preprocess,
            map(lambda rec: StackEntry(self.config.file_capture, *rec)),
            filter(lambda ent: ent.file_capture()),
            filter(lambda ent: not any(
                re.match(p, ent.file, re.M) for p in self.config.ignores)),
            groupby(lambda ent: ent.file),
        )
        self.render_stack(ents)
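For orientation, a minimal sketch of the filter-then-group shape used above, with made-up (path, line) tuples standing in for StackEntry objects and curried toolz assumed:

from toolz.curried import pipe, filter, groupby

entries = [('app/views.py', 10), ('app/views.py', 42), ('vendor/lib.py', 7)]
grouped = pipe(entries,
               # drop entries from ignored paths (the real code matches regexes
               # from self.config.ignores)
               filter(lambda ent: not ent[0].startswith('vendor/')),
               # group the surviving entries by file, as render_stack expects
               groupby(lambda ent: ent[0]))
assert grouped == {'app/views.py': [('app/views.py', 10), ('app/views.py', 42)]}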
Example #2
def do_localizer_block(event_listener, target):
    start = time.time()
    target_is_face = target == BlockTarget.FACE
    stim_orientations = get_stim_1_orientations()
    source = res.faces() if target_is_face else res.houses()
    stim_list = pipe(random_elem(source),
                     take(len(stim_orientations)),
                     list)
    face_list, house_list = flip_if(not target_is_face,
                                    (lmap(lambda ori, stim: stim[ori],
                                          stim_orientations,
                                          stim_list),
                                     [None] * len(stim_list)))
    display_onsets, decisions, decision_onsets, RTs, ITIs = \
        do_trials(event_listener, face_list, house_list)
    return {
        "time": (start, time.time()),
        "target": target.name
    }, {
        "presentations_onset": display_onsets,
        "decision_onset": decision_onsets,
        "decision": [ori.name if ori else "None" for ori in decisions],
        "RT": RTs,
        "following_ITI": ITIs,
        "stim_orientation": [ori.name for ori in stim_orientations],
        "stim_id": [stim.name for stim in stim_list]
    }
Example #3
def compare_streams(db_engine, date_range, stream_names, allowed_parts_of_speech, max_num_words):
    """Compare tokens from each stream in the stream_names list"""

    ## Create token count dictionaries for each stream name
    count_dicts_dict = {}
    for stream_name in stream_names:
        count_dicts_dict[stream_name] = tz.pipe(
            get_content(
                db_engine, 
                stream_name,
                date_range),
            parse_content_into_count(max_num_words, allowed_parts_of_speech))

    ## Create cross-stream count dictionary
    all_streams_count_dict = reduce(
        lambda x,y: tz.merge_with(sum, x, y),
        count_dicts_dict.values())

    ## Calculate posterior probabilities of the tokens
    posterior_probs = {}
    for stream_name in stream_names:
        posterior_probs[stream_name] = tz.pipe(
            get_posterior_probs_freq(
                500, # limited to the 500 most frequent words in this stream, at this time
                all_streams_count_dict, 
                count_dicts_dict[stream_name]),
            tz.map(lambda x: tz.merge({"stream":stream_name}, x)),
            tz.take(max_num_words),
            list,
        )
    return posterior_probs
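A tiny illustration of the cross-stream merge step above, with made-up token counts (toolz imported as tz, as in the example):

import toolz as tz
from functools import reduce

counts_a = {"pipe": 3, "take": 1}
counts_b = {"take": 2, "curry": 5}
# merge_with(sum, ...) adds up the counts of tokens shared between streams
merged = reduce(lambda x, y: tz.merge_with(sum, x, y), [counts_a, counts_b])
assert merged == {"pipe": 3, "take": 3, "curry": 5}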
Example #4
def get_top_tokens(n, count_dict):
    """Return the top n most frequent tokens in the count_dict
    If n > len(count_dict), it will just return them all"""
    return tz.pipe(
        count_dict,
        lambda x: x.items(),
        lambda x: sorted(x, key=lambda y: -y[1]),
        lambda x: tz.take(n, x),
        list)
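A usage sketch of the function above with made-up counts: the pipeline sorts items by descending count, and tz.take then truncates to at most n of them.

counts = {"the": 42, "toolz": 7, "pipe": 19}
assert get_top_tokens(2, counts) == [("the", 42), ("pipe", 19)]
# Asking for more than there are simply returns everything
assert get_top_tokens(10, counts) == [("the", 42), ("pipe", 19), ("toolz", 7)]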
Example #5
def discover_jsonlines(j, n=10, encoding='utf-8', **kwargs):
    with json_lines(j.path, encoding=encoding) as lines:
        data = pipe(lines, filter(nonempty), map(json.loads), take(n), list)

    if len(data) < n:
        ds = discover(data)
    else:
        ds = var * discover(data).subshape[0]
    return date_to_datetime_dshape(ds)
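The sampling step above, isolated into a runnable toolz-only sketch (json_lines, nonempty, discover, and var come from the surrounding project and are not reproduced here; filter(bool) stands in for filter(nonempty)):

import json
from toolz.curried import pipe, filter, map, take

lines = ['{"a": 1}', '', '{"a": 2}', '{"a": 3}']
# Only the first n non-empty lines are parsed, so large files are never read fully
data = pipe(lines, filter(bool), map(json.loads), take(2), list)
assert data == [{'a': 1}, {'a': 2}]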
Example #7
def limit_layers(max_count, graphs):
    assert max_count > 0, "max_count needs to be > 0"

    graphs_iterator = iter(graphs)

    return tlz.concat([
        tlz.take(max_count - 1, graphs_iterator),
        # Merges all graphs remaining in the iterator, after initial
        # max_count - 1 have been taken.
        (lambda: (yield merge_graphs(graphs_iterator)))()
    ])
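The `(lambda: (yield ...))()` expression above builds a one-shot generator so that merge_graphs only runs after the first max_count - 1 graphs have been consumed; here is a self-contained sketch of the same idea with a named inner generator and a stand-in merge_graphs (hypothetical, for illustration only):

import toolz as tlz

def merge_graphs(graphs_iterator):
    # Stand-in for the example's merge_graphs: here it just collects the rest.
    return list(graphs_iterator)

def limit_layers_named(max_count, graphs):
    graphs_iterator = iter(graphs)

    def merged_tail():
        # Deferred: merge_graphs only runs once concat reaches this generator,
        # i.e. after the first max_count - 1 graphs have been drawn.
        yield merge_graphs(graphs_iterator)

    return tlz.concat([tlz.take(max_count - 1, graphs_iterator), merged_tail()])

assert list(limit_layers_named(3, ["g1", "g2", "g3", "g4"])) == ["g1", "g2", ["g3", "g4"]]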
Example #8
def parse_links(txt):
    """
    Parses the contents of all Markdown links in txt.

    >>> parse_links("some text with [some](http://md.com/link)")
    [('some', 'http://md.com/link')]

    >>> parse_links("some text with [some](http://md.com/link_(bull_crap\\))")
    [('some', 'http://md.com/link_(bull_crap\\\)')]
    """
    return tlz.pipe(re.findall(MARKUP_REGEX, txt), ctlz.map(ctlz.take(2)),
                    ctlz.map(tuple), list)
Example #9
def export_intervals(chanjo_db, include_header=True, bed_score=0):
    r"""Return BED-formatted interval lines from existing ``chanjo_db``.

  BED lines are ready to be printed or written to a file.

  Args:
    chanjo_db (session): ``sqlalchemy.orm.session`` object with a
      ``.query``-method
    include_header (bool, optional): whether to include BED header
    bed_score (int, optional): dummy score (0-1000) to insert at field 5
      to complete the BED format

  Yields:
    str: stringified and tab-delimited interval

  Examples:
    >>> from chanjo import export_intervals, Store
    ... # instantiate a new connection to a Chanjo database
    >>> db = Store('./coverage.sqlite3')
    >>> with open('intervals.sorted.bed', 'w') as stream:
    ...   # write intervals in BED-format with appropriate headers
    ...   for bed_line in export_intervals(db):
    ...     stream.write(bed_line + '\n')
  """
    if include_header:
        yield '#chrom\tchromStart\tchromEnd\tname\tscore\tstrand'

    # set up which columns to fetch to make the BED file
    # column 5 is just a silly default for the "score" field in BED
    i = Interval  # alias
    columns = (i.contig, i.start - 1, i.end, i.id, i.strand)

    # BED files are tab-delimited
    delimiter = '\t'

    # 1. fetch interval tuples from the database (producer)
    # 2. stringify each item in each subsequence (interval tuple)
    # 3. join lines on tab-character
    # 4. prepend the header
    bed_lines = pipe(
        fetch_records(chanjo_db, columns),
        map(map(str)),  # convert fields to strings
        map(
            juxt(
                compose(list, take(4)),  # keep first 4 fields
                lambda _: [str(bed_score)],  # insert BED score
                compose(list, last))),  # keep last field
        map(concat),  # flatten each item
        map(delimiter.join)  # join on \t
    )

    for bed_line in bed_lines:
        yield bed_line
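A rough trace of a single record through the pipe above, with a hypothetical interval tuple and curried toolz assumed: take(4) keeps the first four stringified fields, the lambda injects the dummy BED score, and last keeps the strand.

from toolz.curried import map, take, compose, juxt, concat, last

record = ('chr1', 100, 200, 'interval-1', '+')
fields = list(map(str, record))
parts = juxt(compose(list, take(4)),       # ['chr1', '100', '200', 'interval-1']
             lambda _: ['0'],              # the bed_score field
             compose(list, last))(fields)  # ['+']
assert '\t'.join(concat(parts)) == 'chr1\t100\t200\tinterval-1\t0\t+'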
Example #10
def export_intervals(chanjo_db, include_header=True, bed_score=0):
  r"""Return BED-formatted interval lines from existing ``chanjo_db``.

  BED lines are ready to be printed or written to a file.

  Args:
    chanjo_db (session): ``sqlalchemy.orm.session`` object with a
      ``.query``-method
    include_header (bool, optional): whether to include BED header
    bed_score (int, optional): dummy score (0-1000) to insert at field 5
      to complete the BED format

  Yields:
    str: stringified and tab-delimited interval

  Examples:
    >>> from chanjo import export_intervals, Store
    ... # instantiate a new connection to a Chanjo database
    >>> db = Store('./coverage.sqlite')
    >>> with open('intervals.sorted.bed', 'w') as stream:
    ...   # write intervals in BED-format with appropriate headers
    ...   for bed_line in export_intervals(db):
    ...     stream.write(bed_line + '\n')
  """
  if include_header:
    yield '#chrom\tchromStart\tchromEnd\tname\tscore\tstrand'

  # set up which columns to fetch to make the BED file
  # column 5 is just a silly default for the "score" field in BED
  i = Interval  # alias
  columns = (i.contig, i.start - 1, i.end, i.id, i.strand)

  # BED files are tab-delimited
  delimiter = '\t'

  # 1. fetch interval tuples from the database (producer)
  # 2. stringify each item in each subsequence (interval tuple)
  # 3. join lines on tab-character
  # 4. prepend the header
  bed_lines = pipe(
    fetch_records(chanjo_db, columns),
    map(map(str)),                        # convert fields to strings
    map(juxt(compose(list, take(4)),      # keep first 4 fields
             lambda _: [str(bed_score)],  # insert BED score
             compose(list, last))),       # keep last field
    map(concat),                          # flatten each item
    map(delimiter.join)                   # join on \t
  )

  for bed_line in bed_lines:
    yield bed_line
Example #11
    def _make_samples(meta, shuffle):
        def _to_sample(person, images):
            # Random images needed for representation interpolation (3.5)
            x1 = _get_random_image()
            x2 = _get_random_image()
            return m(id=person["id_class"] - 1,
                     images=freeze(list(images)),
                     x1=freeze(x1),
                     x2=freeze(x2))

        samples = pipe(
            meta["persons"],
            tz.take(limit) if limit is not None else tz.identity,
            tz.map(lambda p: m(p=p,
                               i=tz.partition(
                                   args.N_images,
                                   _shuffled(p["images"])
                                   if shuffle else p["images"]))),
            tz.mapcat(lambda s: [_to_sample(s.p, i) for i in s.i]),
            tz.take(limit) if limit is not None else tz.identity, list)
        if shuffle:
            random.shuffle(samples)
        return samples
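A small sketch of the grouping step above (toolz as tz): tz.partition chops each person's image list into fixed-size tuples of args.N_images, and tz.mapcat then flattens one sample per tuple.

import toolz as tz

images = ["img0", "img1", "img2", "img3", "img4"]
groups = list(tz.partition(2, images))
# partition drops the leftover image rather than padding it
assert groups == [("img0", "img1"), ("img2", "img3")]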
Example #12
    def __init__(self, limit=None, schema=None, keep_properties=True, chunk=False):
        self.schema = schema
        self.limit = limit
        self.chunk = chunk or False
        self.set_property_filter(keep_properties)

        # Set up pipeline, in reverse order
        steps = [self.validate, self.process]
        if self.limit is not None:
            self.logger.debug('Loading %s features only', self.limit)
            steps.append(take(self.limit))
        if self.chunk:
            self.logger.debug('Features will arrive in batches of %s', self.chunk)
            steps.append(lambda it: grouper(self.chunk, it))
        self.pipeline = compose(*reversed(steps))
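A minimal sketch of the "build in reverse order, then compose" idea above, assuming toolz: compose applies its arguments right to left, so reversing the step list makes the pipeline run in the order the steps were appended.

from toolz import compose
from toolz.curried import take

steps = [take(3), list]               # appended in execution order
pipeline = compose(*reversed(steps))  # compose runs right-to-left
assert pipeline(range(10)) == [0, 1, 2]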
Example #13
def stop_by_no_improvement_parallel(logs: ListLogListType,
                                    extractor: ExtractorFnType,
                                    metric_name: str,
                                    early_stop: int = 3,
                                    threshold: float = 0.001) -> bool:
    """
    Checks for logs to see if feature selection should stop

    Parameters
    ----------
    logs : list of list of dict
        A list of log-like lists of dictionary evaluations.

    extractor: function str -> float
        An extractor that takes a string and returns the value of that string from a dict

    metric_name: str
        String with the name of the column that refers to the metric column to be extracted

    early_stop: int (default 3)
        Number of iterations without improvements before stopping

    threshold: float (default 0.001)
        Threshold for model performance comparison

    Returns
    ----------
    stop: bool
        A boolean indicating whether to stop the recursion or not
    """

    if len(logs) < early_stop:
        return False

    log_list = [
        get_best_performing_log(log, extractor, metric_name) for log in logs
    ]

    limited_logs = list(take(early_stop, log_list))
    curr_auc = get_avg_metric_from_extractor(limited_logs[-1], extractor,
                                             metric_name)

    return all([
        (curr_auc - get_avg_metric_from_extractor(log, extractor, metric_name))
        <= threshold for log in limited_logs[:-1]
    ])
Example #14
def txt_parser(filelike, max_num_nodes=MAXINT):
    if isinstance(filelike, io.IOBase):
        fileobj = filelike
    else:  # assume filename
        fileobj = open(filelike, 'rb')
    g = nx.DiGraph()
    # this pipe assumes there are no empty lines at the start of the file
    records = tz.pipe(fileobj,
                      c.map(_decode),
                      c.partitionby(_line_is_empty),  # split on empty lines
                      c.take_nth(2),  # discard those empty lines
                      c.take(max_num_nodes),
                      c.map(get_record))
    for record in records:
        g.add_node(record['index'], attr_dict=record)
        for reference in record.get('references', []):
            g.add_edge(record['index'], reference)
    return g
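A small illustration of the "split on empty lines" trick above, with curried toolz as c: partitionby groups consecutive lines by emptiness and take_nth(2) keeps every other group, i.e. discards the empty-line separators (which, as the comment notes, relies on the file not starting with an empty line).

import toolz.curried as c

lines = ['a1', 'a2', '', 'b1', '', 'c1', 'c2']
records = c.pipe(lines,
                 c.partitionby(lambda line: line == ''),  # group runs of (non-)empty lines
                 c.take_nth(2),                           # keep record groups, drop separators
                 list)
assert records == [('a1', 'a2'), ('b1',), ('c1', 'c2')]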
Example #15
def sample_url_line_delimited(data, lines=5, encoding='utf-8'):
    """Get a size `length` sample from an URL CSV or URL line-delimited JSON.

    Parameters
    ----------
    data : URL(CSV)
        A hosted CSV
    lines : int, optional, default ``5``
        Number of lines to read into memory
    """

    with closing(urlopen(data.url)) as r:
        raw = pipe(r, take(lines), map(bytes.strip),
                   curry(codecs.iterdecode, encoding=encoding),
                   b'\n'.decode(encoding).join)
        with tmpfile(data.filename) as fn:
            with codecs.open(fn, 'wb', encoding=encoding) as f:
                f.write(raw)
            yield fn
Example #16
def sample_url_line_delimited(data, lines=5, encoding="utf-8", timeout=None):
    """Get a size `length` sample from an URL CSV or URL line-delimited JSON.

    Parameters
    ----------
    data : URL(CSV)
        A hosted CSV
    lines : int, optional, default ``5``
        Number of lines to read into memory
    """

    with closing(urlopen(data.url, timeout=timeout)) as r:
        raw = pipe(
            r, take(lines), map(bytes.strip), curry(codecs.iterdecode, encoding=encoding), b"\n".decode(encoding).join
        )
        with tmpfile(data.filename) as fn:
            with codecs.open(fn, "wb", encoding=encoding) as f:
                f.write(raw)
            yield fn
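An offline illustration of the sampling pipe above (no network; curried toolz assumed): take the first few raw byte lines, strip them, decode them lazily, and join them back into a single text blob.

import codecs
from toolz import curry
from toolz.curried import pipe, take, map

raw_lines = [b'a,1\n', b'b,2\n', b'c,3\n', b'd,4\n']
sample = pipe(raw_lines, take(2), map(bytes.strip),
              curry(codecs.iterdecode, encoding='utf-8'),
              b'\n'.decode('utf-8').join)
assert sample == 'a,1\nb,2'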
Example #17
    def from_file_path(cls,
                       file_path: FilePath,
                       sheet_name: str,
                       *,
                       row_limit: int = 100):
        """Help function to populate the columns of a sheet."""
        wb = get_wb(file_path)
        ws = wb[sheet_name]
        rows = tz.take(row_limit, ws.rows)
        header = next(rows)
        names = [c.value for c in header]
        letters = [c.column_letter for c in header]
        indices = [c.column for c in header]
        data_types = tz.pipe(
            rows,
            # For each row, create a dict using names as keys
            tz.map(lambda row: dict(zip(names, row))),
            # Get the .xlsx data_type for each cell
            tz.map(tz.valmap(lambda cell: cell.data_type)),
            # Combine cells into a list per column
            tz.merge_with(list),
            # Count the cells for each data type in the column
            tz.valmap(tz.frequencies),
            # Consolidate types
            tz.valmap(lambda freq: (
                # If at least 1 "d"
                "date" if "d" in freq else
                # If at least 1 "s"
                "text" if "s" in freq else
                # If at least 1 "n"
                "number" if "n" in freq else str(freq))),
            lambda d: [v for k, v in d.items()])

        cols = [
            Col(name=N, letter=L, index=I, data_type=D)
            for N, L, I, D in zip(names, letters, indices, data_types)
        ]
        return cls(name=sheet_name, cols=cols)
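A tiny illustration of the column-type consolidation above, with hypothetical .xlsx type codes and toolz as tz: cells are merged per column, their type codes counted, and the counts collapsed to a single label.

import toolz as tz
from toolz.curried import valmap

rows = [{"when": "d", "name": "s"}, {"when": "d", "name": "n"}]
col_types = tz.pipe(
    rows,
    lambda rs: tz.merge_with(list, *rs),  # {'when': ['d', 'd'], 'name': ['s', 'n']}
    valmap(tz.frequencies),               # {'when': {'d': 2}, 'name': {'s': 1, 'n': 1}}
    valmap(lambda freq: "date" if "d" in freq
           else "text" if "s" in freq
           else "number" if "n" in freq else str(freq)))
assert col_types == {"when": "date", "name": "text"}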
Example #18
def test_take():
    assert list(take(2)([1, 2, 3])) == [1, 2]
Example #19
def head(data: Table, limit=100) -> Table:
    """Returns the first {limit} records of a Table."""
    return list(tz.take(limit, data))
Example #20
def cols2hrs24(df):
    "Convert columns from `12:00 am,  1:00 am, ...11:00 pm` to `0, 1, ...23`"
    hrs = z.pipe(range(1, 13), it.cycle, z.drop(11), z.take(12), list)
    hrs24 = ['{}:00 {}'.format(hr, half) for half in ('am', 'pm') for hr in hrs]
    assert all(df.columns[2:] == hrs24), "Expecting columns of form `12:00 am,  1:00 am, ...11:00 pm`"
    return df.rename(columns=dict(zip(hrs24, map(str, range(24)))))
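For reference, the hour pipeline above unrolled (assuming z is toolz.curried and it is itertools, as the example implies): cycling 1..12 and dropping the first 11 values starts the sequence at 12, so take(12) yields the 12-hour clock order used in the column labels.

import itertools as it
import toolz.curried as z

hrs = z.pipe(range(1, 13), it.cycle, z.drop(11), z.take(12), list)
assert hrs == [12] + list(range(1, 12))  # [12, 1, 2, ..., 11]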
Example #21
def test_toolz_take(executor):
    actual = executor(take(5), range(10), npartitions=3)
    assert list(actual) == [0, 1, 2, 3, 4]
Example #22
def edges_from_cycle(c: Cycle) -> Complex:
    return pipe(c, cycle, sliding_window(2), take(len(c)), map(pset), pset)
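A rough, plain-itertools illustration of the idea above, with a hypothetical three-vertex cycle: sliding a window of 2 over the cycled vertices and taking len(c) windows yields every edge, including the closing edge back to the first vertex (the real code additionally wraps each edge and the result in pyrsistent psets).

from itertools import cycle, islice
from toolz import sliding_window

c = ["a", "b", "c"]
edges = list(islice(sliding_window(2, cycle(c)), len(c)))
assert edges == [("a", "b"), ("b", "c"), ("c", "a")]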
Example #23
def remove_by_feature_shuffling(log: LogType,
                                predict_fn: PredictFnType,
                                eval_fn: EvalFnType,
                                eval_data: pd.DataFrame,
                                extractor: ExtractorFnType,
                                metric_name: str,
                                max_removed_by_step: int = 50,
                                threshold: float = 0.005,
                                speed_up_by_importance: bool = False,
                                parallel: bool = False,
                                nthread: int = 1,
                                seed: int = 7) -> List[str]:
    """
        Performs feature selection based on the evaluation of the test set vs the
        evaluation of the test set with randomly shuffled features

        Parameters
        ----------
        log : LogType
            Dictionary of evaluations.

        predict_fn: function pandas.DataFrame -> pandas.DataFrame
            A partially defined predictor that takes a DataFrame and returns the
            predicted score for this dataframe

        eval_fn : function DataFrame -> log dict
            A partially defined evaluation function that takes a dataset with prediction and
            returns the evaluation logs.

        eval_data: pandas.DataFrame
            Data used to evaluate the model after shuffling

        extractor: function str -> float
            An extractor that takes a string and returns the value of that string from a dict

        metric_name: str
            String with the name of the column that refers to the metric column to be extracted

        max_removed_by_step: int (default 50)
            The maximum number of features to remove. It will only consider the least max_removed_by_step in terms of
            feature importance. If speed_up_by_importance=True it will first filter the least relevant features and
            shuffle only those. If speed_up_by_importance=False it will shuffle all features and drop the last
            max_removed_by_step in terms of PIMP. In both cases, the features will only be removed if the drop in
            performance is within the defined threshold.

        threshold: float (default 0.005)
            Threshold for model performance comparison

        speed_up_by_importance: bool (default False)
            If it should narrow the search by looking at feature importance first before getting PIMP importance. If True,
            will only shuffle the top max_removed_by_step in terms of feature importance.

        parallel: bool (default False)

        nthread: int (default 1)

        seed: int (default 7)
            Random seed

        Returns
        ----------
        features: list of str
            The remaining features after removing based on feature importance

    """
    random.seed(seed)

    curr_metric = get_avg_metric_from_extractor(log, extractor, metric_name)
    eval_size = eval_data.shape[0]

    features_to_shuffle = order_feature_importance_avg_from_logs(log)[-max_removed_by_step:] \
        if speed_up_by_importance else get_used_features(log)

    def shuffle(feature: str) -> pd.DataFrame:
        return eval_data.assign(
            **{feature: eval_data[feature].sample(frac=1.0)})

    feature_to_delta_metric = compose(
        lambda m: curr_metric - m,
        get_avg_metric_from_extractor(extractor=extractor,
                                      metric_name=metric_name),
        gen_validator_log(fold_num=0, test_size=eval_size), eval_fn,
        predict_fn, shuffle)

    if parallel:
        metrics = Parallel(n_jobs=nthread, backend="threading")(
            delayed(feature_to_delta_metric)(feature)
            for feature in features_to_shuffle)
        feature_to_delta_metric = dict(zip(features_to_shuffle, metrics))
        gc.collect()

    else:
        feature_to_delta_metric = {
            feature: feature_to_delta_metric(feature)
            for feature in features_to_shuffle
        }

    return pipe(feature_to_delta_metric,
                valfilter(lambda delta_metric: delta_metric < threshold),
                sorted(key=lambda f: feature_to_delta_metric.get(f)),
                take(max_removed_by_step), list)
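A rough sketch of the final selection pipe above, with made-up metric deltas and curried toolz: keep features whose shuffling barely hurt the metric, order them by how little they hurt, and cap the count at max_removed_by_step.

from toolz.curried import pipe, take, valfilter
from toolz.curried import sorted as csorted

deltas = {"f1": 0.0001, "f2": 0.2, "f3": -0.001}
selected = pipe(deltas,
                valfilter(lambda d: d < 0.005),  # shuffling these barely changed the metric
                csorted(key=deltas.get),         # least impactful first
                take(2), list)
assert selected == ["f3", "f1"]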
Example #24
    im = ax.imshow(model, cmap='magma')
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.8])
    plt.colorbar(im, cax=axcolor)
    for axis in [ax.xaxis, ax.yaxis]:
        axis.set_ticks(range(8))
        axis.set_ticks_position('none')
        axis.set_ticklabels(labels)
    plt.savefig('./8_3_markov_model.png')


if __name__ == "__main__":
    dm = '../data/dm6.fa'
    dm_gz = '../data/dm6.fa.gz'
    demo = False
    if demo:
        model = tz.pipe(dm_gz, genome, c.take(10**7), markov)
    else:
        model = tz.pipe(dm_gz, genome, markov)
    print('The model is:\n')
    print('   ', '     '.join('ACGTacgt'), '\n')
    print(model)
    print('visualization ...')
    plot_model(model, labels='ACGTacgt')
    '''
    The dictionary is
    {('A', 'A'): (0, 0), ('A', 'C'): (0, 1), ('A', 'G'): (0, 2), ('A', 'T'): (0, 3), ('A', 'a'): (0, 4), ('A', 'c'): (0, 5), ('A', 'g'): (0, 6), ('A', 't'): (0, 7), ('C', 'A'): (1, 0), ('C', 'C'): (1, 1), ('C', 'G'): (1, 2), ('C', 'T'): (1, 3), ('C', 'a'): (1, 4), ('C', 'c'): (1, 5), ('C', 'g'): (1, 6), ('C', 't'): (1, 7), ('G', 'A'): (2, 0), ('G', 'C'): (2, 1), ('G', 'G'): (2, 2), ('G', 'T'): (2, 3), ('G', 'a'): (2, 4), ('G', 'c'): (2, 5), ('G', 'g'): (2, 6), ('G', 't'): (2, 7), ('T', 'A'): (3, 0), ('T', 'C'): (3, 1), ('T', 'G'): (3, 2), ('T', 'T'): (3, 3), ('T', 'a'): (3, 4), ('T', 'c'): (3, 5), ('T', 'g'): (3, 6), ('T', 't'): (3, 7), ('a', 'A'): (4, 0), ('a', 'C'): (4, 1), ('a', 'G'): (4, 2), ('a', 'T'): (4, 3), ('a', 'a'): (4, 4), ('a', 'c'): (4, 5), ('a', 'g'): (4, 6), ('a', 't'): (4, 7), ('c', 'A'): (5, 0), ('c', 'C'): (5, 1), ('c', 'G'): (5, 2), ('c', 'T'): (5, 3), ('c', 'a'): (5, 4), ('c', 'c'): (5, 5), ('c', 'g'): (5, 6), ('c', 't'): (5, 7), ('g', 'A'): (6, 0), ('g', 'C'): (6, 1), ('g', 'G'): (6, 2), ('g', 'T'): (6, 3), ('g', 'a'): (6, 4), ('g', 'c'): (6, 5), ('g', 'g'): (6, 6), ('g', 't'): (6, 7), ('t', 'A'): (7, 0), ('t', 'C'): (7, 1), ('t', 'G'): (7, 2), ('t', 'T'): (7, 3), ('t', 'a'): (7, 4), ('t', 'c'): (7, 5), ('t', 'g'): (7, 6), ('t', 't'): (7, 7)}
    The model is:

        A     C     G     T     a     c     g     t 

    [[0.351 0.181 0.189 0.279 0.    0.    0.    0.   ]