def test_parses_figures_to_superpositions():
    """Confirm figures yield expected superpositions."""
    superpositions = (Superpositions.of_valid_figures(), Superpositions.of_flawed_figures())
    expected_superpositions = pipe(superpositions, concat, tuple)
    figures = (Figures.valid(), Figures.flawed())
    found_superpositions = pipe(figures, concat, superpositions_from_figures, tuple)
    assert expected_superpositions == found_superpositions

def parser(filename, *args, **kwargs):
    g = nx.DiGraph()
    tz.pipe(filename,
            c_open(mode='r'),
            c.map(str.strip),
            c.map(c_split(sep=',')),
            g.add_edges_from)
    return g

def process(paths, load_, transform_, filter_, sink_):
    """Generic pipeline.

    :param paths: input paths
    :param load_: data loading function
    :param transform_: transformation function
    :param filter_: filter function
    :param sink_: output function
    :return:
    """
    for path in paths:
        pipe(path, load_, transform_, filter_, sink_)

def do_localizer_block(event_listener, target):
    start = time.time()
    target_is_face = target == BlockTarget.FACE
    stim_orientations = get_stim_1_orientations()
    source = res.faces() if target_is_face else res.houses()
    stim_list = pipe(random_elem(source), take(len(stim_orientations)), list)
    face_list, house_list = flip_if(not target_is_face, (
        lmap(lambda ori, stim: stim[ori], stim_orientations, stim_list),
        [None] * len(stim_list)))
    display_onsets, decisions, decision_onsets, RTs, ITIs = \
        do_trials(event_listener, face_list, house_list)
    return {
        "time": (start, time.time()),
        "target": target.name
    }, {
        "presentations_onset": display_onsets,
        "decision_onset": decision_onsets,
        "decision": [ori.name if ori else "None" for ori in decisions],
        "RT": RTs,
        "following_ITI": ITIs,
        "stim_orientation": [ori.name for ori in stim_orientations],
        "stim_id": [stim.name for stim in stim_list]
    }

def main(input_path: str, do_delete: bool, raw_exts: Tuple[str],
         processed_exts: Tuple[str]) -> None:
    """Trawls the given directory for raw and processed images.

    Where a raw image has a numeric index that cannot be found among the
    processed images, it is marked for removal. Any sequence of 3 to 6
    (inclusive) digits in the image filename is treated as its index, which
    is used to associate processed and raw images.

    Processed images may also have a filename format with a range of indexes,
    e.g. IMG_01234-1236.jpg. This processed file would be associated with the
    raw images IMG_01234.cr3, IMG_01235.cr3 and IMG_01236.cr3, ensuring they
    are not deleted. This is useful for HDR or panoramic processed images.
    """
    pipe(input_path,
         directory_walker(list(raw_exts + processed_exts)),
         purge(list(raw_exts), indexer),
         deleter if do_delete else fake_deleter)

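# Hedged sketch, not the project's actual `indexer`: one possible way to
# recover the 3-6 digit indexes described in the docstring above, including
# the IMG_01234-1236.jpg range form. The name `example_indexer` is an
# illustrative assumption.
import re


def example_indexer(filename):
    """Return the set of integer indexes a filename refers to."""
    indexes = set()
    for match in re.finditer(r'(\d{3,6})(?:-(\d{3,6}))?', filename):
        start = int(match.group(1))
        end = int(match.group(2)) if match.group(2) else start
        indexes.update(range(start, end + 1))
    return indexes

# example_indexer('IMG_01234-1236.jpg') -> {1234, 1235, 1236}
# example_indexer('IMG_01234.cr3')      -> {1234}
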
def process_node(node):
    nonlocal current_base
    children = node.get('children')
    if children:
        # Sub-total bar
        new_children = pipe(children, map(process_node), filter(None), list)
        if new_children:
            return merge(
                node, {
                    'bar_type': 'sub_total',
                    'base': new_children[0]["base"],
                    'children': new_children,
                    'value': sum(child["value"] for child in new_children),
                })
    elif node["value"]:
        # Value bar
        value_bar = merge(node, {
            'bar_type': "value",
            'base': current_base,
        })
        current_base = current_base + node["value"]
        return value_bar
    # Leaf node with value of 0, skip
    return None

def entity_cooccurrence(entities, tweets, vocab):
    """Creates a cooccurrence matrix of entities in tweets

    :param entities: List of entity types to include
    :param tweets: Iterator of tweets
    :param vocab: Dictionary mapping terms to index
    :returns: A sparse matrix with the occurrences

    >>> # requires a vocab dictionary, which can be
    >>> # created from the `entity_counts`.
    >>>
    >>> tweets = prep_tweets(tweets)
    >>>
    >>> counts = entity_counts(['urls', 'hashtags'], tweets)
    >>> terms = {k for k, v in counts.items() if v > 5}
    >>> vocab = {k: i for i, k in enumerate(terms)}
    >>>
    >>> entity_cooccurrence(['urls', 'hashtags'], tweets, vocab)
    """
    fns = [
        filter(lambda x: x is not None),
        map(select_ents(entities)),
        map(flatten),
        map(uniq),
        map(lambda l: [x for x in l if x in vocab]),
        map(lambda d: permutations(d, 2)),
        flatten,
        partial(encode_tuples, vocab),
        co_matrix
    ]
    return pipe(tweets, *fns)

def prep_tweets(tweets):
    """Prepares tweets for entity analysis

    This function treats retweets as tweets and removes all duplicates.
    Thus, if tweet A is retweeted 10 times in the corpus, it will only show
    up once in the tweets returned by prep_tweets.

    :param tweets: iterator of tweets
    :returns: generator of entities

    >>> raw_tweets = [{'id': 2345, 'entities': [], ...},
    >>>               {'id': 9874, 'entities': [], ...}]
    >>>
    >>> tweets = prep_tweets(raw_tweets)
    """
    pipeline = [
        map(replace_retweets),
        deduplicate,
        map(handle_truncated),
        map(simplify_entities),
        map(get_in(['entities']))
    ]
    return pipe(tweets, *pipeline)

def compute_down(expr, data, **kwargs):
    """Compile a blaze expression to a sparksql expression."""
    leaves = expr._leaves()

    # make sure we only have a single leaf node
    if len(leaves) != 1:
        raise ValueError('Must compile from exactly one root database')

    leaf, = leaves

    # field expressions on the database are Field instances with a record
    # measure whose immediate child is the database leaf
    tables = pipe(expr._subterms(), filter(istable(leaf)), list)

    # raise if we don't have tables in our database
    if not tables:
        raise ValueError('Expressions not referencing a table cannot be '
                         'compiled')

    # make new symbols for each table
    new_leaves = [symbol(t._name, t.dshape) for t in tables]

    # sub them in the expression
    expr = expr._subs(dict(zip(tables, new_leaves)))

    # compute using sqlalchemy
    scope = dict(zip(new_leaves, map(make_sqlalchemy_table, tables)))
    query = compute(expr, scope)

    # interpolate params
    compiled = literalquery(query, dialect=HiveDialect())
    return data.sql(str(compiled))

def gender_from_bam(bam_path, prefix=''):
    """Predict the gender from a BAM alignment file.

    Args:
        bam_path (path): path to a BAM alignment file
        prefix (str, optional): string to prefix to 'X', 'Y'

    Returns:
        Gender: tuple of X coverage, Y coverage, and sex prediction

    Examples:
        >>> gender_from_bam('alignment.bam', prefix='chr')
        Gender(x_coverage=123.31, y_coverage=0.13, sex='female')
    """
    # setup: connect to a BAM file
    bam = BamFile(bam_path)

    # step 0: fake some BED interval rows (already 1,1-based!)
    fake_bed_rows = [("%sX" % prefix, 1, 59373566),
                     ("%sY" % prefix, 69362, 11375310)]

    # step 1: run the pipeline
    sequence = pipe(
        fake_bed_rows,
        map(lambda interval: bam(*interval)),
        map(average)
    )

    # step 2: make the prediction
    x_coverage, y_coverage = list(sequence)
    sex = predict_gender(x_coverage, y_coverage)
    return Gender(x_coverage, y_coverage, sex)

def md_link_to_html(txt, other_browser=True):
    """
    In txt, change Markdown formatted links to HTML. (Only links are touched)

    >>> md_link_to_html("derp ferp (f) [t](l).")
    'derp ferp (f) <a target="_blank" href="l">t</a>.'
    >>> md_link_to_html("derp ferp (f) [t](l).", False)
    'derp ferp (f) <a href="l">t</a>.'
    """
    # Because it's unlikely that links are duplicated, don't worry about
    # duplicates. Also, md_link_to_html is idempotent. And, this is only
    # intended for small local jobs so it will be plenty fast.
    return tlz.pipe(
        txt,
        parse_links,
        ctlz.map(
            lambda tandl: {
                "html": build_html_link(tandl[0], tandl[1], other_browser),
                "md": build_md_link(tandl[0], tandl[1])
            }),
        lambda links: tlz.compose(*map(
            lambda l: lambda text: text.replace(l['md'], l['html']),
            links)),
        lambda f: f(txt),
    )

def test_dict_to_json():
    """Test to_json

    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_data_with_values(10)
    try:
        result1 = pipe(data, to_json)
        result2 = pipe(data, to_json)
        filename = result1["url"]
        output = pd.read_json(filename).to_dict(orient="records")
    finally:
        os.remove(filename)
    assert result1 == result2
    assert data == {"values": output}

def test_dataframe_to_json():
    """Test to_json

    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_dataframe(10)
    try:
        result1 = pipe(data, to_json)
        result2 = pipe(data, to_json)
        filename = result1["url"]
        output = pd.read_json(filename)
    finally:
        os.remove(filename)
    assert result1 == result2
    assert output.equals(data)

def get_comment(identifier):
    with suppress(PostDoesNotExist):
        return pipe(
            Post(identifier).export(),
            strip_dot_from_keys,
            safe_json_metadata
        )

def plot(self, gpu_measurement='sm', num_gpus=1, plot_width=600,
         plot_height=400, y_range=(0, 110)):
    """ Plot the specified GPU measurement

    Parameters
    ----------
    gpu_measurement: GPU measurement to plot; possible values are
        ['pwr', 'temp', 'sm', 'mem', 'enc', 'dec', 'mclk', 'pclk']
    num_gpus: Number of GPUs to plot
    plot_width:
    plot_height:
    y_range:

    Returns
    -------
    Bokeh Figure
    """
    df = pipe(self._log_file, parse_log, extract(gpu_measurement))
    return plot(df,
                num_gpus=num_gpus,
                plot_width=plot_width,
                plot_height=plot_height,
                y_range=y_range)

def overlap_internal(x, axes):
    """ Share boundaries between neighboring blocks

    Parameters
    ----------
    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes input informs how many cells to overlap between neighboring
    blocks: {0: 2, 2: 5} means share two cells in 0 axis, 5 cells in 2 axis.
    """
    dims = list(map(len, x.chunks))
    expand_key2 = partial(expand_key, dims=dims, axes=axes)

    # Make keys for each of the surrounding sub-arrays
    interior_keys = pipe(x.__dask_keys__(), flatten, map(expand_key2),
                         map(flatten), concat, list)

    name = 'overlap-' + tokenize(x, axes)
    getitem_name = 'getitem-' + tokenize(x, axes)
    interior_slices = {}
    overlap_blocks = {}
    for k in interior_keys:
        frac_slice = fractional_slice((x.name,) + k, axes)
        if (x.name,) + k != frac_slice:
            interior_slices[(getitem_name,) + k] = frac_slice
        else:
            interior_slices[(getitem_name,) + k] = (x.name,) + k

        overlap_blocks[(name,) + k] = (
            concatenate3,
            (concrete, expand_key2((None,) + k, name=getitem_name)),
        )

    chunks = []
    for i, bds in enumerate(x.chunks):
        depth = axes.get(i, 0)
        if isinstance(depth, tuple):
            left_depth = depth[0]
            right_depth = depth[1]
        else:
            left_depth = depth
            right_depth = depth

        if len(bds) == 1:
            chunks.append(bds)
        else:
            left = [bds[0] + left_depth]
            right = [bds[-1] + right_depth]
            mid = []
            for bd in bds[1:-1]:
                mid.append(bd + left_depth + right_depth)
            chunks.append(left + mid + right)

    dsk = merge(interior_slices, overlap_blocks)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])

    return Array(graph, name, chunks, meta=x)

def get_discrete_split_value(arr: np.ndarray, y: np.ndarray, eval_func: Callable):
    """
    Function to get the value of making a discrete split.

    Parameters
    ----------
    arr : np.ndarray
        The feature array
    y : np.ndarray
        The target array
    eval_func : Callable
        The function to evaluate the splits.
    """
    # First element is the weighted average eval_func of the split.
    # Second term is the intrinsic value to penalize many splits.
    return (
        sum([
            eval_func(y[arr == value]) * np.sum(arr == value) / len(y)
            for value in set(arr)
        ]),
        -1 * sum([
            pipe(
                np.sum(arr == value) / len(y),
                lambda ratio: ratio * np.log(ratio),
            ) for value in set(arr)
        ]),
    )

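# Hedged illustration (toy data; the names below are assumptions, not the
# project's API): the second element returned above is the intrinsic value
# -sum(p * ln(p)) over the category proportions, which penalizes splits that
# fan out into many small partitions.
import numpy as np

toy_arr = np.array(['a'] * 6 + ['b'] * 4)
proportions = [np.sum(toy_arr == v) / len(toy_arr) for v in set(toy_arr)]
intrinsic_value = -sum(p * np.log(p) for p in proportions)
# intrinsic_value ~= 0.673 for a 6/4 split; it approaches ln(k) for k
# equal-sized partitions, so finer splits pay a larger penalty.
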
def cli(board_source, key, token, to, output, board):
    """Hi, I'm TrelloScribe. I take Trello boards and turn them into documents!"""
    # Compose a sequence of functions based on the options chosen
    # Note toolz.compose() works right to left
    read_phase = {
        'id': download_board(key, token),
        'name': toolz.compose(download_board(key, token), search_boards(key, token)),
        'file': read_board
    }
    convert_phase = {
        'raw': partial(json.dumps, indent=2),
        'md': ast_to_md,
        'html': toolz.compose(md_to_html, ast_to_md)
    }
    toolz.pipe(board, read_phase[board_source], trello_to_ast,
               convert_phase[to], partial(click.echo, file=output))

def ccds_to_bed(ccds_stream):
    """Convert CCDS dump to Chanjo-style BED stream.

    Main entry point for default Chanjo converter (ccds). It converts
    a sorted (start, chrom) CCDS database to the Chanjo BED-format.

    Args:
        ccds_stream (file): file handle to read CCDS lines from

    Yields:
        Interval: interval with merged block and superblock ids
    """
    return pipe(
        ccds_stream,
        filter(grep('Public')),                           # keep only Public tx
        map(text_type.rstrip),                            # strip \n and spaces
        map(split(sep='\t')),                             # split into list
        map(extract_intervals),                           # convert to Interval
        concat,                                           # flatten
        map(rename_sex_interval),                         # rename sex contigs
        partial(lazy_groupby, key=attrgetter('contig')),  # group by contig
        pluck(1),                                         # extract second item
        map(groupby(attrgetter('name'))),                 # non-lazy group by id
        map(valmap(merge_related_elements)),              # group intervals
        map(itervalues),                                  # extract values
        map(partial(sorted, key=attrgetter('start'))),    # sort by start pos
        concat                                            # flatten
    )

def streaming_pca(samples, n_components=2, batch_size=50):
    ipca = decomposition.IncrementalPCA(n_components=n_components,
                                        batch_size=batch_size)
    _ = list(tz.pipe(samples,
                     curried.partition(batch_size),
                     curried.map(np.array),
                     curried.map(ipca.partial_fit)))
    return ipca

def __get_rows(data, max_length_per_column):
    return pipe(
        data,
        iterkeys,
        map(lambda key: __get_row(data, key, max_length_per_column)),
        reduce(lambda x, y: x + y)
    )

def __get_all_metrics_for_each_class(self):
    def __get_all_metrics_for_class(confusion_table):
        return pmap({
            str(confusion_table.get_class_name()): pmap({
                "Accuracy": confusion_table.accuracy,
                "Precision": confusion_table.precision,
                "Recall": confusion_table.recall,
                "Specificity": confusion_table.specificity,
                "F1score": confusion_table.f1score,
                "Fall Out": confusion_table.fall_out,
                "Miss Rate": confusion_table.miss_rate,
                "False Discovery Rate": confusion_table.FDR,
                "False Omission Rate": confusion_table.FOR,
                "Negative Predictive Value": confusion_table.NPV,
                "Positive Likelihood Ratio": confusion_table.PLR,
                "Negative Likelihood Ratio": confusion_table.NLR,
                "Diagnostic Odds Ratio": confusion_table.DOR,
            })
        })

    return pipe(
        self.__confusion_tables,
        itervalues,
        map(__get_all_metrics_for_class),
        reduce(lambda x, y: x + y),
    )

def freq(tokenset):
    """
    Find the number of occurrences of each value in 'tokenset'.
    """
    return tlz.pipe(tokenset, tlz.frequencies, dict.items)

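# Hedged usage sketch (illustrative input, not from the project's tests):
# tlz.frequencies counts each token, so the result is an items view of
# (token, count) pairs.
#
#   list(freq(['to', 'be', 'or', 'not', 'to', 'be']))
#   # -> [('to', 2), ('be', 2), ('or', 1), ('not', 1)]
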
def count_predictions(filtered_predictions_list, target_label):
    return pipe(
        filtered_predictions_list,
        filter(lambda (_, x): x == target_label),
        list,
        len
    )

def load_all_users():
    '''Returns a pd.DataFrame with the information of all the users'''
    map = tlz.curry(map)
    dataset = tlz.pipe(users,
                       map(parse_exp03_filename),
                       map(user_pipe),
                       accumulate_users)
    dataset.insert(0, 'user', sorted(users * 3))
    return dataset

def get(dsk, keys, optimizations=[fuse], num_workers=cpu_count):
    """ Multiprocessed get function appropriate for Bags """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(psutil.cpu_count())
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = dill_apply_async(pool.apply_async)

    # Optimize Dask
    dsk2 = pipe(dsk, partial(cull, keys=keys), *optimizations)

    try:
        # Run
        result = get_async(apply_async, cpu_count, dsk2, keys, queue=queue)
    finally:
        if cleanup:
            pool.close()
    return result

def ghost_internal(x, axes):
    """ Share boundaries between neighboring blocks

    Parameters
    ----------
    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes dict informs how many cells to overlap between neighboring
    blocks: {0: 2, 2: 5} means share two cells in 0 axis, 5 cells in 2 axis.
    """
    dims = list(map(len, x.blockdims))
    expand_key2 = partial(expand_key, dims=dims)
    interior_keys = pipe(x._keys(), flatten,
                         map(expand_key2), map(flatten),
                         concat, list)
    interior_slices = dict((k, fractional_slice(k, axes))
                           for k in interior_keys)

    shape = (3,) * x.ndim
    name = next(ghost_names)
    ghost_blocks = dict(((name,) + k[1:],
                         (rec_concatenate, (concrete, expand_key2(k))))
                        for k in interior_keys)

    blockdims = [[bds[0] + axes.get(i, 0)]
                 + [bd + axes.get(i, 0) * 2 for bd in bds[1:-1]]
                 + [bds[-1] + axes.get(i, 0)]
                 for i, bds in enumerate(x.blockdims)]

    return Array(merge(interior_slices, ghost_blocks, x.dask),
                 name, blockdims=blockdims)

def ngram_tuples(n, string, minlen=3, maxlen=25):
    """
    Creates ngram tuples of size 'n' from 'string'.
    Also, changes string to lowercase, removes generic stopwords and splits
    on all non alphanumeric.

    Ex:
        In [2]: list(ngram_tuples(n=1, string='Just another example text.'))
        Out[2]: [('another',), ('example',), ('text',)]

        In [2]: list(ngram_tuples(n=2, string='Just another example text.'))
        Out[2]: [('another', 'example'), ('example', 'text')]

        In [11]: list(ngram_tuples(3, 'I needed a longer example text for this example.'))
        Out[11]: [('needed', 'longer', 'example'),
                  ('longer', 'example', 'text'),
                  ('example', 'text', 'example')]

    minlen - filter out words that have fewer characters than 'minlen'.
    maxlen - filter out words that have more characters than 'maxlen'.
    """
    return tlz.pipe(string,
                    lower,
                    simple_split,
                    filter_longer_than(maxlen),
                    tlz.compose(tlz.concat, map_c(splitter_of_words)),
                    filter_shorter_than(minlen),
                    filter_stopwords,
                    sliding_window_c(n))

def compute_up(expr, data, **kwargs):
    if not valid_grouper(expr):
        raise TypeError("Grouper must have a non-nested record or one "
                        "dimensional collection datashape, "
                        "got %s of type %r with dshape %s" %
                        (expr.grouper, type(expr.grouper).__name__, expr.dshape))

    s = alias_it(data)

    if valid_reducer(expr.apply):
        reduction = compute(expr.apply, s, post_compute=False)
    else:
        raise TypeError('apply must be a Summary expression')

    grouper = get_inner_columns(compute(expr.grouper, s, post_compute=False))
    reduction_columns = pipe(reduction.inner_columns,
                             map(get_inner_columns),
                             concat)
    columns = list(unique(chain(grouper, reduction_columns)))
    if (not isinstance(s, sa.sql.selectable.Alias) or
            (hasattr(s, 'froms') and isinstance(s.froms[0], sa.sql.selectable.Join))):
        assert len(s.froms) == 1, 'only a single FROM clause supported for now'
        from_obj, = s.froms
    else:
        from_obj = None

    return reconstruct_select(columns,
                              getattr(s, 'element', s),
                              from_obj=from_obj,
                              group_by=grouper)

def alpino(doc, output="raw"): """Wrapper around the Alpino (dependency) parser for Dutch. Expects an environment variable ALPINO_HOME to point at the Alpino installation dir. The script uses the 'dependencies' end_hook to generate lemmata and the dependency structure. Parameters ---------- output : string If 'raw', returns the raw output from Alpino itself. If 'saf', returns a SAF dictionary. References ---------- `Alpino homepage <http://www.let.rug.nl/vannoord/alp/Alpino/>`_ """ from ._alpino import tokenize, parse_raw, interpret_parse try: transf = {"raw": identity, "saf": interpret_parse}[output] except KeyError: raise ValueError("Unknown output format %r" % output) return pipe(doc, fetch, tokenize, parse_raw, transf)
def main():
    transforms = [
        t.parentdir_expand,
        t.unambiguous_path,
        t.physical_path
    ]
    print(pipe(sys.argv[1], *transforms))

def test__filter_stopwords(tokenset, count):
    assert(tlz.pipe(tokenset,
                    utils.filter_stopwords,
                    list,
                    len,
                    lambda length: length == count,
                    ))

def get_min_across_splits_continuous(arr: np.ndarray, y: np.ndarray,
                                     splits: np.ndarray, eval_func: Callable):
    """
    Function to get the best split across many proposed splits.

    Parameters
    ----------
    arr : np.ndarray
        The feature array to split on
    y : np.ndarray
        The target array
    splits : np.ndarray
        The proposed set of split values.
    eval_func : Callable
        The function to evaluate the split on the target
    """
    n = len(splits)
    if n > 500:
        # If many split points, use some threading
        with multiprocessing.Pool(processes=8) as p:
            # Get evaluation scores across all the splits
            post_split_evals = dict(
                zip(
                    range(len(splits)),
                    p.starmap(
                        BaseTree.get_split_goodness_fit_continuous,
                        zip([arr] * n, [y] * n, splits, [eval_func] * n),
                    ),
                ))
            p.close()
    else:
        # If not too many split points, get scores across all splits
        post_split_evals = dict(
            zip(
                range(len(splits)),
                map(
                    lambda x: BaseTree.get_split_goodness_fit_continuous(*x),
                    zip([arr] * n, [y] * n, splits, [eval_func] * n),
                ),
            ))

    # Get the minimum split based on gain ratio
    min_eval = min(
        post_split_evals,
        key=lambda x: pipe(
            post_split_evals.get(x),
            lambda results: results[0] / results[1],  # entropy / intrinsic value
        ),
    )

    # Return the best split and the split's scores
    return (splits[min_eval], *post_split_evals.get(min_eval))

def get(dsk, keys, optimizations=[], num_workers=None,
        func_loads=None, func_dumps=None, **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------
    dsk: dict
        dask graph
    keys: object or list
        Desired results from graph
    optimizations: list of functions
        optimizations to perform on graph before execution
    num_workers: int
        Number of worker processes (defaults to number of cores)
    func_dumps: function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads: function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = pickle_apply_async(pool.apply_async,
                                     func_dumps=func_dumps,
                                     func_loads=func_loads)

    # Optimize Dask
    dsk2, dependencies = cull(dsk, keys)
    dsk3, dependencies = fuse(dsk2, keys, dependencies)
    dsk4 = pipe(dsk3, *optimizations)

    try:
        # Run the fully optimized graph
        result = get_async(apply_async, len(pool._pool), dsk4, keys,
                           queue=queue, get_id=_process_get_id, **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result

def extract_repo_name_from_origin(origin):
    return pipe(
        [r':([^/]*?)/([^/]*?)\.git$', r'/([^/]*?)/([^/]*?)$'],
        map(lambda x: re.search(x, origin)),
        filterempty,
        map(lambda x: (x.group(1), x.group(2))),
        first(None),
    )

def read_cv_image_from(url):
    """Read an image from a url or file as a color OpenCV image."""
    return toolz.pipe(
        url,
        urllib.request.urlopen if is_url(url) else lambda x: open(x, 'rb'),
        lambda x: x.read(),
        bytearray,
        lambda x: np.asarray(x, dtype="uint8"),
        lambda x: cv.imdecode(x, cv.IMREAD_COLOR))

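# Hedged usage sketch (paths are illustrative): the same call works for local
# files and for URLs, since is_url() picks the opener.
#
#   img = read_cv_image_from('photo.jpg')
#   img = read_cv_image_from('https://example.com/photo.jpg')
#   # img is a numpy array of shape (height, width, 3) in BGR channel order
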
def format_dict_as_grid(data):
    max_length_per_column = __calculate_max_str_length_per_column(data)
    separator = __get_row_separator(max_length_per_column)
    headers = __get_header(
        pipe(data, itervalues, __first, iterkeys),
        max_length_per_column
    )
    rows = __get_rows(data, max_length_per_column)
    return StringValueObject(separator + headers + separator + rows)

def preprocessing(text: str) -> str:
    """
    Preprocess text by removing symbols and stopwords and stemming (ID).

    :param text: str
    :return: str, preprocessed text
    """
    return pipe(text,
                PreprocessUtil.symbol_remover,
                PreprocessUtil.stopword_remover,
                PreprocessUtil.stemmer)

def ngram_tuples(n, string, minlen=3, maxlen=25):
    return tlz.pipe(string,
                    utils.lower,
                    utils.splitter_of_words,
                    utils.filter_whitespace,
                    utils.filter_shorter_than(minlen),
                    utils.filter_longer_than(maxlen),
                    sliding_window_c(n))

def cluster_similar_git_authors(self, authors):
    similar_authors = pipe(authors,
                           self._pair_authors,
                           self._compute_author_similarity,
                           self._select_similar_authors)
    join_similar_query = self.query_factory.join_similar_authors(similar_authors)
    self.invoker.run(join_similar_query).result(self.query_timeout)

def getter(endpoint: IdResourceEndpoint):
    return pipe(
        endpoint(name, id).get(),
        IdResourceEndpoint.from_single_response(
            form_key=form_key,
            id_key=id_key,
            unpack_f=unpack_f,
        ))

def pfilter(f, it):
    return toolz.pipe(
        it,
        bifurcate(pmap(f, None), curried.map(toolz.identity)),
        zip,
        curried.filter(toolz.first),
        curried.map(toolz.second),
    )

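# Hedged usage sketch (assuming bifurcate/pmap come from the surrounding
# project and behave as their names suggest): the predicate is evaluated over
# the items in parallel, paired with the original items, and items whose
# predicate result is falsy are dropped.
#
#   evens = list(pfilter(lambda x: x % 2 == 0, range(10)))
#   # expected: [0, 2, 4, 6, 8]
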
def _process_config(self, config: Mapping) -> Mapping:
    processed_config = pipe(
        config,
        # original tags take precedence if there is a conflict
        assoc(key='Tags',
              value=merge(standard_tags(self), config.get('Tags', {}))),
        super()._process_config)
    return processed_config

def last_split_action_in_pull_request(pull_request):
    return pipe(
        pull_request.get('node', {}).get('timelineItems', {}).get('nodes', []),
        filter(lambda x: x.get('label', {}).get('name', None) == split_test_label()),
        max_(lambda x: (x.get('createdAt') if 'createdAt' in x
                        else x.get('removedAt')),
             default={}))

def simplified(self) -> 'Transform':
    """Return the composite of the transforms inside the transform chain."""
    if len(self) == 0:
        return None
    if len(self) == 1:
        return self[0]
    else:
        return tz.pipe(self[0], *[tf.compose for tf in self[1:]])

def known_iam_actions(prefix):
    """Return known IAM actions for a prefix, e.g. all ec2 actions"""
    # This could be memoized for performance improvements
    knowledge = pipe(all_known_iam_permissions(),
                     mapz(_parse_action),
                     groupbyz(lambda x: x.prefix))
    return knowledge.get(prefix, [])

def _cnn_forward(self, document):
    document_tokens, mask = self.document_token_embeds_do(document)
    token_weights = self.weights(document).squeeze() * mask.float()
    normalized_weights = F.softmax(token_weights, 1)
    weighted_vectors = normalized_weights.unsqueeze(2) * document_tokens
    return pipe(weighted_vectors,
                lambda batch: torch.transpose(batch, 1, 2),
                self.cnn,
                self.relu,
                self.pool,
                torch.squeeze,
                self.projection)

def __getitem__(self, idx: int) -> Dict[str, Any]:
    dat = {m: self.data[idx][m] for m in self.modalities}
    for m in self.modalities:
        if len(self.transforms[m]) == 0:
            continue
        dat[m] = pipe(copy.deepcopy(dat[m]), *self.transforms[m])
    return dat

def timeframes_to_seconds(timeframe, timeframes=1):
    t = pipe(timeframe, strings.first_number, int, math_utils.mul(timeframes))
    return (
        minutes_to_seconds(t) if is_minutely_frequency(timeframe) else
        hours_to_seconds(t) if is_hourly_frequency(timeframe) else
        days_to_seconds(t) if is_daily_frequency(timeframe) else
        None
    )

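# Hedged usage sketch (timeframe strings like '5m' / '1h' are an assumption
# about the project's format): the leading number is scaled by `timeframes`
# and then converted to seconds according to the frequency of the string.
#
#   timeframes_to_seconds('5m')                # -> 300, if '5m' is minutely
#   timeframes_to_seconds('1h', timeframes=2)  # -> 7200, if '1h' is hourly
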
def test_to_tree_slice(serial):
    t = symbol('t', 'var * {name: string, amount: int32}')
    expr = t[:5]
    expr2 = pipe(expr,
                 partial(to_tree, names={t: 't'}),
                 serial.dumps,
                 serial.loads,
                 partial(from_tree, namespace={'t': t}))
    assert expr.isidentical(expr2)

def _iter(self, usecols=None):
    from blaze.api.into import into
    dfs = self.pandas_read_csv(usecols=usecols,
                               chunksize=self.chunksize,
                               dtype='O',
                               parse_dates=[])
    return pipe(dfs,
                map(partial(pd.DataFrame.fillna, value='')),
                map(partial(into, list)),
                concat)

def _iter(self, usecols=None, chunksize=None):
    from blaze.api.into import into
    chunksize = chunksize or self.chunksize
    dfs = self.pandas_read_csv(usecols=usecols,
                               chunksize=chunksize,
                               dtype='O',
                               parse_dates=[])
    return pipe(dfs,
                map(partial(pd.DataFrame.fillna, value='')),
                map(partial(into, list)),
                concat)

def text_to_bow(parser, string):
    """
    Transforms 'string' into a bag of words representation.
    It uses the supplied 'parser' to parse the string.
    """
    return tlz.pipe(string, parser, utils.freq, bag_of_words)

def row_count(width, margin_function, radius_scale, node_count):
    return pipe(
        count(1),  # possible row_counts
        curried.filter(
            lambda row_count: node_count <= node_count(width, margin_function,
                                                       radius_scale, row_count)
        ),  # filter by row_counts that will fit node_count
        next,  # get first
    )

def __calculate_max_column_length(column_key):
    max_value_length = pipe(
        data,
        iterkeys,
        map(lambda key: data[key][column_key]),
        pvector,
        map(str),
        map(len),
        max
    )
    return max(max_value_length, len(str(column_key)))

def uni_and_bigram_tuples(string, minlen=3, maxlen=25):
    return tlz.pipe(string,
                    lower,
                    simple_split,
                    filter_longer_than(maxlen),
                    tlz.compose(tlz.concat, map_c(splitter_of_words)),
                    filter_shorter_than(minlen),
                    filter_stopwords,
                    tuple,
                    tlz.juxt(sliding_window_c(1), sliding_window_c(2)),
                    tlz.interleave,
                    map_c(join_strings("_")))

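# Hedged illustration (the exact output depends on the project's stopword
# list and helpers, so this is an assumption rather than a doctest): unigrams
# and bigrams come out interleaved and joined with '_'.
#
#   list(uni_and_bigram_tuples('Just another example text.'))
#   # -> something like ['another', 'another_example', 'example',
#   #                    'example_text', 'text']
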
def __calculate_confusion_tables(predictions_dict, confusion_table_generator, formatter):
    def calculate_confusion_table(label):
        return pmap({
            label: confusion_table_generator(predictions_dict, label, formatter)
        })

    return pipe(
        predictions_dict,
        iterkeys,
        map(calculate_confusion_table),
        merge,
        pmap
    )

def get_label_predictions(predictions_list, all_labels, label):
    def count_predictions(filtered_predictions_list, target_label):
        return pipe(
            filtered_predictions_list,
            filter(lambda (_, x): x == target_label),
            list,
            len
        )

    filtered_predictions = pipe(
        predictions_list,
        filter(lambda (x, _): x == label)
    )

    count_predictions_partial = \
        partial(count_predictions, list(filtered_predictions))

    return pipe(
        all_labels,
        map(lambda target: {target: count_predictions_partial(target)}),
        map(pmap),
        merge,
        pmap
    )