def succeeded(self, event):
    command = self.started_cmds[event.request_id]
    if not command:
        return
    self.started_cmds.pop(event.request_id)
    duration = event.duration_micros
    if self.is_below_lwm(duration):
        return
    [cmd, q, meta] = take(3, command.items())
    self.render_cmd(cmd, duration, q)
    ents = pipe(
        traceback.extract_stack(),
        self.config.stack_preprocess,
        map(lambda rec: StackEntry(self.config.file_capture, *rec)),
        filter(lambda ent: ent.file_capture()),
        filter(lambda ent: len(
            list(
                filter(lambda p: re.match(p, ent.file, re.M),
                       self.config.ignores))) == 0),
        groupby(lambda ent: ent.file),
    )
    self.render_stack(ents)
def count_predictions(filtered_predictions_list, target_label):
    return pipe(
        filtered_predictions_list,
        filter(lambda pair: pair[1] == target_label),
        list,
        len
    )
def prune_hyperrect(self, rect: Hyperrectangle) -> Hyperrectangle:
    new_statements = set()
    training_examples_covered = pipe(
        self.examples,
        filter(lambda e: rect.covers(e)),
        list)
    for statement in rect.statements:
        covered_samples = pipe(
            training_examples_covered,
            filter(statement.covers_example),
            list)
        all_training_data_feature_values = pipe(
            covered_samples,
            map(lambda e: e.value_by_feature[statement.feature]),
            set,
        )
        if statement.feature.kind == FeatureType.CATEGORICAL:
            new_statements.add(
                Statement(
                    feature=statement.feature,
                    categories=statement.categories.intersection(
                        all_training_data_feature_values),
                ))
        if statement.feature.kind == FeatureType.REAL:
            lowest_boundary = min(all_training_data_feature_values)
            highest_boundary = max(all_training_data_feature_values)
            new_statements.add(
                Statement(
                    feature=statement.feature,
                    lower_boundary=lowest_boundary,
                    upper_boundary=highest_boundary,
                ))
    return Hyperrectangle(statements=new_statements, label=rect.label)
def bound_if_needed(
    rule: Rule, statement: Statement, feature_min_values, feature_max_values
) -> Set[Statement]:
    statements = set(rule.get_statements_for_feature(statement.feature_idx)).difference(
        {statement}
    )
    new_statements = set()
    next_higher = pipe(
        statements, filter(lambda s: s.threshold > statement.threshold), list
    )
    next_lower = pipe(
        statements, filter(lambda s: s.threshold < statement.threshold), list
    )
    if statement.relation == Relation.LEQ and not any(next_lower):
        new_statements.add(
            Statement(
                statement.feature_idx,
                Relation.MT,
                feature_min_values[statement.feature_idx],
            )
        )
    elif statement.relation == Relation.MT and not any(next_higher):
        new_statements.add(
            Statement(
                statement.feature_idx,
                Relation.LEQ,
                feature_max_values[statement.feature_idx] - EPS,
            )
        )
    else:
        return set()
    return new_statements
def main():
    """Main method."""
    # create a player named tuple
    Player = namedtuple('Player', ['first', 'last', 'number', 'team', 'city'])
    # create some player named tuples
    m_j = Player(first='Michael', last='Jordan', number='23',
                 team='Bulls', city='Chicago')
    k_b = Player(first='Kobe', last='Bryant', number='24',
                 team='Lakers', city='Los Angeles')
    l_b = Player(first='LeBron', last='James', number='23',
                 team='Cavaliers', city='Cleveland')
    k_p = Player(first='Kristaps', last='Porzingis', number='6',
                 team='Knicks', city='New York')
    k_d = Player(first='Kevin', last='Durant', number='35',
                 team='Warriors', city='Oakland')
    # store the players in a tuple
    players = (m_j, k_b, l_b, k_p, k_d)

    # filter
    two_three = filter(lambda x: x.number == '23', players)
    print(tuple(two_three))
    # => (Player(first='Michael', last='Jordan', number='23', team='Bulls',
    #     city='Chicago'), Player(first='LeBron', last='James', number='23',
    #     team='Cavaliers', city='Cleveland'))

    # map
    result = map(lambda x: ''.join([x.first, ' ', x.last]), players)
    print(tuple(result))
    # => ('Michael Jordan', 'Kobe Bryant', 'LeBron James', 'Kristaps Porzingis', 'Kevin Durant')

    # reduce
    result = reduce(lambda x, y: x + y, [1, 2, 3], 0)
    print(result)
    # => 6

    # compose
    nums = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
    c_greater = curry(greater)
    greater_zero = c_greater(y_val=0)
    result = compose(filter(greater_zero), filter(iseven))(nums)
    print(tuple(result))
    # => (2, 4)
def fetch(content, prefix):
    return {
        "parts": pipe(
            parse("$..layers").find(content),
            mapcat(lambda m: m.value),
            filter(lambda v: v["exportOptions"]["exportFormats"]),
            filter(lambda v: re.match(prefix, v["name"])),
            map(lambda v: glom(
                v,
                {
                    "key": "name",
                    "layout": (
                        "frame",
                        {
                            "left": ("x", round),
                            "top": ("y", round),
                            "width": ("width", round),
                            "height": ("height", round),
                        },
                    ),
                },
            )),
            sorted(key=lambda p: p["key"]),
            list,
        )
    }
def genome(file_pattern):
    if os.path.basename(file_pattern).split('.')[-1] == "gz":
        gzopen = tz.curry(gzip.open)
        result = tz.pipe(file_pattern, glob, sorted,
                         c.map(gzopen(mode='rt')), tz.concat,
                         c.filter(is_sequence), tz.concat,
                         c.filter(is_nucleotide))
    else:
        result = tz.pipe(file_pattern, glob, sorted, c.map(open),
                         tz.concat, c.filter(is_sequence), tz.concat,
                         c.filter(is_nucleotide))
    return result
def get_sequence(path_to_files):
    """Stream a genome, letter by letter, from a list of FASTA filenames."""
    return tz.pipe(path_to_files,
                   cur.map(fasta_reader),
                   tz.concat,
                   cur.filter(is_sequence),
                   # concatenate characters from all lines
                   tz.concat,
                   # discard newlines and 'N'
                   cur.filter(is_nucleotide))
def genome_gz(file_pattern):
    """Stream a genome, letter by letter, from a list of FASTA filenames."""
    return tz.pipe(file_pattern, glob, sorted,  # Filenames
                   cur.map(gzopen(mode='rt')),  # lines
                   # concatenate lines from all files:
                   tz.concat,
                   # drop header from each sequence
                   cur.filter(is_sequence),
                   # concatenate characters from all lines
                   tz.concat,
                   # discard newlines and 'N'
                   cur.filter(is_nucleotide))
def get_tenant_metrics(tenant_id, scaling_groups, servers, _print=False):
    """
    Produce per-group metrics for all the groups of a tenant

    :param list scaling_groups: Tenant's scaling groups as dict from CASS
    :param dict servers: Servers from Nova grouped based on scaling group ID.
                         Expects only ACTIVE or BUILD servers
    :return: ``list`` of (tenantId, groupId, desired, actual) GroupMetrics
    """
    if _print:
        print('processing tenant {} with groups {} and servers {}'.format(
            tenant_id, len(scaling_groups), len(servers)))
    metrics = []
    for group in scaling_groups:
        group_id = group['groupId']
        create_metrics = partial(GroupMetrics, tenant_id,
                                 group_id, group['desired'])
        if group_id not in servers:
            metrics.append(create_metrics(0, 0))
        else:
            active = len(list(filter(lambda s: s['status'] == 'ACTIVE',
                                     servers[group_id])))
            metrics.append(
                create_metrics(active, len(servers[group_id]) - active))
    return metrics
def fancify_summary(expr):
    """ Separate a complex summary into two pieces

    Helps pandas compute_by on summaries

    >>> t = symbol('t', 'var * {x: int, y: int}')
    >>> one, two, three = fancify_summary(summary(a=t.x.sum(), b=t.x.sum() + t.y.count() - 1))

    A simpler summary with only raw reductions

    >>> one
    summary(x_sum=sum(t.x), y_count=count(t.y))

    A mapping of those names to new leaves to use in another computation

    >>> two  # doctest: +SKIP
    {'x_sum': x_sum, 'y_count': y_count}

    A mapping of computations to do for each column

    >>> three  # doctest: +SKIP
    {'a': x_sum, 'b': (x_sum + y_count) - 1}

    In this way, ``compute_by`` is able to do simple pandas reductions using
    groups.agg(...) and then do columnwise arithmetic afterwards.
    """
    seen_names.clear()
    name_dict.clear()
    exprs = pipe(expr.values,
                 map(Expr._traverse),
                 concat,
                 filter(lambda x: isinstance(x, Reduction)),
                 set)
    one = summary(**dict((_name(expr), expr) for expr in exprs))

    two = dict((_name(expr), symbol(_name(expr), datashape.var * expr.dshape))
               for expr in exprs)

    d = dict((expr, two[_name(expr)]) for expr in exprs)
    three = dict((name, value._subs(d))
                 for name, value in zip(expr.names, expr.values))

    return one, two, three
def get_char_to_lu_phones() -> Dict[str, List[str]]:
    char_to_phones = pipe(
        CharPhoneTable.select(),
        map(lambda e: (e.char, e.lu)),
        filter(lambda e: e[0] != '' and e[1] != ''),
        groupby(lambda e: e[0]),
        valmap(lambda phones: [e[1] for e in phones]),
        dict)
    return char_to_phones
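
# A minimal, self-contained sketch (on made-up tuples rather than database
# rows) of the curried-toolz pattern used above: pipe + filter + groupby +
# valmap to build a one-to-many mapping.
from toolz.curried import pipe, filter, groupby, valmap

rows = [('a', '1'), ('a', '2'), ('b', '3'), ('', 'x')]
char_to_values = pipe(
    rows,
    filter(lambda e: e[0] != '' and e[1] != ''),  # drop incomplete rows
    groupby(lambda e: e[0]),                      # {'a': [('a', '1'), ('a', '2')], 'b': [('b', '3')]}
    valmap(lambda group: [e[1] for e in group]),  # keep only the second field
    dict)
# char_to_values == {'a': ['1', '2'], 'b': ['3']}
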
def assign_link(course: Course, course_root: str, path: str,
                link_text: str = None, *,
                force_upload: bool = False, dry_run: bool = False):
    path = resolve_path(course_root, path)
    meta = maybe_markdown_from_path(path).meta
    if not meta or 'name' not in meta:
        log.error(f'Cannot read assignment name from {path}')
        return link_text or path.name
    name = meta['name']

    html_url = pipe(
        canvas.assignment.assignments(course),
        filter(lambda a: a.data['name'] == name),
        maybe_first,
        lambda a: a.data['html_url'],
    )

    link_text = link_text or name
    if html_url:
        return f'<a title="{name}" href="{html_url}">{link_text}</a>'

    log.error(f'Could not retrieve Canvas assignment with name: {name}')
    return link_text
def generalise(self, example):
    closest_hyperrectangle_with_distance = self.find_closest(example)
    (closest_hyperrectangle, distance) = closest_hyperrectangle_with_distance
    hyperrectangle_candidate = self.extend(closest_hyperrectangle, example)

    covers_conflicting = pipe(
        self.examples,
        filter(lambda e: hyperrectangle_candidate.covers(e)),
        filter(lambda e: e.label != hyperrectangle_candidate.label),
        any,
    )

    if covers_conflicting:
        self.add_as_new_hyperrectangle(example)
    else:
        self.hyperrectangles.remove(closest_hyperrectangle)
        self.hyperrectangles.add(hyperrectangle_candidate)
def get_char_to_xhe_shapes() -> Dict[str, List[str]]:
    char_to_shape = pipe(
        CharHeShapeTable.select(),
        map(lambda e: (e.char, e.shapes)),
        filter(lambda e: e[0] != '' and e[1] != ''),
        groupby(lambda e: e[0]),
        valmap(lambda e: [s[1] for s in e]),
        dict)
    return char_to_shape
def ccds_to_bed(ccds_stream):
    """Convert CCDS dump to Chanjo-style BED stream.

    Main entry point for default Chanjo converter (ccds). It converts
    a sorted (start, chrom) CCDS database to the Chanjo BED-format.

    Args:
      ccds_stream (file): file handle to read CCDS lines from

    Yields:
      Interval: interval with merged block and superblock ids
    """
    return pipe(
        ccds_stream,
        filter(grep('Public')),                           # keep only Public tx
        map(text_type.rstrip),                            # strip \n and spaces
        map(split(sep='\t')),                             # split into list
        map(extract_intervals),                           # convert to Interval
        concat,                                           # flatten
        map(rename_sex_interval),                         # rename sex contigs
        partial(lazy_groupby, key=attrgetter('contig')),  # group by contig
        pluck(1),                                         # extract second item
        map(groupby(attrgetter('name'))),                 # non-lazy group by id
        map(valmap(merge_related_elements)),              # group intervals
        map(itervalues),                                  # extract values
        map(partial(sorted, key=attrgetter('start'))),    # sort by start pos
        concat                                            # flatten
    )
def get_scaling_group_servers(tenant_id, group_id, now,
                              all_as_servers=get_all_scaling_group_servers,
                              all_servers=get_all_server_details,
                              cache_class=CassScalingGroupServersCache):
    """
    Get a group's servers taken from cache if it exists. Updates cache
    if it is empty from newly fetched servers
    # NOTE: This function takes tenant_id even though the whole effect is
    # scoped on the tenant because cache calls require tenant_id. Should
    # they also not take tenant_id and work on the scope?

    :return: Servers as list of dicts
    :rtype: Effect
    """
    cache = cache_class(tenant_id, group_id)
    cached_servers, last_update = yield cache.get_servers(False)
    if last_update is None:
        servers = (yield all_as_servers()).get(group_id, [])
    else:
        current = yield all_servers()
        servers = mark_deleted_servers(cached_servers, current)
        servers = list(filter(server_of_group(group_id), servers))
    yield do_return(servers)
def test_pipeline_example():
    from functools import reduce
    import operator as op

    data = range(100)

    result1 = math.sqrt(
        reduce(
            op.add,
            builtins.map(
                lambda x: x**2.0,
                builtins.filter(
                    lambda x: x % 2 == 0,
                    data,
                ))))

    from toolz.curried import filter, map, reduce
    from flowly.tz import chained

    transform = chained(
        filter(lambda x: x % 2 == 0),
        map(lambda x: x**2.0),
        reduce(op.add),
        math.sqrt,
    )

    result2 = transform(data)

    assert result1 == result2
def filter_courses(course_iter: Iterable[Course], **search_kw) -> Tuple[Course]:
    return _.pipe(
        course_iter,
        _.filter(lambda c: has_metadata(c, **search_kw)),
        tuple,
    )
def __call__(self, *node_ids):
    if len(node_ids) == 1 and __.is_seq(node_ids[0]):
        node_ids = tuple(node_ids[0])

    bad_ids = _.pipe(
        node_ids,
        _.filter(lambda i: i not in self),
        tuple,
    )
    if bad_ids:
        bad_str = _.pipe(bad_ids, _.map(str), ', '.join)
        log.error(f'Bad node ids: {bad_str}')

    # log.info(type(node_ids[0]))
    node_ids = set(node_ids) - set(bad_ids)
    # log.info(node_ids)
    if not node_ids:
        log.error('No ids left')
        return GraphNull

    return _.pipe(
        node_ids,
        _.map(lambda n: (n, self[n])),
        self.graph_pipe(self),
    )
def course_from_path(api: rest.Api, path: (str, Path)) -> rest.Endpoint:
    '''Given a path, recursively search for a course.yml file, get the Canvas
    ID from it and find the course endpoint with that ID

    If more than one or no course.yml files are found, will log error and
    return None.
    '''
    course_path = find_one_course_path(path)
    if course_path:
        course_data = yaml.read_yaml(course_path)
        if 'id' not in course_data:
            log.error(
                'Could not find course ID in course.yml file'
            )
            return None
        course_ep = _.pipe(
            get_courses(api()),
            _.filter(lambda c: c.data['id'] == course_data['id']),
            lcommon.maybe_first,
        )
        if not course_ep:
            log.error(
                'Could not find course in Canvas for course data:\n'
                f'{pprint.pformat(_.dissoc(course_data, "students"))}'
            )
            return None
        return course_ep
def filter_data(field, yaml_data):
    """Extract a field of data from the YAML files.

    Args:
      field: the name of the field to extract
      yaml_data: the benchmark YAML data

    Returns:
      the filtered data from the YAML data
    """
    return pipe(
        yaml_data,
        dict,
        valmap(lambda val: val["data"]),
        valmap(filter(lambda item: item["name"].lower() == field)),
        valmap(list),
        valmap(get(0, default=None)),
        valfilter(lambda x: x is not None),
        itemmap(lambda item: (item[0], update_dict(item[1], name=item[0]))),
        lambda dict_: sorted(list(dict_.values()), key=lambda item: item["name"]),
        map(
            update_in(
                keys=["transform"],
                func=lambda x: x + [dict(expr="datum.x > 0.01", type="filter")],
            )
        ),
    )
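
# Quick plain-data sketch of the curried valmap/valfilter steps used in
# filter_data above (the input dict here is invented):
from toolz.curried import pipe, valmap, valfilter

raw = {"a": {"data": [1, 2]}, "b": {"data": []}}
cleaned = pipe(
    raw,
    valmap(lambda val: val["data"]),   # {'a': [1, 2], 'b': []}
    valfilter(lambda v: len(v) > 0),   # {'a': [1, 2]}
)
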
def rolling_fit_opt_weights(df, opt_weights_func, look_ahead_per):
    """applies opt_weights_func to rolling window on pandas df"""
    num_rows = df.shape[0]
    p = pipe(xrange(num_rows),
             filter(lambda x: x + look_ahead_per < num_rows),
             map(lambda x: {df.index[x]:
                            opt_weights_func(df.iloc[x:x+look_ahead_per+1])}))
    return pd.DataFrame(merge(p)).T
def cols_to_word_phone_table(cols: List[str], xhe_transformer,
                             zrm_transformer) -> WordPhoneTable:
    if len(cols) == 1:
        word = cols[0]
        priority = 1
        full = get_full(word)
    elif len(cols) == 2:
        word = cols[0]
        priority = cols[1]
        full = get_full(word)
    elif len(cols) == 2 + len(cols[0]):
        word = cols[0]
        priority = cols[1]
        full = list(filter(lambda e: len(e) > 0, [e.strip() for e in cols[2:]]))
    else:
        raise RuntimeError("word item should be: 你好 [priority n i h ao]")

    return WordPhoneTable(
        word=word,
        full=''.join(full),
        xhe=''.join([full_to_two(e, xhe_transformer) for e in full]),
        zrm=''.join([full_to_two(e, zrm_transformer) for e in full]),
        lu="",
        priority=priority,
        updatedt=datetime.now()
    )
def compute_down(expr, data, **kwargs):
    """ Compile a blaze expression to a sparksql expression"""
    leaves = expr._leaves()

    # make sure we only have a single leaf node
    if len(leaves) != 1:
        raise ValueError('Must compile from exactly one root database')

    leaf, = leaves

    # field expressions on the database are Field instances with a record
    # measure whose immediate child is the database leaf
    tables = pipe(expr._subterms(), filter(istable(leaf)), list)

    # raise if we don't have tables in our database
    if not tables:
        raise ValueError('Expressions not referencing a table cannot be '
                         'compiled')

    # make new symbols for each table
    new_leaves = [symbol(t._name, t.dshape) for t in tables]

    # sub them in the expression
    expr = expr._subs(dict(zip(tables, new_leaves)))

    # compute using sqlalchemy
    scope = dict(zip(new_leaves, map(make_sqlalchemy_table, tables)))
    query = compute(expr, scope)

    # interpolate params
    compiled = literalquery(query, dialect=HiveDialect())
    return data.sql(str(compiled))
def course_files_matching_path(course: IdResourceEndpoint, path: str):
    path = Path(path).expanduser().resolve()
    return pipe(
        files(course),
        filter(lambda f: f.data['filename'] == path.name),
        tuple,
    )
def get_message(log):
    activity = log.get("activity")
    if activity == "Operation":
        on_load = log.get("on_load_no_of_packages")
        off_load = log.get("off_load_no_of_packages")
        msg = (
            " and ".join(
                filter(
                    None,
                    [
                        on_load and "Loaded {} packages".format(on_load),
                        off_load and "Unloaded {} packages".format(off_load),
                    ],
                )
            )
            or "Operation"
        )
        return "{} at {}".format(msg, log.get("station"))
    if activity == "Stopped":
        return "Stopped at {}".format(log.get("station"))
    if activity == "Moving":
        return "Moving to {}".format(log.get("station"))
    return activity
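
# The " and ".join(filter(None, [...])) idiom above drops the falsy parts
# (a missing load count) before joining; for example:
" and ".join(filter(None, ["Loaded 3 packages", None]))  # -> 'Loaded 3 packages'
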
def of_type(self, type):
    return _.pipe(
        self,
        _.filter(lambda n: self.nodes[n]['type'] == type),
        _.map(lambda n: (n, self[n])),
        self.graph_pipe(self),
    )
def compute_down(expr, data, **kwargs):
    """ Compile a blaze expression to a sparksql expression"""
    leaves = expr._leaves()

    # make sure we only have a single leaf node
    if len(leaves) != 1:
        raise ValueError('Must compile from exactly one root database')

    leaf, = leaves

    # field expressions on the database are Field instances with a record
    # measure whose immediate child is the database leaf
    tables = pipe(expr._subterms(), filter(istable(leaf)), list)

    # raise if we don't have tables in our database
    if not tables:
        raise ValueError('Expressions not referencing a table cannot be '
                         'compiled')

    # make new symbols for each table
    new_leaves = [symbol(t._name, t.dshape) for t in tables]

    # sub them in the expression
    expr = expr._subs(dict(zip(tables, new_leaves)))

    # compute using sqlalchemy
    scope = dict(zip(new_leaves, map(make_sqlalchemy_table, tables)))
    query = compute(expr, scope, return_type='native')

    # interpolate params
    compiled = literalquery(query, dialect=HiveDialect())
    return data.sql(str(compiled))
def test_pipeline_example():
    from functools import reduce
    import operator as op

    data = range(100)

    result1 = math.sqrt(
        reduce(
            op.add,
            builtins.map(
                lambda x: x ** 2.0,
                builtins.filter(
                    lambda x: x % 2 == 0,
                    data,
                )
            )
        )
    )

    from toolz.curried import filter, map, reduce
    from flowly.tz import chained

    transform = chained(
        filter(lambda x: x % 2 == 0),
        map(lambda x: x ** 2.0),
        reduce(op.add),
        math.sqrt,
    )

    result2 = transform(data)

    assert result1 == result2
def gene_length_df(filename):
    """Grab Gene Symbol, Gene ID, and Gene Length from a GAF file.

    Parameters
    ----------
    filename : string
        Path to a Gene Annotation Format (GAF) file.

    Returns
    -------
    gene_lengths : pandas DataFrame
        A data frame with three columns: gene symbol, gene id, and
        gene length (in bases).
    """
    with open(filename) as fin:
        header = next(fin).rstrip().split('\t')
        geneid = header.index('FeatureID')
        genelen = header.index('FeatureCoordinates')
        feattype = header.index('FeatureType')
        output = tz.pipe(fin, spliteach,
                         tz.filter(lambda x: x[feattype] == 'gene'),
                         tz.pluck([geneid, genelen]),
                         tz.map(range2len),
                         list)
    df = pd.DataFrame(output, columns=['GeneSymbol', 'GeneID', 'GeneLength'])
    df = df.drop_duplicates('GeneSymbol').set_index('GeneSymbol')
    return df
def process_directory(directory, output_filename='traces.csv'):
    """Extract traces and ROIs for all .da files in a directory.

    Parameters
    ----------
    directory : string
        The directory containing the .da files to be processed.
    output_filename : string
        The name of the file to write the results to.
    """
    filenames = tz.pipe(directory, os.listdir,
                        C.filter(X.call('endswith', '.da')), sorted)
    filenames = [os.path.join(directory, fn) for fn in filenames]
    images, frame_intervals, bncs, dark_frames = unzip(
        map(read_image, filenames))
    traces, rois = unzip(map(extract_trace, images))
    with open(output_filename, 'w') as fout:
        for filename, frame_interval, trace, roi in \
                zip(filenames, frame_intervals, traces, rois):
            line = ','.join([os.path.basename(filename), str(frame_interval)] +
                            list(map(str, trace)))
            fout.write(line + '\n')
            io.imsave(filename[:-3] + '.roi.tif',
                      roi.astype(np.uint8) * 255,
                      plugin='tifffile', compress=1)
def link_next(response):
    return maybe_pipe(
        requests.utils.parse_header_links(response.headers.get('Link', '')),
        filter(lambda d: d.get('rel', '').lower() == 'next'),
        maybe_first,
        lambda d: d['url'],
    )
def composer(self, tokens):
    return compose(*pipe(
        tokens,
        reversed,
        filter(first),
        map(
            lambda arg: partial(arg[0], *arg[1], **arg[2])
            if any(arg[1:]) else arg[0]
        ),
        list
    ))
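
# A rough, self-contained illustration of the composer above: each token is a
# (func, args, kwargs) triple, tokens with bound arguments become partials, and
# compose applies them right-to-left, so the original token order is preserved.
# The token functions below are made up for the sketch.
import operator
from functools import partial
from toolz import compose, first, pipe
from toolz.curried import filter, map

tokens = [(abs, (), {}), (operator.add, (10,), {})]
f = compose(*pipe(
    tokens,
    reversed,
    filter(first),
    map(lambda arg: partial(arg[0], *arg[1], **arg[2]) if any(arg[1:]) else arg[0]),
    list
))
# f(-5) == 15: abs (the first token) runs first, then operator.add adds 10
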
def get_groups(parsed, store, conf):
    """
    Return groups based on argument provided

    :param Namespace parsed: arguments parsed
    :param store: Otter scaling group collection
    :param dict conf: config

    :return: Deferred fired with list of {"tenantId": .., "groupId": ..} dict
    """
    log = mock_log()
    if parsed.group:
        groups = [g.split(":") for g in parsed.group]
        return succeed(
            [{"tenantId": tid, "groupId": gid} for tid, gid in groups])
    elif parsed.all:
        d = store.get_all_valid_groups()
    elif parsed.tenant_id:
        d = get_groups_of_tenants(log, store, parsed.tenant_id)
    elif parsed.disabled_tenants:
        non_conv_tenants = conf["non-convergence-tenants"]
        d = store.get_all_valid_groups()
        d.addCallback(
            filter(lambda g: g["tenantId"] not in set(non_conv_tenants)))
        d.addCallback(list)
    elif parsed.conf_conv_tenants:
        d = get_groups_of_tenants(log, store, conf["convergence-tenants"])
    else:
        raise SystemExit("Unexpected group selection")
    return d
def ngram_counts(tokens, min_len=1, max_len=None, transform=" ".join,
                 in_vocabulary=lambda _: True):
    """ Compute n-gram counts using toolz and Counter

    :param tokens: Iterable[str]
    :param min_len: int Minimum N-Gram size
    :param max_len: int Maximum N-Gram size
    :param transform: Callable[[Tuple[str, ...]], str]
        Function transforming ngram tuple into key
    :param in_vocabulary: Callable[[str], bool] Should token be preserved
    :return: Tuple[int, Dict[str, int]] Word count and n-gram counts
    """
    tokens = list(tokens)
    wc = len(tokens)
    max_len = (max_len if max_len else wc) + 1
    return (
        wc,
        pipe(
            everygrams(tokens, min_len=min_len, max_len=max_len),
            map(transform),
            filter(in_vocabulary),
            frequencies,
        ),
    )
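
# A hypothetical call of ngram_counts above (assuming nltk's everygrams and the
# toolz names used in the function body are importable):
wc, counts = ngram_counts("the cat sat on the mat".split(), max_len=2)
# wc == 6; counts is a plain dict of n-gram frequencies,
# e.g. counts["the"] == 2 and counts["the cat"] == 1
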
def page_link(course: Course, course_root: str, path: str,
              link_text: str = None, ref: str = None, *,
              force_upload: bool = False, dry_run: bool = False):
    ref = f'#{ref}' if ref else ''
    path = resolve_path(course_root, path)
    meta = maybe_markdown_from_path(path).meta
    if not meta or 'title' not in meta:
        log.error(f'Cannot read page title from {path}')
        return link_text or path.name
    title = meta['title']

    html_url = pipe(
        canvas.page.pages(course),
        filter(lambda p: p.data['title'] == title),
        maybe_first,
        lambda p: p.data['html_url'],
    )

    link_text = link_text or title
    if html_url:
        return f'<a title="{title}" href="{html_url}{ref}">{link_text}</a>'

    log.error(f'Could not retrieve Canvas page with title: {title}')
    return link_text
def get_groups_to_converge(config_func):
    """
    Get all groups of all tenants that need convergence triggering
    """
    eff = Effect(GetAllValidGroups())
    eff = eff.on(
        filter(lambda g: tenant_is_enabled(g["tenantId"], config_func)))
    return eff.on(list)
def load_tgvH_file():
    json_file_name = 'tgvH.json'
    nodes = read_ast_json_file(json_file_name)
    variables_definitions = filter(
        lambda node: node['type'].startswith('variable_'),
        nodes,
    )
    return variables_definitions
def __dir__(self):
    result = dir(type(self))
    if (isrecord(self.dshape.measure) or
            isinstance(self.dshape.measure, datashape.Map) and self.fields):
        result.extend(map(valid_identifier, self.fields))

    result.extend(toolz.merge(schema_methods(self.dshape.measure),
                              dshape_methods(self.dshape)))

    return sorted(set(filter(isvalid_identifier, result)))
def functional():
    return count_by(itemgetter('hour'),
                    map(json.loads,
                        filter(None,
                               mapcat(lambda output: output.strip().split('\n'),
                                      map(lambda date: logs[date.strftime('%Y/%m/%d')],
                                          map(lambda days_ago: today - timedelta(days=days_ago),
                                              range(1, days_of_logs + 1)))))))
def get_clb_contents():
    """
    Get Rackspace Cloud Load Balancer contents as list of `CLBNode`. CLB
    health monitor information is also returned as a pmap of :obj:`CLB`
    objects mapped on LB ID.

    :return: Effect of (``list`` of :obj:`CLBNode`, `pmap` of :obj:`CLB`)
    :rtype: :obj:`Effect`
    """
    # If we get a CLBNotFoundError while fetching feeds, we should throw away
    # all nodes related to that load balancer, because we don't want to act on
    # data that we know is invalid/outdated (for example, if we can't fetch a
    # feed because CLB was deleted, we don't want to say that we have a node in
    # DRAINING with draining time of 0; we should just say that the node is
    # gone).
    def gone(r):
        return catch(CLBNotFoundError, lambda exc: r)

    lb_ids = [lb['id'] for lb in (yield _retry(get_clbs()))]
    node_reqs = [_retry(get_clb_nodes(lb_id).on(error=gone([])))
                 for lb_id in lb_ids]
    healthmon_reqs = [
        _retry(get_clb_health_monitor(lb_id).on(error=gone(None)))
        for lb_id in lb_ids]
    all_nodes_hms = yield parallel(node_reqs + healthmon_reqs)
    all_nodes, hms = all_nodes_hms[:len(lb_ids)], all_nodes_hms[len(lb_ids):]
    lb_nodes = {
        lb_id: [CLBNode.from_node_json(lb_id, node) for node in nodes]
        for lb_id, nodes in zip(lb_ids, all_nodes)}
    clbs = {
        str(lb_id): CLB(bool(health_mon))
        for lb_id, health_mon in zip(lb_ids, hms) if health_mon is not None}
    draining = [n for n in concat(lb_nodes.values())
                if n.description.condition == CLBNodeCondition.DRAINING]
    feeds = yield parallel(
        [_retry(get_clb_node_feed(n.description.lb_id, n.node_id).on(
            error=gone(None)))
         for n in draining]
    )
    nodes_to_feeds = dict(zip(draining, feeds))
    deleted_lbs = set([
        node.description.lb_id
        for (node, feed) in nodes_to_feeds.items() if feed is None])

    def update_drained_at(node):
        feed = nodes_to_feeds.get(node)
        if node.description.lb_id in deleted_lbs:
            return None
        if feed is not None:
            node.drained_at = extract_clb_drained_at(feed)
        return node

    nodes = map(update_drained_at, concat(lb_nodes.values()))
    yield do_return((
        list(filter(bool, nodes)),
        pmap(keyfilter(lambda k: k not in deleted_lbs, clbs))))
def piped():
    return (_ | range(1, days_of_logs + 1)
              | map(lambda days_ago: today - timedelta(days=days_ago))
              | map(lambda date: logs[date.strftime('%Y/%m/%d')])
              | mapcat(lambda output: output.strip().split('\n'))
              | filter(None)
              | map(json.loads)
              | count_by(itemgetter('hour'))
              | _)
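
# count_by is the final step of the piped() pipeline above (and of the
# functional() version earlier); a quick standalone look at what it returns:
from toolz import count_by
count_by(len, ['a', 'bb', 'cc', 'ddd'])  # -> {1: 1, 2: 2, 3: 1}
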
def visit_ternary_operator(node):
    return pipe([
        visit_node(node['value_if_true']),
        visit_node(node['condition']),
        visit_node(node['value_if_false']) if 'value_if_false' in node else None,
        ],
        filter(None),
        concat,
    )
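
# The filter(None) -> concat combination above presumably drops the missing
# else-branch (None) and flattens the remaining per-node results; a plain-data
# sketch of the same two steps:
from toolz import pipe, concat
from toolz.curried import filter

list(pipe([[1, 2], None, [3]], filter(None), concat))  # -> [1, 2, 3]
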
def discover_jsonlines(j, n=10, encoding='utf-8', **kwargs):
    with json_lines(j.path, encoding=encoding) as lines:
        data = pipe(lines, filter(nonempty), map(json.loads), take(n), list)

    if len(data) < n:
        ds = discover(data)
    else:
        ds = var * discover(data).subshape[0]
    return date_to_datetime_dshape(ds)
def __dir__(self):
    result = dir(type(self))
    if isrecord(self.dshape.measure) and self.fields:
        result.extend(list(map(valid_identifier, self.fields)))

    d = toolz.merge(schema_methods(self.dshape.measure),
                    dshape_methods(self.dshape))
    result.extend(list(d))

    return sorted(set(filter(isvalid_identifier, result)))
def get_label_predictions(predictions_list, all_labels, label):
    def count_predictions(filtered_predictions_list, target_label):
        return pipe(
            filtered_predictions_list,
            filter(lambda pair: pair[1] == target_label),
            list,
            len
        )

    filtered_predictions = pipe(
        predictions_list,
        filter(lambda pair: pair[0] == label)
    )
    count_predictions_partial = \
        partial(count_predictions, list(filtered_predictions))

    return pipe(
        all_labels,
        map(lambda target: {target: count_predictions_partial(target)}),
        map(pmap),
        merge,
        pmap
    )
def parse_violations(do_request):
    """Parse violations from the violations index page."""
    logger.info('Parsing violations')

    return toolz.compose(
        # filter out meaningless values
        curried.filter(lambda x: x not in ('IME PREDPISA', '')),
        # extract data from each row
        curried.map(lambda tr: pq(tr).find('td').eq(1).text()),
        # get all rows in tables
        curried.mapcat(lambda page: page('table.MsoNormalTable tr')),
        # get all subpages
        curried.map(do_request),
        # let's skip empty urls/strings
        curried.filter(lambda a: a),
        # get menu links
        curried.map(lambda a: pq(a).attr('href')),
        # get menu elements
        lambda doc: doc('.moduletable_menu a'),
        # get main page
        do_request,
    )(VIOLATION_URL + '/index.php')
def process(text):
    """ Replace failures in docstring with results """
    parts = pipe(text, parser.parse,
                 filter(None),
                 map(separate_fence),
                 concat, list)

    scope = dict()  # scope of variables in our executed environment
    state = dict()  # state of pymarkdown traversal

    out_parts = list()
    for part in parts:
        out, scope, state = step(part, scope, state)
        out_parts.extend(out)

    head = '\n'.join(sorted(state.get('headers', set())))
    body = pipe(out_parts, map(render_part),
                filter(None),
                '\n'.join)
    foot = '\n\n'.join(state.get('footers', []))

    return '\n\n'.join([head, body, foot]).strip()
def get_scaling_group_servers(tenant_id, authenticator, service_name, region,
                              server_predicate=None, clock=None):
    """
    Return tenant's servers that belong to a scaling group as
    {group_id: [server1, server2]} ``dict``. No specific ordering is guaranteed

    :param server_predicate: `callable` taking single server as arg and
        returns True if the server should be included, False otherwise
    """

    def has_group_id(s):
        return 'metadata' in s and 'rax:auto_scaling_group_id' in s['metadata']

    def group_id(s):
        return s['metadata']['rax:auto_scaling_group_id']

    server_predicate = (server_predicate if server_predicate is not None
                        else lambda s: s)
    servers_apply = compose(groupby(group_id),
                            filter(server_predicate),
                            filter(has_group_id))

    d = get_all_server_details(tenant_id, authenticator, service_name, region,
                               clock=clock)
    d.addCallback(servers_apply)
    return d
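
# A stripped-down sketch of the compose(groupby, filter, filter) transform
# used above, applied to invented server dicts (compose runs right-to-left,
# so the has-group-id filter runs first and groupby runs last):
from toolz import compose
from toolz.curried import filter, groupby

servers = [
    {'id': 1, 'metadata': {'rax:auto_scaling_group_id': 'g1'}},
    {'id': 2, 'metadata': {}},
    {'id': 3, 'metadata': {'rax:auto_scaling_group_id': 'g1'}},
]
servers_apply = compose(
    groupby(lambda s: s['metadata']['rax:auto_scaling_group_id']),
    filter(lambda s: 'rax:auto_scaling_group_id' in s.get('metadata', {})),
)
# servers_apply(servers) == {'g1': [servers[0], servers[2]]}
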
def tokenize_and_filter_perc_func(allowed_parts_of_speech, given_text):
    """Return the tokens in the given text that are the allowed parts of speech

    This version uses the faster PerceptronTagger"""
    return tz.pipe(
        given_text,
        lambda x: TextBlob(x, pos_tagger=PerceptronTagger()),
        lambda x: x.tags,
        tz.filter(lambda x: x[1] in allowed_parts_of_speech),  # limit to allowed parts of speech
        tz.map(lambda x: x[0]),  # return only the token
        list,
    )
def tokenize_and_filter_nltk_func(allowed_parts_of_speech, given_text):
    """Return the tokens in the given text that are the allowed parts of speech

    This version uses the recommended tagger from NLTK; it is relatively slow."""
    return tz.pipe(
        given_text,
        nltk.word_tokenize,
        lambda x: nltk.pos_tag(x),
        tz.filter(lambda x: x[1] in allowed_parts_of_speech),  # limit to allowed parts of speech
        tz.map(lambda x: x[0]),  # return only the token
        list,
        print_and_pass)
def next_page(page):
    try:
        return next(page.execute_page_transition_yield(
            lambda x: or_pipe(x,
                              _.find_elements_by_class_name("CMpaginate"),
                              _.find_elements_by_class_name("a-last"),
                              default=[]),
            _[0],
            _.find_elements_by_tag_name("a"),
            filter(lambda x: u"次" in x.text),
            list,
        )(_.click()))
    except Exception:
        return None
def get_posterior_probs_freq(num_words, all_streams_count_dict, this_stream_count_dict):
    """Return the posterior probabilities for the num_words most frequent
    tokens in this_stream_count_dict"""
    occurrence_minimum = 5  # the number of times a token must occur to be included
    return tz.pipe(
        get_top_tokens(num_words, this_stream_count_dict),
        tz.filter(lambda x: all_streams_count_dict[x[0]] >= occurrence_minimum),
        tz.map(lambda x: {
            'token': x[0],
            'occurrences': x[1],
            'posterior': calculate_posterior(
                all_streams_count_dict, this_stream_count_dict, x[0])}),
        lambda x: sorted(x, key=lambda y: -y['posterior']))
def get_all_scaling_group_servers(changes_since=None,
                                  server_predicate=identity):
    """
    Return tenant's servers that belong to any scaling group as
    {group_id: [server1, server2]} ``dict``. No specific ordering is guaranteed

    :param datetime changes_since: Get server since this time. Must be UTC
    :param server_predicate: function of server -> bool that determines whether
        the server should be included in the result.
    :return: dict mapping group IDs to lists of Nova servers.
    """

    def has_group_id(s):
        return 'metadata' in s and isinstance(s['metadata'], dict)

    def group_id(s):
        return group_id_from_metadata(s['metadata'])

    servers_apply = compose(keyfilter(lambda k: k is not None),
                            groupby(group_id),
                            filter(server_predicate),
                            filter(has_group_id))
    return get_all_server_details(changes_since).on(servers_apply)
def get_all_metrics(dispatcher, tenanted_groups, log, _print=False,
                    get_all_metrics_effects=get_all_metrics_effects):
    """
    Gather server data and produce metrics for all groups across all tenants
    in a region.

    :param dispatcher: An Effect dispatcher.
    :param dict tenanted_groups: Scaling Groups grouped on tenantid
    :param bool _print: Should the function print while processing?

    :return: ``list`` of `GroupMetrics` as `Deferred`
    """
    effs = get_all_metrics_effects(tenanted_groups, log, _print=_print)
    d = _perform_limited_effects(dispatcher, effs, 10)
    d.addCallback(filter(lambda x: x is not None))
    return d.addCallback(lambda x: reduce(operator.add, x, []))
def get_by_uuid(uuid, path='.'):
    """Get a Treant by short ID

    Args:
      uuid: a portion of the uuid
      path: the search path for Treants

    Returns:
      a Treant
    """
    return pipe(
        path,
        dtr.discover,
        list,
        filter(lambda x: uuid in x.uuid),
        list,
        get(0, default=None)
    )
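
# The final step above relies on toolz's curried get with a default, which
# turns "no matching Treant" into None instead of an IndexError:
from toolz.curried import get

get(0, default=None)([])          # -> None (nothing matched the uuid)
get(0, default=None)(['treant'])  # -> 'treant' (first match wins)
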
def annotate_bed_stream(bed_stream, bam_path, cutoff=10, extension=0,
                        contig_prefix='', bp_threshold=17000):
    """Annotate all intervals from a BED-file stream.

    Yields tuple data for each interval with calculated coverage and
    completeness.

    Args:
      bed_stream (sequence): usually a BED-file handle to read from
      bam_path (str): path to BAM-file
      cutoff (int, optional): threshold for completeness calculation,
        defaults to 10
      extension (int, optional): number of bases to extend each interval
        with (+/-), defaults to 0
      contig_prefix (str, optional): rename contigs by prefixing,
        defaults to empty string
      bp_threshold (int, optional): optimization threshold for reading
        BAM-file in chunks, default to 17000

    Yields:
      tuple: :class:`chanjo.BaseInterval`, coverage (float), and
        completeness (float)
    """
    # setup: connect to BAM-file
    bam = BamFile(bam_path)

    # the pipeline
    return pipe(
        bed_stream,
        filter(complement(comment_sniffer)),         # filter out comments
        map(text_type.rstrip),                       # strip invisible chars.
        map(prefix(contig_prefix)),                  # prefix to contig
        map(split(sep='\t')),                        # split lines
        map(do(validate_bed_format)),                # check correct format
        map(lambda row: bed_to_interval(*row)),      # convert to objects
        map(extend_interval(extension=extension)),   # extend intervals
        group_intervals(bp_threshold=bp_threshold),  # group by threshold
        map(process_interval_group(bam)),            # read coverage
        concat,                                      # flatten list of lists
        map(calculate_metrics(threshold=cutoff))     # calculate cov./compl.
    )
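
# toolz.complement simply negates a predicate; a tiny sketch of how the
# complement(comment_sniffer) step above skips comment lines (the
# comment-detection lambda here is made up):
from toolz import complement

is_comment = lambda line: line.startswith('#')
list(filter(complement(is_comment), ['#header', 'chr1\t10\t20']))
# -> ['chr1\t10\t20']
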
def get_clb_contents():
    """Get Rackspace Cloud Load Balancer contents as list of `CLBNode`."""
    # If we get a CLBNotFoundError while fetching feeds, we should throw away
    # all nodes related to that load balancer, because we don't want to act on
    # data that we know is invalid/outdated (for example, if we can't fetch a
    # feed because CLB was deleted, we don't want to say that we have a node in
    # DRAINING with draining time of 0; we should just say that the node is
    # gone).
    def gone(r):
        return catch(CLBNotFoundError, lambda exc: r)

    lb_ids = [lb['id'] for lb in (yield _retry(get_clbs()))]
    node_reqs = [_retry(get_clb_nodes(lb_id).on(error=gone([])))
                 for lb_id in lb_ids]
    all_nodes = yield parallel(node_reqs)
    lb_nodes = {lb_id: [CLBNode.from_node_json(lb_id, node) for node in nodes]
                for lb_id, nodes in zip(lb_ids, all_nodes)}
    draining = [n for n in concat(lb_nodes.values())
                if n.description.condition == CLBNodeCondition.DRAINING]
    feeds = yield parallel(
        [_retry(get_clb_node_feed(n.description.lb_id, n.node_id).on(
            error=gone(None)))
         for n in draining]
    )
    nodes_to_feeds = dict(zip(draining, feeds))
    deleted_lbs = set([
        node.description.lb_id
        for (node, feed) in nodes_to_feeds.items() if feed is None])

    def update_drained_at(node):
        feed = nodes_to_feeds.get(node)
        if node.description.lb_id in deleted_lbs:
            return None
        if feed is not None:
            return assoc_obj(node, drained_at=extract_CLB_drained_at(feed))
        else:
            return node

    nodes = map(update_drained_at, concat(lb_nodes.values()))
    yield do_return(list(filter(bool, nodes)))