def get_8yrs_mg_and_ms(
        code: str,
        getter: Callable[[str], pd.DataFrame] = rdb.get_financial_indicator_by_code
) -> Optional[dict]:
    """Get the geometric mean of a listed company's gross-margin growth rate
    over 8 (or more) years and return it as a dict.

    Input assumptions:
        code: stock code. The intermediate result may be None; otherwise it
            looks like (-0.02, [('2011', -0.1523), ('2012', 0.2447), ..., ('2018', 0.1009)]),
            where -0.02 is the geometric mean of the growth rates.

    Output contract:
        {'mg': -0.02,
         'years_mgr': [('2011', -0.1523), ('2012', 0.2447), ..., ('2018', 0.1009)],
         'ms': 2.87,
         'years_mg': [('2011', 0.421819), ('2012', 0.466875), ..., ('2018', 0.206218)]}
    """
    print(code)
    result = pipe(
        getter(_convert_to_ts_code(code)),
        _get_yrs_gm,
        _get_last_9_years_fi,
        _sort_years_fi,
        juxt(compose(_calc_gmean_8yrs_mg, _calc_8yrs_mg),
             compose(_calc_years_ms, _get_8_years_gm)))
    if result[0] is not None:
        return {
            'mg': result[0][0],
            'years_mgr': result[0][1],
            'ms': result[1][0],
            'years_mg': result[1][1]
        }
    else:
        return None
def main():
    # URLs to rss.xml feeds
    URLS = []
    title_hashes = load_title_hashes()
    pipe(
        URLS,
        (
            say("Fetching {} feeds...", len),
            fetch_feeds(feed_fetcher),
            parse_feeds,
            say("Found {} articles", len),
            remove_duplicates,
            say("Of those, {} are unique", len),
            hash_titles,
            remove_seen(have_seen(title_hashes)),
            say("and {} are new", len),
            say("Fetching article bodies...", id),
            fetch_bodies(body_fetcher),
            classify(NB_classifier()),
            sort,
            mark_as_seen(title_hashes),
            format_as_text,
            toolz.juxt(print, sendmail([])),
        ),
    )
def run(self, *queries: Neo4jAbstractQuery):
    results = juxt(*queries)(self.neo4j_client)
    if len(results) > 1:
        return results
    elif len(results) == 1:
        return results[0]
    else:
        return None
def main(command_fn):
    with CD("acg"):
        from main import AnkiCardGenApp

        app = AnkiCardGenApp()
        app.on_start = toolz.juxt(app.on_start, command_fn)
        app.run()
class Params():
    coordinate = lens['coordinate']
    coordinates = juxt([
        coordinate['latitude'].get(),
        coordinate['longitude'].get(),
        compose(constrain(0, 1), float, lens.Get('tolerance', '0.5').get()),
        compose(constrain(10, 19), int, lens.Get('zoom', 16).get())
    ])
    image_url = lens['image_url'].get()
def check_lines(lines):
    errors = [
        msg for boolean, msg in juxt(
            not_correct_order,
            not_enough_tags,
        )(lines) if boolean
    ]
    if errors:
        return False, '\n'.join(errors)
    return True, ''
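# Note: each validator above is expected to return a (bool, message) pair, since the
# comprehension unpacks every element of the tuple that juxt produces. A minimal,
# hypothetical sketch of such validators (not the project's real implementations):
from toolz import juxt

def not_correct_order(lines):
    # hypothetical check: flag lines that are not already sorted
    return lines != sorted(lines), "lines are out of order"

def not_enough_tags(lines):
    # hypothetical check: flag inputs with fewer than two lines
    return len(lines) < 2, "not enough tags"

ok, report = check_lines(["b", "a"])  # ok is False; report lists the failing checks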
def uni_and_bigram_tuples(string, minlen=3, maxlen=25):
    return tlz.pipe(string,
                    lower,
                    simple_split,
                    filter_longer_than(maxlen),
                    tlz.compose(tlz.concat, map_c(splitter_of_words)),
                    filter_shorter_than(minlen),
                    filter_stopwords,
                    tuple,
                    tlz.juxt(sliding_window_c(1), sliding_window_c(2)),
                    tlz.interleave,
                    map_c(join_strings("_")))
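# The juxt/interleave step at the end pairs each unigram with the bigram that starts
# at the same position. Roughly, using plain toolz (the project's curried helpers such
# as sliding_window_c are assumed to wrap these):
from toolz import interleave, juxt, sliding_window

tokens = ("quick", "brown", "fox")
mixed = list(interleave(juxt(
    lambda t: sliding_window(1, t),
    lambda t: sliding_window(2, t),
)(tokens)))
# -> [('quick',), ('quick', 'brown'), ('brown',), ('brown', 'fox'), ('fox',)]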
def export_intervals(chanjo_db, include_header=True, bed_score=0):
    r"""Return BED-formatted interval lines from existing ``chanjo_db``.

    BED lines are ready to be printed or written to a file.

    Args:
        chanjo_db (session): ``sqlalchemy.orm.session`` object with a
            ``.query``-method
        include_header (bool, optional): whether to include BED header
        bed_score (int, optional): dummy score (0-1000) to insert at field 5
            to complete the BED format

    Yields:
        str: stringified and tab-delimited interval

    Examples:
        >>> from chanjo import export_intervals, Store
        ... # instantiate a new connection to a Chanjo database
        >>> db = Store('./coverage.sqlite3')
        >>> with open('intervals.sorted.bed', 'w') as stream:
        ...     # write intervals in BED-format with appropriate headers
        ...     for bed_line in export_intervals(db):
        ...         stream.write(bed_line + '\n')
    """
    if include_header:
        yield '#chrom\tchromStart\tchromEnd\tname\tscore\tstrand'

    # set up which columns to fetch to make the BED file
    # column 5 is just a silly default for the "score" field in BED
    i = Interval  # alias
    columns = (i.contig, i.start - 1, i.end, i.id, i.strand)

    # BED files are tab-delimited
    delimiter = '\t'

    # 1. fetch interval tuples from the database (producer)
    # 2. stringify each item in each subsequence (interval tuple)
    # 3. join lines on tab-character
    # 4. prepend the header
    bed_lines = pipe(
        fetch_records(chanjo_db, columns),
        map(map(str)),                        # convert fields to strings
        map(juxt(compose(list, take(4)),      # keep first 4 fields
                 lambda _: [str(bed_score)],  # insert BED score
                 compose(list, last))),       # keep last field
        map(concat),                          # flatten each item
        map(delimiter.join)                   # join on \t
    )

    for bed_line in bed_lines:
        yield bed_line
def find_ranges(seq: Sequence[TS]) -> Sequence[Tuple[TS, TS]]:
    # consecutive dates have the same offset from their index
    ranges = pipe(
        seq,
        enumerate,                             # -> Iterator[Tuple[int, TS]]
        # groupby will include the last consecutive ts
        groupby(offset_between_index_and_ts),  # -> Dict[int, List[Tuple[int, TS]]]
        dict.values,                           # -> Iterator[List[Tuple[int, TS]]]
        map(map(last)),                        # -> Iterator[Iterator[TS]]
        map(list),                             # -> Iterator[List[TS]]
        # the range stop is the day after the last date in the group
        map(juxt(first, compose(add(pd.Timedelta(days=1)), last))),  # -> Iterator[Tuple[TS, TS]]
        list,                                  # -> List[Tuple[TS, TS]]
    )
    return ranges
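# The idea above: consecutive daily timestamps keep a constant difference from their
# position in the sequence, so grouping by that offset splits the input into maximal
# consecutive runs. A stripped-down sketch of the same idea with plain toolz/pandas
# (helper names here are illustrative, not the originals):
import pandas as pd
from toolz import groupby

def find_ranges_sketch(seq):
    # runs of consecutive days share one value of ts - i days
    groups = groupby(lambda pair: pair[1] - pd.Timedelta(days=pair[0]), enumerate(seq))
    # each range is (first day, day after the last day)
    return [(g[0][1], g[-1][1] + pd.Timedelta(days=1)) for g in groups.values()]

dates = [pd.Timestamp("2021-01-01"), pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-05")]
# find_ranges_sketch(dates) -> [(2021-01-01, 2021-01-03), (2021-01-05, 2021-01-06)]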
def run_filters(filters, inpath, outpath, file):
    logging.debug(f"Processing file {file}")
    os.makedirs(os.path.join(outpath, os.path.dirname(file)), exist_ok=True)
    with open(os.path.join(inpath, file), 'r') as fin:
        with open(os.path.join(outpath, file), 'w') as fout:
            for article_json in fin:
                article = json.loads(article_json)
                filter_results = juxt([f.filter for f in filters])(article)
                if all(filter_results):
                    logging.info(f'Including article "{article["title"]}"')
                    fout.writelines([article_json])
                else:
                    failed_filters = [
                        filters[i].name
                        for i, filter_ok in enumerate(filter_results)
                        if not filter_ok
                    ]
                    logging.info(
                        f'Excluding article "{article["title"]}". Filters '
                        f'[{", ".join(failed_filters)}] failed')
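# juxt([f.filter for f in filters])(article) evaluates every filter against the same
# article and returns a tuple of booleans, which is what the all(...) check and the
# failed-filter report rely on. A minimal sketch of the filter objects this assumes
# (the names and predicates are illustrative):
from dataclasses import dataclass
from typing import Callable

@dataclass
class ArticleFilter:
    name: str
    filter: Callable[[dict], bool]  # True means "keep the article"

filters = [
    ArticleFilter("has_title", lambda a: bool(a.get("title"))),
    ArticleFilter("long_enough", lambda a: len(a.get("text", "")) > 200),
]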
def rowfunc(t):
    children = [optimize(child, []) for child in t.children]
    funcs = [rrowfunc(_child, t._child) for _child in children]
    return compose(concat_maybe_tuples, juxt(*funcs))
def linear_program(graph: nx.DiGraph, paths: Paths) -> Result:
    cost = edge_cost(graph.number_of_edges())
    trips = list(set(map(trip, paths)))
    least_path_cost = min_path_cost(len(trips))
    gamma = cp.Parameter(nonneg=True)
    gamma.value = 0.0

    edge_index = {e: i for i, e in enumerate(graph.edges)}
    trip_index = {t: i for i, t in enumerate(trips)}

    edge_cost_of = get_by_node_pair(edge_index, cost)
    trip_cost_of = get_by_node_pair(trip_index, least_path_cost)
    out_edges_of_node = out_edges(successors(graph))  # Node -> Iterable[NodePair]

    min_trip_cost_from_path = compose(
        trip_cost_of,  # NodePair -> cp.Variable
        trip           # Path -> NodePair
    )  # Path -> cp.Variable

    compute_gaps = compose(
        starmap(path_cost_gap),
        map(juxt(min_trip_cost_from_path, path_cost(edge_cost_of)))
    )  # Iterable[Path] -> Iterable[float]

    out_edge_cost_is_normalized = compose(
        edge_cost_is_normalized(edge_cost_of),
        out_edges_of_node,
    )  # Node -> Constraint

    constraints = []
    constraints.extend(map(out_edge_cost_is_normalized, graph.nodes))
    constraints.extend(map(gap_is_optimal, compute_gaps(paths)))

    other_paths = concat(map(suboptimal_paths(graph.nodes, trip_indexed_paths(paths)), trips))
    other_paths = list(other_paths)
    suboptimal_gaps = compute_gaps(other_paths)
    constraints.extend(map(gap_is_suboptimal, suboptimal_gaps))

    out_edge_entropy = compose(
        edges_entropy(edge_cost_of),  # Iterable[NodePair] -> cp.Expression
        out_edges_of_node,            # Node -> Iterable[NodePair]
    )  # Node -> cp.Expression

    total_out_edge_entropy = compose(
        sum,                    # Iterable[cp.Expression] -> cp.Expression
        map(out_edge_entropy)   # Iterable[Node] -> Iterable[cp.Expression]
    )  # Iterable[Node] -> cp.Expression

    objective = total_out_edge_entropy(graph.nodes) - cp.sum(least_path_cost)

    problem = cp.Problem(
        cp.Maximize(objective),
        constraints
    )
    return Result(
        problem=problem,
        penalty=gamma,  # TODO: remove unused
        edge_cost=cost,
        min_trip_cost=least_path_cost,
        edge_index=edge_index,
        trip_index=trip_index,
        discovered_paths=other_paths,
    )
def compute_chunk(self, graph, dates, assets, initial_workspace):
    """
    Compute the Pipeline terms in the graph for the requested start and end
    dates.

    Parameters
    ----------
    graph : zipline.pipeline.graph.TermGraph
    dates : pd.DatetimeIndex
        Row labels for our root mask.
    assets : pd.Int64Index
        Column labels for our root mask.
    initial_workspace : dict
        Map from term -> output.
        Must contain at least an entry for `self._root_mask_term` whose shape
        is `(len(dates), len(assets))`, but may contain additional
        pre-computed terms for testing or optimization purposes.

    Returns
    -------
    results : dict
        Dictionary mapping requested results to outputs.
    """
    self._validate_compute_chunk_params(dates, assets, initial_workspace)
    get_loader = self.get_loader

    # Copy the supplied initial workspace so we don't mutate it in place.
    workspace = initial_workspace.copy()

    # If loadable terms share the same loader and extra_rows, load them all
    # together.
    loader_group_key = juxt(get_loader, getitem(graph.extra_rows))
    loader_groups = groupby(loader_group_key, graph.loadable_terms)

    for term in graph.ordered():
        # `term` may have been supplied in `initial_workspace`, and in the
        # future we may pre-compute loadable terms coming from the same
        # dataset.  In either case, we will already have an entry for this
        # term, which we shouldn't re-compute.
        if term in workspace:
            continue

        # Asset labels are always the same, but date labels vary by how
        # many extra rows are needed.
        mask, mask_dates = self._mask_and_dates_for_term(
            term, workspace, graph, dates
        )

        if isinstance(term, LoadableTerm):
            to_load = sorted(
                loader_groups[loader_group_key(term)],
                key=lambda t: t.dataset
            )
            loader = get_loader(term)
            loaded = loader.load_adjusted_array(
                to_load, mask_dates, assets, mask,
            )
            workspace.update(loaded)
        else:
            workspace[term] = term._compute(
                self._inputs_for_term(term, workspace, graph),
                mask_dates,
                assets,
                mask,
            )
            assert workspace[term].shape == mask.shape

    out = {}
    graph_extra_rows = graph.extra_rows
    for name, term in iteritems(graph.outputs):
        # Truncate off extra rows from outputs.
        out[name] = workspace[term][graph_extra_rows[term]:]
    return out
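# In compute_chunk above, juxt(get_loader, getitem(graph.extra_rows)) builds a grouping
# key of the form (loader, extra_rows), so terms that can be loaded together land in the
# same bucket. The same grouping pattern in isolation, with toy data instead of zipline
# terms:
from toolz import groupby, juxt

extra_rows = {"close": 5, "open": 5, "volume": 0}
loaders = {"close": "pricing_loader", "open": "pricing_loader", "volume": "pricing_loader"}

group_key = juxt(loaders.__getitem__, extra_rows.__getitem__)
groups = groupby(group_key, ["close", "open", "volume"])
# -> {('pricing_loader', 5): ['close', 'open'], ('pricing_loader', 0): ['volume']}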
)

xpro_courses_etl = compose(
    loaders.load_courses, xpro.transform_courses, xpro.extract_courses
)

mitx_etl = compose(
    loaders.load_courses,
    # take the first argument (the output of mitx.transform)
    first,
    # duplicate the raw responses into two streams between our transformation
    # code and the ocw/mitx manifest upload
    juxt(
        log_exceptions("Error transforming MITx response", exc_return_value=[])(
            mitx.transform
        ),
        # for the sake of not touching OCW code, we've implemented this function
        # here in discussions; it takes the concatenated raw results from MITx
        # and uploads them as a json file to the OCW bucket. We'll probably do
        # away with this at a later date when we can easily move it into OCW.
        log_exceptions("Error uploading MITx manifest to OCW")(
            ocw.upload_mitx_course_manifest
        ),
    ),
    log_exceptions("Error extracting MITx catalog", exc_return_value=[])(mitx.extract),
)

oll_etl = compose(loaders.load_courses, oll.transform, oll.extract)

see_etl = compose(loaders.load_courses, see.transform, see.extract)

mitpe_etl = compose(loaders.load_courses, mitpe.transform, mitpe.extract)

youtube_etl = compose(loaders.load_video_channels, youtube.transform, youtube.extract)
def closure(*args, **kwargs) -> tuple:
    m = toolz.compose(post, _tt_flatten, toolz.juxt([tuple_wrap(f) for f in funcs]), pre)
    return m(*args, **kwargs)
from toolz import groupby, juxt

inc = lambda x: x + 1
double = lambda x: x * 2
exp = lambda x: x * x

a = juxt(inc, double, exp)(10)
b = juxt([inc, double, exp])(10)
c = 0
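# For reference: toolz.juxt accepts either separate callables or a single iterable of
# callables, and returns a tuple of each function applied to the same arguments, so
# both forms above give the same result.
assert a == (11, 20, 100)
assert b == (11, 20, 100)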
from .models import WeatherUndergroundObservation, WeatherUndergroundObservationSchema  # NOQA

log = logging.getLogger(__name__)

get_observations = compose(                # extract observations from api
    extract_observations,                  # get observations from payload
    get(0),                                # drop the deserialization errors
    WeatherUndergroundAPIResponse().load,  # deserialize api response
    query_api                              # query the api
)

collect_data = compose(                    # create observation models from api response
    do(compose(log.info, "Created {} observations".format, len)),
    process_response,                      # create observations models
    fapply(map),                           # merge metadata into each observation
    juxt(process_metadata, get_observations)  # query params as metadata
)


def collect_many(api_key, on_dates, zipcodes, t):
    """Collect data over many dates and zipcodes

    :param api_key: str weather underground api key
    :param on_dates: list of dates
    :param zipcodes: list of zipcodes
    :param t: float delay between api calls
    :return: list of observations
    """
    collect_one = curry(collect_data)
    process = compose(flatten, map_sleep(t, fapply(collect_one(api_key))), zip)
    return process(on_dates, zipcodes)
def filter_titles_regex_whitelist(whitelist_regexes, article):
    return any(juxt([x.match for x in whitelist_regexes])(article['title']))
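# juxt([x.match for x in whitelist_regexes]) applies every compiled pattern's .match to
# the same title, returning a tuple of match objects or Nones; any(...) is then true as
# soon as one whitelist pattern matches the start of the title. A small usage sketch
# (the patterns and article are made up for illustration):
import re

whitelist = [re.compile(r"python", re.I), re.compile(r"toolz", re.I)]
article = {"title": "Toolz 1.0 released"}

assert filter_titles_regex_whitelist(whitelist, article) is True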
    ts = start_ts_parsed_from_raw_data(raw_data)
    # the date might need to be corrected
    return (ts - pd.Timedelta(days=(0 if ts.hour > 18 else 1))).round("D")


ROOT_FIELDS = ["timeInBed", "minutesAsleep", "efficiency"]
LEVELS_FIELDS = ["wake", "light", "deep", "rem"]
HEADER = ["startTime"] + ROOT_FIELDS + LEVELS_FIELDS

parsed_levels = [
    compose(get("minutes"), level_getter, get("summary"), get("levels"))
    for level_getter in map(get(default={"minutes": 0}), LEVELS_FIELDS)
]

parsed_row = juxt(
    start_date_parsed_from_raw_data,
    start_ts_parsed_from_raw_data,
    *map(get, ROOT_FIELDS),
    *parsed_levels,
)

longest_sleep = curry(max)(key=get(2))


def parsed_data(raw_data: RawData) -> pd.DataFrame:
    parsed_rows: Iterator[Tuple] = map(parsed_row, raw_data)
    dates_sleeps: List[List[Tuple]] = groupby(first, parsed_rows).values()
    dates_longest_sleep: Iterator[Tuple] = map(longest_sleep, dates_sleeps)
    parsed_columns: List[Tuple] = list(zip(*dates_longest_sleep))
    dates = pd.DatetimeIndex(parsed_columns[0], name="dates")
    body: List[Tuple] = list(zip(*parsed_columns[1:]))
    if len(body[0]) != len(HEADER):
        breakpoint()
    data = pd.DataFrame(body, columns=HEADER, index=dates)
def rowfunc(t):
    funcs = [rrowfunc(_child, t._child) for _child in t.children]
    return compose(concat_maybe_tuples, juxt(*funcs))
def compute_chunk(self, graph, dates, assets, initial_workspace):
    """
    Compute the Pipeline terms in the graph for the requested start and end
    dates.

    Parameters
    ----------
    graph : zipline.pipeline.graph.TermGraph
    dates : pd.DatetimeIndex
        Row labels for our root mask.
    assets : pd.Int64Index
        Column labels for our root mask.
    initial_workspace : dict
        Map from term -> output.
        Must contain at least an entry for `self._root_mask_term` whose shape
        is `(len(dates), len(assets))`, but may contain additional
        pre-computed terms for testing or optimization purposes.

    Returns
    -------
    results : dict
        Dictionary mapping requested results to outputs.
    """
    self._validate_compute_chunk_params(dates, assets, initial_workspace)
    get_loader = self.get_loader

    # Copy the supplied initial workspace so we don't mutate it in place.
    workspace = initial_workspace.copy()
    refcounts = graph.initial_refcounts(workspace)
    execution_order = graph.execution_order(refcounts)

    # If loadable terms share the same loader and extra_rows, load them all
    # together.
    loadable_terms = graph.loadable_terms
    loader_group_key = juxt(get_loader, getitem(graph.extra_rows))
    loader_groups = groupby(
        loader_group_key,
        # Only produce loader groups for the terms we expect to load.  This
        # ensures that we can run pipelines for graphs where we don't have a
        # loader registered for an atomic term if all the dependencies of
        # that term were supplied in the initial workspace.
        (t for t in execution_order if t in loadable_terms),
    )

    for term in graph.execution_order(refcounts):
        # `term` may have been supplied in `initial_workspace`, and in the
        # future we may pre-compute loadable terms coming from the same
        # dataset.  In either case, we will already have an entry for this
        # term, which we shouldn't re-compute.
        if term in workspace:
            continue

        # Asset labels are always the same, but date labels vary by how
        # many extra rows are needed.
        mask, mask_dates = graph.mask_and_dates_for_term(
            term,
            self._root_mask_term,
            workspace,
            dates,
        )

        if isinstance(term, LoadableTerm):
            to_load = sorted(
                loader_groups[loader_group_key(term)],
                key=lambda t: t.dataset
            )
            loader = get_loader(term)
            loaded = loader.load_adjusted_array(
                to_load, mask_dates, assets, mask,
            )
            assert set(loaded) == set(to_load), (
                'loader did not return an AdjustedArray for each column\n'
                'expected: %r\n'
                'got: %r' % (sorted(to_load), sorted(loaded))
            )
            workspace.update(loaded)
        else:
            workspace[term] = term._compute(
                self._inputs_for_term(term, workspace, graph),
                mask_dates,
                assets,
                mask,
            )
            if term.ndim == 2:
                assert workspace[term].shape == mask.shape
            else:
                assert workspace[term].shape == (mask.shape[0], 1)

        # Decref dependencies of ``term``, and clear any terms whose
        # refcounts hit 0.
        for garbage_term in graph.decref_dependencies(term, refcounts):
            del workspace[garbage_term]

    out = {}
    graph_extra_rows = graph.extra_rows
    for name, term in iteritems(graph.outputs):
        # Truncate off extra rows from outputs.
        out[name] = workspace[term][graph_extra_rows[term]:]
    return out
def rowfunc(t):
    funcs = list(map(recursive_rowfunc, t.children))
    return compose(concat_maybe_tuples, juxt(*funcs))