def _can_merge(self, graph: nx.Graph, pair: tuple[Node, Node]) -> bool:
    succ = self.tree.neighbors
    nodes = self.tree.nodes
    stack, visited = [pair], set()
    while stack:  # DFS for inconsistency in states.
        left, right = stack.pop()
        if (left, right) in visited:
            continue
        visited.add((left, right))

        if (left, right) in graph.edges:
            return False  # Reached known distinguished nodes.

        left_lbl = nodes[left].get('label')
        right_lbl = nodes[right].get('label')
        if None not in {left_lbl, right_lbl} and left_lbl != right_lbl:
            return False  # Discovered distinguishing path.

        # Group neighbors by access token.
        succ_left = {nodes[n]['source']: n for n in succ(left)}
        succ_right = {nodes[n]['source']: n for n in succ(right)}
        merged = list(fn.merge_with(set, succ_left, succ_right).values())

        # Interchanging pair[0] and pair[1] is applicable.
        for p1, p2 in [pair, pair[::-1]]:
            merged.extend([(p | {p1}) - {p2} for p in merged if p2 in p])

        # Add un-reconciled successors to stack.
        stack.extend([p for p in merged if len(p) == 2])
    return True
def _extract_data(topic_model, corpus, dictionary):
    doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]
    term_freqs_dict = fp.merge_with(sum, *corpus)
    vocab = list(dictionary.token2id.keys())
    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    beta = 0.01
    term_freqs = [term_freqs_dict.get(tid, beta)
                  for tid in dictionary.token2id.values()]
    gamma, _ = topic_model.inference(corpus)
    doc_topic_dists = _normalize(gamma)

    topics = topic_model.show_topics(formatted=False,
                                     num_words=len(vocab),
                                     num_topics=topic_model.num_topics)
    topics_df = pd.DataFrame([dict((y, x) for x, y in tuples) for tuples in topics])[vocab]
    topic_term_dists = topics_df.values

    return {
        "topic_term_dists": topic_term_dists,
        "doc_topic_dists": doc_topic_dists,
        "doc_lengths": doc_lengths,
        "vocab": vocab,
        "term_frequency": term_freqs,
    }
def _extract_data(topic_model, corpus, dictionary):
    doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]
    term_freqs_dict = fp.merge_with(sum, *corpus)
    vocab = [dictionary[id] for id in term_freqs_dict.keys()]
    term_freqs = term_freqs_dict.values()

    if str(type(topic_model)) == "<class 'gensim.models.ldamodel.LdaModel'>":
        gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = _normalize(gamma)
        topic_term_dists = _normalize(topic_model.state.get_lambda())
    elif str(type(topic_model)) == "<class 'gensim.models.hdpmodel.HdpModel'>":
        gamma = topic_model.inference(corpus)
        doc_topic_dists = _normalize(gamma)
        topic_model.update_expectations()
        topic_term_dists = _normalize(topic_model.m_lambda)
    else:
        raise TypeError("Unknown gensim model")

    return {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_freqs
    }
def _extract_data(topic_model, corpus, dictionary):
    doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]
    term_freqs_dict = fp.merge_with(sum, *corpus)
    vocab = list(dictionary.token2id.keys())
    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    beta = 0.01
    term_freqs = [
        term_freqs_dict.get(tid, beta)
        for tid in dictionary.token2id.values()
    ]
    gamma, _ = topic_model.inference(corpus)
    doc_topic_dists = _normalize(gamma)

    topics = topic_model.show_topics(formatted=False,
                                     num_words=len(vocab),
                                     num_topics=topic_model.num_topics)
    topics_df = pd.DataFrame(
        [dict((y, x) for x, y in tuples) for tuples in topics])[vocab]
    topic_term_dists = topics_df.values

    return {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_freqs
    }
def _extract_data(topic_model, corpus, dictionary):
    doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]
    term_freqs_dict = fp.merge_with(sum, *corpus)
    vocab = fp.map(dictionary, term_freqs_dict.keys())
    term_freqs = term_freqs_dict.values()
    gamma, _ = topic_model.inference(corpus)
    doc_topic_dists = np.array([r / sum(r) for r in gamma])

    topics = topic_model.show_topics(formatted=False,
                                     num_words=len(vocab),
                                     num_topics=topic_model.num_topics)
    topics_df = pd.DataFrame(
        [dict((y, x) for x, y in tuples) for tuples in topics])[vocab]
    topic_term_dists = topics_df.values

    return {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_freqs
    }
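# A minimal, self-contained sketch of the fp.merge_with(sum, *corpus) idiom used
# in the _extract_data variants above. Each document is a gensim-style
# bag-of-words, i.e. a list of (term_id, count) pairs; merging with sum yields
# corpus-wide term frequencies. The toy corpus here is illustrative only.
import funcy as fp

toy_corpus = [[(0, 2), (1, 1)], [(1, 3), (2, 1)]]
term_freqs_dict = fp.merge_with(sum, *toy_corpus)
# term_freqs_dict == {0: 2, 1: 4, 2: 1}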
def post_processing(mongo, batch_size=100, max_workers=50):
    indexer = Indexer(mongo)
    start_block = indexer.get_checkpoint('post_processing')

    query = {
        "block_num": {
            "$gt": start_block,
            "$lte": start_block + batch_size,
        }
    }
    projection = {
        '_id': 0,
        'body': 0,
        'json_metadata': 0,
    }
    results = list(mongo.Operations.find(query, projection=projection))
    batches = map(parse_operation, results)

    # handle an edge case when we are too close to the head,
    # and the batch contains no work to do
    if not results and is_recent(start_block, days=1):
        return

    # squash for duplicates
    def custom_merge(*args):
        return list(set(keep(flatten(args))))

    batch_items = merge_with(custom_merge, *batches)

    # only process accounts if the blocks are recent
    # scrape_all_users should take care of stale updates
    if is_recent(start_block, days=10):
        accounts = set(batch_items['accounts_light'] + batch_items['accounts'])
        list(thread_multi(
            fn=update_account,
            fn_args=[mongo, None],
            dep_args=list(accounts),
            fn_kwargs=dict(load_extras=False),
            max_workers=max_workers,
            re_raise_errors=False,
        ))
        list(thread_multi(
            fn=update_account_ops_quick,
            fn_args=[mongo, None],
            dep_args=list(accounts),
            fn_kwargs=None,
            max_workers=max_workers,
            re_raise_errors=False,
        ))

    index = max(lpluck('block_num', results))
    indexer.set_checkpoint('post_processing', index)

    log.info("Checkpoint: %s - %s accounts (+%s full)" % (
        index,
        len(batch_items['accounts_light']),
        len(batch_items['accounts']),
    ))
def parse_xml_for_trial_id(res_content):
    tree = ElementTree.fromstring(res_content)
    # find the first child that is search results and get all child nodes that are trials
    trial_results = tree.find('SearchResults').findall('Trial')
    # trial_results is a list of child nodes;
    # the trial id is found in the .items() of each child node
    try:
        trial_ids = funcy.merge_with(list, *map(lambda x: dict(x.items()), trial_results))
        # flatten list of trial ids and return
        return trial_ids['Id']
    except KeyError as e:
        print(trial_ids, e)
        return None
def _extract_data(topic_model, docs):
    doc_lengths = [np.array(d.values()).sum() for d in docs]
    term_freqs_dict = fp.merge_with(sum, *docs)
    vocab = term_freqs_dict.keys()
    term_freqs = term_freqs_dict.values()
    doc_topic_dists = np.vstack(topic_model.predict(docs, output_type='probabilities'))
    topics = _topics_as_df(topic_model)
    topic_term_dists = topics.T[vocab].values
    return {'topic_term_dists': topic_term_dists,
            'doc_topic_dists': doc_topic_dists,
            'doc_lengths': doc_lengths,
            'vocab': vocab,
            'term_frequency': term_freqs}
def _extract_data(topic_model, corpus, dictionary):
    doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]
    term_freqs_dict = fp.merge_with(sum, *corpus)
    vocab = [dictionary[id] for id in term_freqs_dict.keys()]
    term_freqs = term_freqs_dict.values()
    gamma, _ = topic_model.inference(corpus)
    doc_topic_dists = _normalize(gamma)
    topic_term_dists = _normalize(topic_model.state.get_lambda())
    return {'topic_term_dists': topic_term_dists,
            'doc_topic_dists': doc_topic_dists,
            'doc_lengths': doc_lengths,
            'vocab': vocab,
            'term_frequency': term_freqs}
def get_vlan_usersP(bras):
    def _get_vlan_users(bas):
        funcs = {'m6k': M6k.get_vlan_users,
                 'me60': ME60.get_vlan_users}
        _gvu = partial(_model, funcs)
        return _gvu(bas)

    bras = [dict(ip=x[0], model=x[1], inf=x[2]) for x in bras]
    pool = Pool(len(bras))
    temp = pool.map(_get_vlan_users, bras)
    pool.close()
    pool.join()
    temp = [x[1] for x in temp if x[1]]
    rslt = reduce(lambda x, y: merge_with(sum, x, y), temp)
    return rslt
def get_wordcloud(platform, course_code, username=None, start_date=None, end_date=None):
    #print "get_wordcloud", platform, course_code
    docs = None
    ids = None
    documents = None
    if username is not None:
        docs, ids = get_allcontent_byplatform(platform, course_code, username=username,
                                              start_date=start_date, end_date=end_date)
    else:
        docs, ids = get_allcontent_byplatform(platform, course_code,
                                              start_date=start_date, end_date=end_date)
    documents = remove_stopwords(docs)
    #print documents

    # Make dictionary
    dictionary = corpora.Dictionary(documents)

    # Create and save corpus
    corpus = [dictionary.doc2bow(text) for text in documents]

    # Calculate term frequencies
    term_freqs_dict = fp.merge_with(sum, *corpus)
    N = len(term_freqs_dict)
    vocab = [dictionary[id] for id in xrange(N)]
    freqs = [term_freqs_dict[id] for id in xrange(N)]
    term_freqs = zip(vocab, freqs)

    word_tags = []
    for term_freq_pair in term_freqs:
        #print "term_freq_pair", term_freq_pair
        if (not term_freq_pair[0].startswith('http')) or (term_freq_pair[0] == '-'):
            weight = 0
            if type(term_freq_pair[1]) is tuple:
                weight = int(term_freq_pair[1][1])
            else:
                weight = int(term_freq_pair[1])
            if weight > 3:
                #print weight
                word_tags.append('{"text": "%s", "weight": %d},' % (term_freq_pair[0], weight))
                #word_tags.append('["%s", %d],' % (term_freq_pair[0], weight))
                #word_tags.append('<li class="tag%d"><a href="#">%s</a></li>' % (term_freq_pair[1], term_freq_pair[0]))

    tags = "[" + ''.join(word_tags)[:-1] + "]"
    #print tags
    return tags
def personality_df(path=os.getcwd()):
    files = []
    # r=root, d=directories, f=files
    for r, d, f in os.walk(path):
        for file in f:
            if 'profile.json' in file:
                files.append(os.path.join(r, file))

    for num, f in enumerate(files):
        files[num] = json_to_dict(f)

    header = list(files[0].keys())
    multi_dict = merge_with(list, *files)
    df = pd.DataFrame(multi_dict, columns=header)
    return df
def _extract_data(topic_model, corpus, dictionary):
    doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]
    term_freqs_dict = fp.merge_with(sum, *corpus)
    N = len(term_freqs_dict)
    vocab = [dictionary[id] for id in xrange(N)]
    term_freqs = [term_freqs_dict[id] for id in xrange(N)]
    gamma, _ = topic_model.inference(corpus)
    doc_topic_dists = _normalize(gamma)
    topic_term_dists = _normalize(topic_model.state.get_lambda())
    return {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_freqs
    }
def apply_window(time_val):
    t, _ = time_val
    values = self[start + t:end + t].values()
    # Note: {} forces application of tuple.
    values = fn.merge_with(tuple, {}, *values)
    return (t, values)
def __or__(self, other):
    return self.evolve(
        data=fn.merge_with(lambda x: fn.merge(*x), self.data, other.data),
        start=min(self.start, other.start),
        end=max(self.end, other.end),
    )
def sum_word_vectors(v):
    return funcy.merge_with(sum, *v)
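# A small usage sketch for sum_word_vectors above; the per-document word
# counts are made-up illustration data, not from the original project.
per_doc_counts = [{"cat": 2, "dog": 1}, {"dog": 3, "fish": 1}]
# sum_word_vectors(per_doc_counts) == {"cat": 2, "dog": 4, "fish": 1}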
def deep_merge(*dicts):
    return F.merge_with(
        lambda v: deep_merge(*v) if isinstance(v[0], dict) else v[-1],
        *dicts)
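# A small usage sketch for deep_merge above (illustrative config dicts, not
# from the original project): nested dicts merge recursively and conflicting
# scalars resolve to the last dict's value.
base = {"db": {"host": "localhost", "port": 5432}, "debug": False}
override = {"db": {"port": 6432}, "debug": True}
# deep_merge(base, override)
# == {"db": {"host": "localhost", "port": 6432}, "debug": True}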
def merge_dicts():
    dict_a = {"abc": [1, 2, 3], "xyz": [4, 5]}
    dict_b = {"abc": [4, 5], "xyz": [1, 2, 3]}
    # lcat concatenates the per-key lists:
    # {'abc': [1, 2, 3, 4, 5], 'xyz': [4, 5, 1, 2, 3]}
    print(merge_with(lcat, dict_a, dict_b))
def invert_dict_multi_val(d: dict):
    """
    example: {1: 2, 3: 2} -> {2: (1, 3)}
    """
    return merge_with(tuple, *({val: key} for key, val in d.items()))
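# A small usage sketch for invert_dict_multi_val above (illustrative data):
# every original key that maps to the same value is grouped into a tuple
# under that value.
# invert_dict_multi_val({1: 2, 3: 2, 4: 5}) == {2: (1, 3), 5: (4,)}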
def summarize_evaluation(eval_dir, selection_metric="val_accuracy", ignore_worst=0):
    if not eval_dir.exists():
        print(f"No evaluation '{eval_dir}' found.")
        return

    with open(eval_dir / "config.json") as f:
        config = json.load(f)
    with open(eval_dir / "hyperparams.json") as f:
        hps = json.load(f)

    results_dir = eval_dir / "results"
    assert results_dir.exists(), f"No results found for '{eval_dir}'."
    summary_dir = eval_dir / "summary"
    if not summary_dir.exists():
        os.makedirs(summary_dir)

    result_files = [(list(fy.map(int, f[:-5].split("-"))), results_dir / f)
                    for f in os.listdir(results_dir)]
    fold_files = fy.group_by(lambda f: f[0][0], result_files)
    fold_param_files = {
        fold: fy.group_by(lambda f: f[0][1], files)
        for fold, files in fold_files.items()
    }
    folds = list(fold_param_files.items())
    folds.sort(key=fy.first)
    best_goal = selection_metrics[selection_metric]

    results = []
    all_hps = True

    for fold_i, param_files in folds:
        best_res = None
        param_file_items = list(param_files.items())
        all_hps = all_hps and len(param_files) == len(hps)

        for hp_i, files in param_file_items:
            hp_train_results = defaultdict(list)
            hp_test_results = defaultdict(list)
            selection_vals = []
            all_selection_vals = []

            for (_, _, i), file in files:
                with open(file, "r") as f:
                    result = json.load(f)

                selection_val = result["train"][selection_metric][-1]
                all_selection_vals.append(selection_val)
                if i < config["repeat"]:
                    selection_vals.append(selection_val)

                for metric, val in result["train"].items():
                    hp_train_results[metric].append(val[-1])
                for metric, val in result["test"].items():
                    hp_test_results[metric].append(val)

            top_idxs = np.argsort(np.array(all_selection_vals))
            if len(all_selection_vals) > ignore_worst:
                if best_goal == "max":
                    top_idxs = top_idxs[ignore_worst:]
                elif best_goal == "min":
                    top_idxs = top_idxs[:-ignore_worst]

            top_statistics = fy.compose(statistics, lambda l: np.array(l)[top_idxs])

            hp_res = dict(
                fold_idx=fold_i,
                train=dict_map(top_statistics, hp_train_results),
                test=dict_map(top_statistics, hp_test_results),
                select=np.mean(selection_vals),
                hp_i=hp_i,
                hp=hps[hp_i],
                select_repeats=len(selection_vals),
                eval_repeats=len(files))

            if (best_res is None
                    or (best_goal == "max" and best_res["select"] < hp_res["select"])
                    or (best_goal == "min" and best_res["select"] > hp_res["select"])
                    or (best_res["select"] == hp_res["select"]
                        and best_res["eval_repeats"] < hp_res["eval_repeats"])):
                best_res = hp_res

        if best_res is not None:
            results.append(best_res)
        else:
            print(f"No results for {fold_i}.")

    combined_train = dict_map(
        statistics,
        fy.merge_with(
            np.array,
            *map(lambda res: dict_map(lambda t: t["mean"], res["train"]), results)))
    combined_test = dict_map(
        statistics,
        fy.merge_with(
            np.array,
            *map(lambda res: dict_map(lambda t: t["mean"], res["test"]), results)))

    results_summary = {
        "folds": results,
        "combined_train": combined_train,
        "combined_test": combined_test,
        "args": {"ignore_worst": ignore_worst},
        "done": all_hps and len(folds) == 10
    }

    with open(summary_dir / "results.json", "w") as f:
        json.dump(results_summary, f, cls=NumpyEncoder, indent="\t")

    return results_summary