Example 1
    def _can_merge(self, graph: nx.Graph, pair: tuple[Node, Node]) -> bool:
        succ = self.tree.neighbors
        nodes = self.tree.nodes

        stack, visited = [pair], set()
        while stack:  # DFS for inconsistency in states.
            left, right = stack.pop()

            if (left, right) in visited:
                continue
            visited.add((left, right))

            if (left, right) in graph.edges:
                return False  # Reached known distinguished nodes.

            left_lbl = nodes[left].get('label')
            right_lbl = nodes[right].get('label')
            if None not in {left_lbl, right_lbl} and left_lbl != right_lbl:
                return False  # Discovered distinguishing path.

            # Group neighbors by access token.
            succ_left = {nodes[n]['source']: n for n in succ(left)}
            succ_right = {nodes[n]['source']: n for n in succ(right)}
            merged = list(fn.merge_with(set, succ_left, succ_right).values())

            # pair[0] and pair[1] are treated as interchangeable, so substitute each for the other.
            for p1, p2 in [pair, pair[::-1]]:
                merged.extend([(p | {p1}) - {p2} for p in merged if p2 in p])

            # Add un-reconciled successors to stack.
            stack.extend([p for p in merged if len(p) == 2])

        return True
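A minimal sketch of what the grouping call above produces, assuming fn is funcy: the values that share an access token are collected into a list and handed to set.

import funcy as fn

succ_left = {'a': 1, 'b': 2}   # access token -> successor of `left`
succ_right = {'a': 3, 'c': 4}  # access token -> successor of `right`

print(fn.merge_with(set, succ_left, succ_right))
# {'a': {1, 3}, 'b': {2}, 'c': {4}}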
Example 2
def _extract_data(topic_model, corpus, dictionary):
    doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]

    term_freqs_dict = fp.merge_with(sum, *corpus)

    vocab = list(dictionary.token2id.keys())
    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    beta = 0.01
    term_freqs = [term_freqs_dict.get(tid, beta) for tid in dictionary.token2id.values()]

    gamma, _ = topic_model.inference(corpus)
    doc_topic_dists = _normalize(gamma)

    topics = topic_model.show_topics(formatted=False, num_words=len(vocab), num_topics=topic_model.num_topics)
    topics_df = pd.DataFrame([dict((y, x) for x, y in tuples) for tuples in topics])[vocab]
    topic_term_dists = topics_df.values

    return {
        "topic_term_dists": topic_term_dists,
        "doc_topic_dists": doc_topic_dists,
        "doc_lengths": doc_lengths,
        "vocab": vocab,
        "term_frequency": term_freqs,
    }
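A small sketch of the fp.merge_with(sum, *corpus) step, assuming fp is funcy and corpus is a gensim-style bag of words (one list of (term_id, count) pairs per document); funcy's merge_with also accepts iterables of pairs, so the counts are summed per term id.

import funcy as fp

corpus = [[(0, 2), (1, 1)], [(0, 1), (2, 3)]]
print(fp.merge_with(sum, *corpus))
# {0: 3, 1: 1, 2: 3}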
Example 3
def _extract_data(topic_model, corpus, dictionary):
   doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]

   term_freqs_dict = fp.merge_with(sum, *corpus)

   vocab = [dictionary[id] for id in term_freqs_dict.keys()]
   term_freqs = term_freqs_dict.values()

   if str(type(topic_model)) == "<class 'gensim.models.ldamodel.LdaModel'>":
      gamma, _ = topic_model.inference(corpus)
      doc_topic_dists = _normalize(gamma)

      topic_term_dists = _normalize(topic_model.state.get_lambda())
   elif str(type(topic_model)) == "<class 'gensim.models.hdpmodel.HdpModel'>":
      gamma = topic_model.inference(corpus)
      doc_topic_dists = _normalize(gamma)

      topic_model.update_expectations()
      topic_term_dists = _normalize(topic_model.m_lambda)
   else:
      raise TypeError("Unknown gensim model")

   return {
      'topic_term_dists': topic_term_dists,
      'doc_topic_dists': doc_topic_dists,
      'doc_lengths': doc_lengths,
      'vocab': vocab,
      'term_frequency': term_freqs
   }
Example 4
def _extract_data(topic_model, corpus, dictionary):
    doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]

    term_freqs_dict = fp.merge_with(sum, *corpus)

    vocab = list(dictionary.token2id.keys())
    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    beta = 0.01
    term_freqs = [
        term_freqs_dict.get(tid, beta) for tid in dictionary.token2id.values()
    ]

    gamma, _ = topic_model.inference(corpus)
    doc_topic_dists = _normalize(gamma)

    topics = topic_model.show_topics(formatted=False,
                                     num_words=len(vocab),
                                     num_topics=topic_model.num_topics)
    topics_df = pd.DataFrame(
        [dict((y, x) for x, y in tuples) for tuples in topics])[vocab]
    topic_term_dists = topics_df.values

    return {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_freqs
    }
Example 5
def _extract_data(topic_model, corpus, dictionary):
    doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]

    term_freqs_dict = fp.merge_with(sum, *corpus)

    vocab = fp.map(dictionary, term_freqs_dict.keys())
    term_freqs = term_freqs_dict.values()

    gamma, _ = topic_model.inference(corpus)
    doc_topic_dists = np.array([r / sum(r) for r in gamma])

    topics = topic_model.show_topics(formatted=False,
                                     num_words=len(vocab),
                                     num_topics=topic_model.num_topics)
    topics_df = pd.DataFrame(
        [dict((y, x) for x, y in tuples) for tuples in topics])[vocab]
    topic_term_dists = topics_df.values

    return {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_freqs
    }
Example 6
def post_processing(mongo, batch_size=100, max_workers=50):
    indexer = Indexer(mongo)
    start_block = indexer.get_checkpoint('post_processing')

    query = {
        "block_num": {
            "$gt": start_block,
            "$lte": start_block + batch_size,
        }
    }
    projection = {
        '_id': 0,
        'body': 0,
        'json_metadata': 0,
    }
    results = list(mongo.Operations.find(query, projection=projection))
    batches = map(parse_operation, results)

    # handle an edge case when we are too close to the head,
    # and the batch contains no work to do
    if not results and is_recent(start_block, days=1):
        return

    # squash for duplicates
    def custom_merge(*args):
        return list(set(keep(flatten(args))))

    batch_items = merge_with(custom_merge, *batches)

    # only process accounts if the blocks are recent
    # scrape_all_users should take care of stale updates
    if is_recent(start_block, days=10):
        accounts = set(batch_items['accounts_light'] +
                       batch_items['accounts'])
        list(thread_multi(
            fn=update_account,
            fn_args=[mongo, None],
            dep_args=list(accounts),
            fn_kwargs=dict(load_extras=False),
            max_workers=max_workers,
            re_raise_errors=False,
        ))
        list(thread_multi(
            fn=update_account_ops_quick,
            fn_args=[mongo, None],
            dep_args=list(accounts),
            fn_kwargs=None,
            max_workers=max_workers,
            re_raise_errors=False,
        ))

    index = max(lpluck('block_num', results))
    indexer.set_checkpoint('post_processing', index)

    log.info("Checkpoint: %s - %s accounts (+%s full)" % (
        index,
        len(batch_items['accounts_light']),
        len(batch_items['accounts']),
    ))
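A sketch of the duplicate-squashing merge above with made-up batch dicts (flatten, keep and merge_with are funcy helpers): each key's value lists are flattened, falsy entries dropped, and duplicates removed.

from funcy import flatten, keep, merge_with

def custom_merge(*args):
    return list(set(keep(flatten(args))))

batch_a = {'accounts': ['alice', 'bob'], 'accounts_light': ['carol']}
batch_b = {'accounts': ['bob', None], 'accounts_light': []}

print(merge_with(custom_merge, batch_a, batch_b))
# e.g. {'accounts': ['alice', 'bob'], 'accounts_light': ['carol']} (set order varies)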
Example 7
def parse_xml_for_trial_id(res_content):
    tree = ElementTree.fromstring(res_content)
    # find the first child that is SearchResults and get all child nodes that are trials
    trial_results = tree.find('SearchResults').findall('Trial')
    # trial results is a list of child nodes
    # the trial id is found in the .items() of each child node
    try:
        trial_ids = funcy.merge_with(list, *map(lambda x: dict(x.items()), trial_results))
        # flatten list of trial ids and return
        return trial_ids['Id']
    except KeyError as e:
        print(trial_ids, e)
        return None
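A tiny illustration of the attribute-collecting merge, using hypothetical attribute dicts in place of the parsed <Trial> elements: funcy.merge_with(list, ...) gathers every value seen for each attribute name.

import funcy

trial_attrs = [{'Id': 'NCT001', 'Status': 'Open'},
               {'Id': 'NCT002', 'Status': 'Closed'}]
merged = funcy.merge_with(list, *trial_attrs)
print(merged['Id'])  # ['NCT001', 'NCT002']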
Example 8
def _extract_data(topic_model, docs):
   doc_lengths = [sum(d.values()) for d in docs]

   term_freqs_dict = fp.merge_with(sum, *docs)
   vocab = list(term_freqs_dict.keys())
   term_freqs = list(term_freqs_dict.values())

   doc_topic_dists = np.vstack(topic_model.predict(docs, output_type='probabilities'))

   topics = _topics_as_df(topic_model)
   topic_term_dists = topics.T[vocab].values

   return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists,
           'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}
Example 9
def _extract_data(topic_model, docs):
   doc_lengths = [sum(d.values()) for d in docs]

   term_freqs_dict = fp.merge_with(sum, *docs)
   vocab = list(term_freqs_dict.keys())
   term_freqs = list(term_freqs_dict.values())

   doc_topic_dists = np.vstack(topic_model.predict(docs, output_type='probabilities'))

   topics = _topics_as_df(topic_model)
   topic_term_dists = topics.T[vocab].values

   return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists,
           'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}
Example 10
def _extract_data(topic_model, corpus, dictionary):
   doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]

   term_freqs_dict = fp.merge_with(sum, *corpus)

   vocab = [dictionary[id] for id in term_freqs_dict.keys()]
   term_freqs = term_freqs_dict.values()

   gamma, _ = topic_model.inference(corpus)
   doc_topic_dists = _normalize(gamma)

   topic_term_dists = _normalize(topic_model.state.get_lambda())

   return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists,
           'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}
Example 11
def get_vlan_usersP(bras):
    def _get_vlan_users(bas):
        funcs = {'m6k': M6k.get_vlan_users,
                 'me60': ME60.get_vlan_users}
        _gvu = partial(_model, funcs)
        return _gvu(bas)

    bras = [dict(ip=x[0], model=x[1], inf=x[2])
            for x in bras]
    pool = Pool(len(bras))
    temp = pool.map(_get_vlan_users, bras)
    pool.close()
    pool.join()
    temp = [x[1] for x in temp if x[1]]
    rslt = reduce(lambda x, y: merge_with(sum, x, y), temp)
    return rslt
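A sketch of the final reduction with made-up per-BRAS dicts: merge_with(sum, x, y) adds the per-VLAN user counts key by key, and reduce folds that over the whole list.

from functools import reduce
from funcy import merge_with

per_bras = [{'vlan100': 12, 'vlan200': 3}, {'vlan100': 5}, {'vlan300': 7}]
print(reduce(lambda x, y: merge_with(sum, x, y), per_bras))
# {'vlan100': 17, 'vlan200': 3, 'vlan300': 7}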
Example 12
def get_wordcloud(platform, course_code, username=None, start_date=None, end_date=None):
    #print "get_wordcloud", platform, course_code
    docs = None
    ids = None
    documents = None
    if username is not None:
        docs,ids = get_allcontent_byplatform(platform, course_code, username=username, start_date=start_date, end_date=end_date)
    else:
        docs,ids = get_allcontent_byplatform(platform, course_code, start_date=start_date, end_date=end_date)

    documents = remove_stopwords(docs)
    #print documents
    # Make dictionary
    dictionary = corpora.Dictionary(documents)

    #Create and save corpus
    corpus = [dictionary.doc2bow(text) for text in documents]

    #Calculate Term Frequencies
    term_freqs_dict = fp.merge_with(sum, *corpus)
    N = len(term_freqs_dict)

    vocab = [dictionary[id] for id in range(N)]
    freqs = [term_freqs_dict[id] for id in range(N)]

    term_freqs = zip(vocab,freqs)
    word_tags = []

    for term_freq_pair in term_freqs:
        #print "term_freq_pair", term_freq_pair
        if ((not term_freq_pair[0].startswith('http')) or (term_freq_pair[0]=='-')):
            weight = 0
            if type(term_freq_pair[1]) is tuple:
                weight = int(term_freq_pair[1][1])
            else:
                weight = int(term_freq_pair[1])

            if (weight > 3):
                #print weight
                word_tags.append('{"text": "%s", "weight": %d},' % (term_freq_pair[0], weight))
                #word_tags.append('["%s", %d],' % (term_freq_pair[0], weight))
            #word_tags.append('<li class="tag%d"><a href="#">%s</a></li>' % (term_freq_pair[1], term_freq_pair[0]))
    tags = "[" + ''.join(word_tags)[:-1] + "]"
    #print tags
    return tags
Example 13
def personality_df(path=os.getcwd()):
    files = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(path):
        for file in f:
            if 'profile.json' in file:
                files.append(os.path.join(r, file))

    for num, f in enumerate(files):
        files[num] = json_to_dict(f)

    header = list(files[0].keys())

    multi_dict = merge_with(list, *files)

    df = pd.DataFrame(multi_dict, columns=header)

    return df
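A sketch of the column-building merge with made-up profile dicts standing in for the loaded profile.json files: merge_with(list, *profiles) turns a list of records into a column-oriented dict that pandas can consume directly.

import pandas as pd
from funcy import merge_with

profiles = [{'name': 'a', 'score': 1}, {'name': 'b', 'score': 2}]
multi_dict = merge_with(list, *profiles)
# {'name': ['a', 'b'], 'score': [1, 2]}
print(pd.DataFrame(multi_dict, columns=list(profiles[0].keys())))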
Example 14
def _extract_data(topic_model, corpus, dictionary):
    doc_lengths = [sum([t[1] for t in doc]) for doc in corpus]

    term_freqs_dict = fp.merge_with(sum, *corpus)
    N = len(term_freqs_dict)

    vocab = [dictionary[id] for id in range(N)]
    term_freqs = [term_freqs_dict[id] for id in range(N)]

    gamma, _ = topic_model.inference(corpus)
    doc_topic_dists = _normalize(gamma)

    topic_term_dists = _normalize(topic_model.state.get_lambda())

    return {
        'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_freqs
    }
Example 15
 def apply_window(time_val):
     t, _ = time_val
     values = self[start + t:end + t].values()
     # Note: the extra {} forces merge_with to apply tuple even when there is only one dict.
     values = fn.merge_with(tuple, {}, *values)
     return (t, values)
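The empty dict matters because funcy's merge_with returns a single mapping unchanged without calling the combiner; passing {} first guarantees that tuple is applied even when the window holds only one value dict. A minimal check, assuming fn is funcy:

import funcy as fn

one = {'x': 1}
print(fn.merge_with(tuple, one))      # {'x': 1}   -- returned as-is
print(fn.merge_with(tuple, {}, one))  # {'x': (1,)}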
Example 16
 def __or__(self, other):
     return self.evolve(
         data=fn.merge_with(lambda x: fn.merge(*x), self.data, other.data),
         start=min(self.start, other.start),
         end=max(self.end, other.end),
     )
Example 17
def sum_word_vectors(v):
    return funcy.merge_with(sum, *v)
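A hypothetical usage of sum_word_vectors above, summing per-document word-count dicts (assumes funcy is imported as in the snippet):

docs = [{'cat': 2, 'dog': 1}, {'dog': 4}]
print(sum_word_vectors(docs))  # {'cat': 2, 'dog': 5}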
Example 18
def deep_merge(*dicts):
    return F.merge_with(
        lambda v: deep_merge(*v) if isinstance(v[0], dict) else v[-1], *dicts)
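A hypothetical usage of deep_merge above (F is assumed to be funcy): nested dicts are merged recursively, and for non-dict leaves the last value wins.

a = {'db': {'host': 'localhost', 'port': 5432}, 'debug': False}
b = {'db': {'port': 6543}, 'debug': True}
print(deep_merge(a, b))
# {'db': {'host': 'localhost', 'port': 6543}, 'debug': True}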
Example 19
def merge_dicts():

    dict_a = {"abc": [1, 2, 3], "xyz": [4, 5]}
    dict_b = {"abc": [4, 5], "xyz": [1, 2, 3]}

    print(merge_with(lcat, dict_a, dict_b))
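For reference, the expected output of the call above (funcy's lcat concatenates the lists collected for each key):

# {'abc': [1, 2, 3, 4, 5], 'xyz': [4, 5, 1, 2, 3]}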
Example 20
def invert_dict_multi_val(d: dict):
    """
    example: {1: 2, 3: 2} -> {2: (1, 3)}
    """
    return merge_with(tuple, *({val: key} for key, val in d.items()))
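A hypothetical usage of invert_dict_multi_val above; note that funcy's merge_with returns a single dict unchanged, so a one-entry input is not wrapped in a tuple.

print(invert_dict_multi_val({1: 2, 3: 2, 4: 5}))  # {2: (1, 3), 5: (4,)}
print(invert_dict_multi_val({1: 2}))              # {2: 1}, not {2: (1,)}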
Example 21
def summarize_evaluation(eval_dir,
                         selection_metric="val_accuracy",
                         ignore_worst=0):
    if not eval_dir.exists():
        print(f"No evaluation '{eval_dir}' found.")
        return

    with open(eval_dir / "config.json") as f:
        config = json.load(f)

    with open(eval_dir / "hyperparams.json") as f:
        hps = json.load(f)

    results_dir = eval_dir / "results"
    assert results_dir.exists(), f"No results found for '{eval_dir}'."
    summary_dir = eval_dir / "summary"

    if not summary_dir.exists():
        os.makedirs(summary_dir)

    result_files = [(list(fy.map(int, f[:-5].split("-"))), results_dir / f)
                    for f in os.listdir(results_dir)]

    fold_files = fy.group_by(lambda f: f[0][0], result_files)
    fold_param_files = {
        fold: fy.group_by(lambda f: f[0][1], files)
        for fold, files in fold_files.items()
    }
    folds = list(fold_param_files.items())
    folds.sort(key=fy.first)

    best_goal = selection_metrics[selection_metric]

    results = []
    all_hps = True

    for fold_i, param_files in folds:
        best_res = None
        param_file_items = list(param_files.items())

        all_hps = all_hps and len(param_files) == len(hps)

        for hp_i, files in param_file_items:
            hp_train_results = defaultdict(list)
            hp_test_results = defaultdict(list)
            selection_vals = []
            all_selection_vals = []
            for (_, _, i), file in files:
                with open(file, "r") as f:
                    result = json.load(f)

                selection_val = result["train"][selection_metric][-1]
                all_selection_vals.append(selection_val)
                if i < config["repeat"]:
                    selection_vals.append(selection_val)

                for metric, val in result["train"].items():
                    hp_train_results[metric].append(val[-1])
                for metric, val in result["test"].items():
                    hp_test_results[metric].append(val)

            top_idxs = np.argsort(np.array(all_selection_vals))

            # Only drop the worst runs when there is something to drop;
            # with ignore_worst == 0 a [:-0] slice would discard everything.
            if ignore_worst > 0 and len(all_selection_vals) > ignore_worst:
                if best_goal == "max":
                    top_idxs = top_idxs[ignore_worst:]
                elif best_goal == "min":
                    top_idxs = top_idxs[:-ignore_worst]

            top_statistics = fy.compose(statistics,
                                        lambda l: np.array(l)[top_idxs])

            hp_res = dict(fold_idx=fold_i,
                          train=dict_map(top_statistics, hp_train_results),
                          test=dict_map(top_statistics, hp_test_results),
                          select=np.mean(selection_vals),
                          hp_i=hp_i,
                          hp=hps[hp_i],
                          select_repeats=len(selection_vals),
                          eval_repeats=len(files))

            if (best_res is None or
                (best_goal == "max" and best_res["select"] < hp_res["select"])
                    or
                (best_goal == "min" and best_res["select"] > hp_res["select"])
                    or
                (best_res["select"] == hp_res["select"]
                 and best_res["eval_repeats"] < hp_res["eval_repeats"])):
                best_res = hp_res

        if best_res is not None:
            results.append(best_res)
        else:
            print(f"No results for {fold_i}.")

    combined_train = dict_map(
        statistics,
        fy.merge_with(
            np.array,
            *map(lambda res: dict_map(lambda t: t["mean"], res["train"]),
                 results)))
    combined_test = dict_map(
        statistics,
        fy.merge_with(
            np.array,
            *map(lambda res: dict_map(lambda t: t["mean"], res["test"]),
                 results)))

    results_summary = {
        "folds": results,
        "combined_train": combined_train,
        "combined_test": combined_test,
        "args": {
            "ignore_worst": ignore_worst
        },
        "done": all_hps and len(folds) == 10
    }

    with open(summary_dir / "results.json", "w") as f:
        json.dump(results_summary, f, cls=NumpyEncoder, indent="\t")

    return results_summary
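A sketch of the combining step above with made-up per-fold means (fy is funcy): each metric's fold values are collected and handed to np.array, so the list of per-fold dicts becomes one dict of arrays ready for statistics.

import numpy as np
import funcy as fy

fold_means = [{'accuracy': 0.91, 'loss': 0.30},
              {'accuracy': 0.88, 'loss': 0.35}]
print(fy.merge_with(np.array, *fold_means))
# {'accuracy': array([0.91, 0.88]), 'loss': array([0.3 , 0.35])}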