Example #1
def main():
    from utility_functions import get_config, get_cmd_args
    config = get_config()
    args = get_cmd_args()
    path = config['paths'][args.location][args.corpus]['path_out']
    pe = PatternExtractor(path)
    pe.extract()
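
The nested lookup config['paths'][args.location][args.corpus]['path_out'] recurs throughout these examples. Below is a minimal sketch of the config layout it assumes; the location name and the filesystem paths are illustrative placeholders, while the key names 'paths', 'path_in', 'path_out' and the corpus names 'dblp'/'sp' come from the examples themselves.

# Hypothetical excerpt of the structure returned by get_config().
# 'local' and the concrete paths are placeholders, not taken from the real config.
example_config = {
    'paths': {
        'local': {
            'dblp': {'path_in': '/data/dblp/raw/', 'path_out': '/data/dblp/out/'},
            'sp': {'path_in': '/data/sp/raw/', 'path_out': '/data/sp/out/'},
        },
    },
}
path = example_config['paths']['local']['dblp']['path_out']
print(path)  # /data/dblp/out/
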
Example #2
    def __init__(self, clusters: Dict[int, cluster_type],
                 cluster_centers: Dict[int, List[float]],
                 subcorpora: Dict[int, Set[int]], level: int) -> None:
        """Initialize a Scorer object.

        Args:
            clusters: A dict mapping each cluster-id to a cluster.
                Each cluster is a set of term-ids.
            cluster_centers: A dict mapping each cluster-id to its
                cluster-center.
            subcorpora: A dict mapping each cluster-id to the set of
                relevant doc-ids.
            level: Level in the taxonomy. Root is level 0.
        """
        self.config = get_config()
        self.pop_df_version = self.config['pop_df_version']
        self.pop_sum_version = self.config['pop_sum_version']
        self.pop_no_denominator = self.config['pop_no_denominator']
        self.pop_scores = self.config['pop_scores']
        self.con_scores = self.config['con_scores']
        self.l1_normalize = self.config['l1_normalize']
        self.kl_divergence = self.config['kl_divergence']
        self.clusters = clusters
        self.clusters_inv = self.inverse_cluster(clusters)
        self.cluster_centers = cluster_centers
        self.subcorpora = subcorpora
        self.level = level
        if self.clusters.keys() != self.subcorpora.keys():
            print('WARNING! Clusters and subcorpora do not correspond!')
            print('Cluster keys: ', self.clusters.keys())
            print('Subcorpora keys: ', self.subcorpora.keys())
            pdb.set_trace()
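
Toy data illustrating the argument shapes the Scorer constructor expects; the values below are made up for illustration.

clusters = {0: {11, 12, 13}, 1: {21, 22}}         # cluster-id -> set of term-ids
cluster_centers = {0: [0.1, 0.3], 1: [0.7, 0.2]}  # cluster-id -> cluster-center
subcorpora = {0: {101, 102}, 1: {103}}            # cluster-id -> set of doc-ids
level = 1                                         # depth in the taxonomy; root is 0
# A Scorer would then be built as: Scorer(clusters, cluster_centers, subcorpora, level)
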
Example #3
def get_paths() -> Dict[str, str]:
    """Generate paths for postprocessing."""
    config = get_config()
    args = get_cmd_args()
    pout = config['paths'][args.location][args.corpus]['path_out']
    paths = {
        'out': pout,
        'tax_csv': os.path.join(pout, 'concept_terms/tax_labels_sim.csv'),
        'tax_png': os.path.join(pout, 'concept_terms/taxonomy.png')
    }
    return paths
Example #4
def main():
    from utility_functions import get_config, get_cmd_args
    config = get_config()
    args = get_cmd_args()
    path = config['paths'][args.location][args.corpus]['path_out']
    idxer = Indexer(path, False)  # True for tg-processing
    # idxer.index_tokens()
    # idxer.index_lemmas()
    # idxer.build_token_contains()
    # idxer.build_lemma_contains()
    idxer.hierarch_rels_to_token_idx()
Example #5
def main():
    config = get_config()
    args = get_cmd_args()
    path_out = config['paths'][args.location][args.corpus]['path_out']
    path_tax = os.path.join(path_out, 'hierarchy/taxonomy.csv')
    taxonomy = load_taxonomy(path_tax)
    load_term_ids_to_embs_global(config['lemmatized'], config['emb_type'],
                                 path_out)
    for node in taxonomy:
        clus_center = get_clus_center(node, path_out)
        most_center_term = get_most_center_term(node, clus_center, path_out)
        write_to_csv(node, clus_center, most_center_term, path_out)
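
get_clus_center and get_most_center_term are not shown in this excerpt. A minimal sketch of what the latter presumably does, namely pick the term whose embedding is most cosine-similar to the cluster center; the helper below and its signature are hypothetical.

import numpy as np
from typing import Dict, Set


def most_center_term(term_ids: Set[int],
                     term_embs: Dict[int, np.ndarray],
                     clus_center: np.ndarray) -> int:
    """Return the term-id whose embedding is closest to the cluster center."""
    def cos_sim(a: np.ndarray, b: np.ndarray) -> float:
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    return max(term_ids, key=lambda tid: cos_sim(term_embs[tid], clus_center))
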
Example #6
    def load_doc_embeddings(path_out: str) -> Dict[int, np.ndarray]:
        """Compute document embeddings using term-embeddings and tfidf.

        Compute document embeddings through an average of tfidf-weighted term
        embeddings.

        The embedding for each document d_e is computed as:
        d_e = avg(tfidf(t1..tn)*emb(t1..tn))
        where t is a term in d.

        Args:
            path_out: Path to the output directory.
        Returns:
            doc_embeddings: {doc-id: embedding}
        """
        config = get_config()
        lemmatized = config['lemmatized']
        emb_type = config['embeddings']
        if not lemmatized:
            if emb_type == 'Word2Vec':
                path_doc_embs = os.path.join(
                    path_out, 'embeddings/doc_embs_token_Word2Vec.pickle')
            elif emb_type == 'GloVe':
                path_doc_embs = os.path.join(
                    path_out, 'embeddings/doc_embs_token_GloVe.pickle')
            elif emb_type == 'ELMo':
                path_doc_embs = os.path.join(
                    path_out, 'embeddings/doc_embs_token_ELMo.txt')
            else:
                raise Exception('Error! Embedding type not recognized.')
        else:
            if emb_type == 'Word2Vec':
                path_doc_embs = os.path.join(
                    path_out, 'embeddings/doc_embs_lemma_Word2Vec.pickle')
            elif emb_type == 'GloVe':
                path_doc_embs = os.path.join(
                    path_out, 'embeddings/doc_embs_lemma_GloVe.pickle')
            else:
                raise Exception('Error! Embedding type not recognized.')
        if path_doc_embs.endswith('.pickle'):
            with open(path_doc_embs, 'rb') as f:
                doc_embeddings = pickle.load(f)
        elif path_doc_embs.endswith('.txt'):
            # Hard-coded corpus size and ELMo embedding dimension.
            num_docs = 1889656
            doc_embeddings = np.empty((num_docs, 1024))
            with open(path_doc_embs, 'r', encoding='utf8') as f:
                for i, line in enumerate(f):
                    emb_str = line.split(',')
                    emb = np.array([float(x) for x in emb_str])
                    doc_embeddings[i] = emb
        else:
            raise Exception('Error! File type for embedding not known.')
        return doc_embeddings
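
A worked sketch of the formula from the docstring, d_e = avg(tfidf(t) * emb(t)) over the terms t in a document: a plain arithmetic mean of the tfidf-weighted term embeddings. The real DocEmbedder is not shown here and may weight or normalize differently.

import numpy as np
from typing import Dict, List


def doc_embedding(term_ids: List[int],
                  term_embs: Dict[int, np.ndarray],
                  tfidf: Dict[int, float]) -> np.ndarray:
    """Average of the tfidf-weighted embeddings of the terms in one document."""
    weighted = [tfidf[t] * term_embs[t] for t in term_ids]
    return np.mean(weighted, axis=0)


# Toy example with 3-dimensional embeddings.
embs = {1: np.array([1.0, 0.0, 0.0]), 2: np.array([0.0, 1.0, 0.0])}
weights = {1: 0.5, 2: 2.0}
print(doc_embedding([1, 2], embs, weights))  # [0.25 1.   0.  ]
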
Example #7
def main():
    from utility_functions import get_config, get_cmd_args  # , prep_output_dir
    config = get_config()
    args = get_cmd_args()
    path_in = config['paths'][args.location][args.corpus]['path_in']
    path_out = config['paths'][args.location][args.corpus]['path_out']
    path_lang_model = config['paths'][args.location]['path_lang_model']
    # prep_output_dir(path_out)
    max_docs = None
    if args.corpus == 'dblp':
        dp = DBLPLingPreprocessor(path_in, path_out, path_lang_model, max_docs)
        dp.preprocess_corpus()
    elif args.corpus == 'sp':
        sp = SPLingPreprocessor(path_in, path_out, path_lang_model, max_docs)
        sp.preprocess_corpus()
Example #8
def main():
    from utility_functions import get_config, get_cmd_args
    config = get_config()
    args = get_cmd_args()
    path = config['paths'][args.location][args.corpus]['path_out']
    fa = FreqAnalyzer(path)
    print('Calculate token term frequencies...')
    fa.calc_tf('t')
    print('Calculate lemma term frequencies...')
    fa.calc_tf('l')
    print('Calculate token document frequencies...')
    fa.calc_df('t')
    print('Calculate lemma document frequencies...')
    fa.calc_df('l')
    print('Calculate token tfidf-values...')
    fa.calc_tfidf('t')
    print('Calculate lemma tfidf-values...')
    fa.calc_tfidf('l')
    print('Calculate document lengths...')
    fa.calc_dl()
    print('Done')
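
FreqAnalyzer's exact weighting is not shown in these excerpts. For reference, a minimal sketch of the standard tf-idf formulation that calc_tfidf presumably approximates; this is an assumption, not taken from the source.

import math


def tfidf(tf: int, df: int, num_docs: int) -> float:
    """Standard tf-idf: term frequency times log inverse document frequency."""
    return tf * math.log(num_docs / df)


print(tfidf(tf=3, df=10, num_docs=1000))  # 3 * ln(100) ~ 13.82
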
Example #9
        if t not in embedded_terms:
            not_in_et.append(t)

    if len(not_in_et) != 0:
        msg1 = 'Error! Not all terms have embeddings. '
        msg2 = 'Num terms without embeddings: {}. '.format(len(not_in_et))
        if len(not_in_et) < 20:
            msg3 = 'Terms without embeddings: {}'.format(not_in_et)
        else:
            msg3 = ''
        raise Exception(msg1 + msg2 + msg3)


def load_terms(path_terms: str) -> Set[int]:
    terms = set()
    with open(path_terms, 'r', encoding='utf8') as f:
        for line in f:
            terms.add(int(line.strip('\n')))
    return terms


if __name__ == '__main__':
    from utility_functions import get_config, get_cmd_args
    config = get_config()
    args = get_cmd_args()
    path = config['paths'][args.location][args.corpus]['path_out']
    print('Test if all token terms have embeddings...')
    test_all_token_terms_have_embeddings(path)
    print('Test if all lemma terms have embeddings...')
    test_all_lemma_terms_have_embeddings(path)
Example #10
def generate_taxonomy() -> None:
    """Generate a taxonomy for a preprocessed corpus.

    1. Set paths.
    2. Load data.
    3. Start recursive taxonomy generation.
    """
    # Define globals.
    global idx_to_term
    global path_embeddings_global
    global path_term_distr
    global max_depth

    # Load cmd args and configs.
    print('Load and parse cmd args...')
    config = get_config()
    args = get_cmd_args()
    lemmatized = config['lemmatized']
    emb_type = config['embeddings']
    threshold = config['threshold']
    max_depth = config['max_depth']

    # Set paths.
    print('Set paths...')
    path_out = config['paths'][args.location][args.corpus]['path_out']

    if lemmatized:
        path_term_ids = os.path.join(path_out,
                                     'processed_corpus/lemma_terms_idxs.txt')
        path_idx_to_term = os.path.join(path_out, 'indexing/idx_to_lemma.json')
        path_df = os.path.join(path_out, 'frequencies/df_lemmas.json')
        # path_tf = os.path.join(path_out, 'frequencies/tf_lemmas.json')
        # path_tfidf = os.path.join(
        #     path_out, 'frequencies/tfidf_lemmas.json')
        path_term_distr = os.path.join(path_out,
                                       'frequencies/term_distr_lemmas.json')
        path_base_corpus = os.path.join(
            path_out, 'processed_corpus/pp_lemma_corpus.txt')
        path_base_corpus_ids = os.path.join(
            path_out, 'processed_corpus/lemma_idx_corpus.txt')
        if emb_type == 'GloVe' or emb_type == 'Word2Vec':
            path_embeddings_global = os.path.join(
                path_out,
                'embeddings/embs_lemma_global_{}.vec'.format(emb_type))
        else:
            path_embeddings_global = os.path.join(
                path_out,
                'embeddings/embs_lemma_global_{}.pickle'.format(emb_type))
    else:
        path_term_ids = os.path.join(path_out,
                                     'processed_corpus/token_terms_idxs.txt')
        path_idx_to_term = os.path.join(path_out, 'indexing/idx_to_token.json')
        path_df = os.path.join(path_out, 'frequencies/df_tokens.json')
        # path_tf = os.path.join(path_out, 'frequencies/tf_tokens.json')
        # path_tfidf = os.path.join(path_out, 'frequencies/tfidf_tokens.json')
        path_term_distr = os.path.join(path_out,
                                       'frequencies/term_distr_tokens.json')
        path_base_corpus = os.path.join(
            path_out, 'processed_corpus/pp_token_corpus.txt')
        path_base_corpus_ids = os.path.join(
            path_out, 'processed_corpus/token_idx_corpus.txt')
        if emb_type == 'GloVe' or emb_type == 'Word2Vec':
            path_embeddings_global = os.path.join(
                path_out,
                'embeddings/embs_token_global_{}.vec'.format(emb_type))
        else:
            path_embeddings_global = os.path.join(
                path_out,
                'embeddings/embs_token_{}_avg.pickle'.format(emb_type))

    # path_dl = os.path.join(path_out, 'frequencies/dl.json')
    path_taxonomy = os.path.join(path_out, 'hierarchy/taxonomy.csv')

    tax_file = open(path_taxonomy, 'w', encoding='utf8', newline='')
    csv_writer = csv.writer(tax_file, delimiter=',')

    # Define starting variables.
    print('Load term-ids...')
    term_ids = load_term_ids(path_term_ids)
    print('Load idx-term mappings...')
    with open(path_idx_to_term, 'r', encoding='utf8') as f:
        idx_to_term_str = json.load(f)
        idx_to_term = {int(k): v for k, v in idx_to_term_str.items()}
    print('Load global embeddings...')
    term_ids_to_embs_global = Embeddings.load_term_embeddings(
        term_ids, path_embeddings_global, idx_to_term)

    print('Load base corpus...')
    base_corpus = get_base_corpus(path_base_corpus)
    print('Load df-base...')
    with open(path_df, 'r', encoding='utf8') as f:
        # {word_id: [doc_id1, ...]}
        df_base_str = json.load(f)
        df_base = {int(k): [int(i) for i in v] for k, v in df_base_str.items()}

    print('load term distr file...')
    global term_distr_base
    with open(path_term_distr, 'rb') as f:
        term_distr_base = pickle.load(f)

    del df_base_str

    # Start recursive taxonomy generation.
    rec_find_children(
        term_ids_local=term_ids,
        term_ids_global=term_ids,
        base_corpus=base_corpus,
        path_base_corpus_ids=path_base_corpus_ids,
        cur_node_id=0,
        level=0,
        df_base=df_base,
        df=df_base,
        # cur_repr_terms=[],
        path_out=path_out,
        cur_corpus=base_corpus,
        csv_writer=csv_writer,
        threshold=threshold,
        term_ids_to_embs_global=term_ids_to_embs_global,
        emb_type=emb_type,
        max_iter=config['max_iter'])

    tax_file.close()

    print('Done.')
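
load_term_ids is not shown in this excerpt. A minimal sketch, assuming it mirrors load_terms from Example #9 and reads one integer term-id per line:

from typing import Set


def load_term_ids(path_term_ids: str) -> Set[int]:
    """Load the set of term-ids, one integer id per line."""
    term_ids = set()
    with open(path_term_ids, 'r', encoding='utf8') as f:
        for line in f:
            term_ids.add(int(line.strip('\n')))
    return term_ids
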
Example #11
def main():
    args = get_cmd_args()
    location = args.location
    corpus = args.corpus
    config = get_config()
    path_out = config['paths'][location][corpus]['path_out']
    emb_type = config['embeddings']

    if not args.skip_prep:
        prep_output_dir(path_out)

    # Copy TG files into dir-system.
    papers_to_pp_token_corpus(config, location, corpus)
    copy_keywords_to_terms(config, location, corpus)

    # Index corpus.
    if not args.skip_idxer:
        # print('Start indexing...')
        idxer = Indexer(path_out)
        # idxer.index_tokens()
        # print('Finished indexing.')
        print('Start building subtoken index...')
        idxer.build_token_contains()
        print('Finished building subtoken index.')

    # Frequency analysis.
    if not args.skip_freq_an:
        print('Start frequency analysis for tf, df and dl...')
        fa = FreqAnalyzer(path_out)
        print('Calculate token term frequencies...')
        fa.calc_tf('t')
        print('Calculate token document frequencies...')
        fa.calc_df('t')
        print('Calculate tfidf for tokens...')
        fa.calc_tfidf('t')
        print('Calculate document lengths...')
        fa.calc_dl()
        print('Finished frequency analysis.')

    if not args.skip_embeddings:
        emb_types = ['Word2Vec', 'GloVe', 'ELMo']
        for etype in emb_types:
            Embedding = get_emb(etype)
            print('Train {} token embeddings...'.format(etype))
            path_input = os.path.join(path_out,
                                      'processed_corpus/token_idx_corpus.txt')
            embs_fname = Embedding.train(
                path_input, 'embs_token_global_'+etype, path_out)
            print('{} embeddings written to: {}'.format(etype, embs_fname))

    if not args.skip_doc_embs:
        print('Calculating document embeddings...')
        doc_embedder = DocEmbedder(path_out, emb_type)
        doc_embedder.embed_token_docs()
        print('Finished document embeddings.')

    if not args.skip_word_distr:
        print('Create term distributions pickle file...')

        path_tf = os.path.join(path_out, 'frequencies/tf_tokens.json')
        path_tfidf = os.path.join(path_out, 'frequencies/tfidf_tokens.json')
        path_dl = os.path.join(path_out, 'frequencies/dl.json')
        path_term_distr = os.path.join(
            path_out, 'frequencies/term_distr_tokens.json')

        # Load frequencies.
        with open(path_tf, 'r', encoding='utf8') as f_tf:
            tf_base = json.load(f_tf)
        with open(path_tfidf, 'r', encoding='utf8') as f_tfidf:
            tfidf_base = json.load(f_tfidf)
        with open(path_dl, 'r', encoding='utf8') as f_dl:
            dl_base = json.load(f_dl)

        # Create term_distr. Note: term_distr_base is assumed to be defined at
        # module level as a defaultdict(dict):
        # {doc_id: {word_id: (tf, tfidf), -1: dl}}.
        for doc_id in tfidf_base:
            for word_id in tf_base[doc_id]:
                tf = tf_base[doc_id][word_id]
                tfidf = tfidf_base[doc_id][word_id]
                term_distr_base[int(doc_id)][int(word_id)] = (tf, tfidf)
            term_distr_base[int(doc_id)][-1] = dl_base[doc_id]

        # Dump term_distr.
        with open(path_term_distr, 'wb') as f:
            pickle.dump(term_distr_base, f)
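
The pickled structure nests (tf, tfidf) pairs under each doc-id and stores the document length under the key -1, as built above. A short sketch of reading it back; path_out is a placeholder here.

import os
import pickle

path_out = 'output/'  # placeholder for the output directory used above
path_term_distr = os.path.join(path_out, 'frequencies/term_distr_tokens.json')
with open(path_term_distr, 'rb') as f:
    term_distr = pickle.load(f)

doc_id = next(iter(term_distr))      # any document id
doc_len = term_distr[doc_id][-1]     # document length is stored under key -1
for word_id, value in term_distr[doc_id].items():
    if word_id == -1:                # skip the length entry
        continue
    tf, tfidf = value
    print(word_id, tf, tfidf)
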
Example #12
def main():
    global idx_to_term
    from utility_functions import get_config, get_cmd_args
    config = get_config()
    args = get_cmd_args()
    path_out = config['paths'][args.location][args.corpus]['path_out']

    path_idx_to_term = os.path.join(path_out, 'indexing/idx_to_token.json')
    print('Load idx-term mappings...')
    with open(path_idx_to_term, 'r', encoding='utf8') as f:
        idx_to_term_str = json.load(f)
        idx_to_term = {int(k): v for k, v in idx_to_term_str.items()}

    taxonomy = load_taxonomy(path_out)

    global ls
    ls = LabelScorer(config, args)

    # Run labeling with repr score as metric.
    print('Run labeling with repr score as metric...')
    path_tax_frep = os.path.join(path_out, 'concept_terms/tax_labels_repr.csv')
    tax_label_file = open(path_tax_frep, 'w', encoding='utf8')
    csv_writer = csv.writer(tax_label_file, delimiter=',')
    rec_find_labels(path_out, taxonomy, 10, [], 0, csv_writer, cos=False,
                    label_score=False, hypo_score=False, incl_score=False)

    # Run labeling with cosine similarity as metric.
    print('Run labeling with sim score as metric...')
    path_tax_fsim = os.path.join(path_out, 'concept_terms/tax_labels_sim.csv')
    tax_label_file = open(path_tax_fsim, 'w', encoding='utf8')
    csv_writer = csv.writer(tax_label_file, delimiter=',')
    rec_find_labels(path_out, taxonomy, 10, [], 0, csv_writer, cos=True,
                    label_score=False, hypo_score=False, incl_score=False)

    # All ls.
    print('Run labeling with label score as metric...')
    path_tax_fsim = os.path.join(path_out, 'concept_terms/tax_labels_ls.csv')
    tax_label_file = open(path_tax_fsim, 'w', encoding='utf8')
    csv_writer = csv.writer(tax_label_file, delimiter=',')
    rec_find_labels(path_out, taxonomy, 10, [], 0, csv_writer, cos=True,
                    label_score=True, hypo_score=True, incl_score=True)

    # No cos.
    print('Run labeling with label score but without cos score as metric...')
    path_tax_fsim = os.path.join(
        path_out, 'concept_terms/tax_labels_ls_no_cos.csv')
    tax_label_file = open(path_tax_fsim, 'w', encoding='utf8')
    csv_writer = csv.writer(tax_label_file, delimiter=',')
    rec_find_labels(path_out, taxonomy, 10, [], 0, csv_writer, cos=False,
                    label_score=True, hypo_score=True, incl_score=True)

    # No hypo.
    print('Run labeling with label score but without hypo score as metric...')
    path_tax_fsim = os.path.join(
        path_out, 'concept_terms/tax_labels_ls_no_hypo.csv')
    tax_label_file = open(path_tax_fsim, 'w', encoding='utf8')
    csv_writer = csv.writer(tax_label_file, delimiter=',')
    rec_find_labels(path_out, taxonomy, 10, [], 0, csv_writer, cos=True,
                    label_score=True, hypo_score=False, incl_score=True)

    # No incl.
    print('Run labeling with label score but without incl score as metric...')
    path_tax_fsim = os.path.join(
        path_out, 'concept_terms/tax_labels_ls_no_incl.csv')
    tax_label_file = open(path_tax_fsim, 'w', encoding='utf8')
    csv_writer = csv.writer(tax_label_file, delimiter=',')
    rec_find_labels(path_out, taxonomy, 10, [], 0, csv_writer, cos=True,
                    label_score=True, hypo_score=True, incl_score=False)

    # Only hypo
    print('Run labeling with label score but only hypo...')
    path_tax_fsim = os.path.join(
        path_out, 'concept_terms/tax_labels_ls_only_hypo.csv')
    tax_label_file = open(path_tax_fsim, 'w', encoding='utf8')
    csv_writer = csv.writer(tax_label_file, delimiter=',')
    rec_find_labels(path_out, taxonomy, 10, [], 0, csv_writer, cos=False,
                    label_score=True, hypo_score=True, incl_score=False)

    # Only incl
    print('Run labeling with label score but only incl score...')
    path_tax_fsim = os.path.join(
        path_out, 'concept_terms/tax_labels_ls_only_incl.csv')
    tax_label_file = open(path_tax_fsim, 'w', encoding='utf8')
    csv_writer = csv.writer(tax_label_file, delimiter=',')
    rec_find_labels(path_out, taxonomy, 10, [], 0, csv_writer, cos=False,
                    label_score=True, hypo_score=False, incl_score=True)
    print('Done')
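
The eight labeling runs above differ only in the output filename and the four boolean flags. A hedged refactor sketch that drives them from a single table and closes each output file; under that assumption it has the same behavior as the code above, just without the repetition.

    runs = [
        ('tax_labels_repr.csv', dict(cos=False, label_score=False, hypo_score=False, incl_score=False)),
        ('tax_labels_sim.csv', dict(cos=True, label_score=False, hypo_score=False, incl_score=False)),
        ('tax_labels_ls.csv', dict(cos=True, label_score=True, hypo_score=True, incl_score=True)),
        ('tax_labels_ls_no_cos.csv', dict(cos=False, label_score=True, hypo_score=True, incl_score=True)),
        ('tax_labels_ls_no_hypo.csv', dict(cos=True, label_score=True, hypo_score=False, incl_score=True)),
        ('tax_labels_ls_no_incl.csv', dict(cos=True, label_score=True, hypo_score=True, incl_score=False)),
        ('tax_labels_ls_only_hypo.csv', dict(cos=False, label_score=True, hypo_score=True, incl_score=False)),
        ('tax_labels_ls_only_incl.csv', dict(cos=False, label_score=True, hypo_score=False, incl_score=True)),
    ]
    for fname, flags in runs:
        print('Run labeling for {}...'.format(fname))
        path_labels = os.path.join(path_out, 'concept_terms', fname)
        with open(path_labels, 'w', encoding='utf8') as tax_label_file:
            csv_writer = csv.writer(tax_label_file, delimiter=',')
            rec_find_labels(path_out, taxonomy, 10, [], 0, csv_writer, **flags)
    print('Done')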