    def text_rank_pke(cls, corpus: Corpus):
        # define the set of valid Part-of-Speeches
        pos = {'NOUN', 'PROPN', 'ADJ'}
        # 1. create a TextRank extractor.
        extractor = pke.unsupervised.TextRank()

        if corpus.language == Language.DE:
            lan = "de"
        else:
            lan = "en"

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating TextRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            # 3. build the graph representation of the document and rank the words.
            #    Keyphrase candidates are composed from the 33-percent
            #    highest-ranked words.
            extractor.candidate_weighting(window=2, pos=pos, top_percent=0.33)

            # 4. get the 10-highest scored candidates as keyphrases
            # keyphrases = extractor.get_n_best(n=top_k)

            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)

        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.TEXT_RANK_PKE)
Example #2
def main():

    # load trained model
    print('Loading neural machine translator with attention:')
    train_set = Corpus(TRAIN_SRC, TRAIN_TGT)
    dev_set = Corpus(DEV_SRC, DEV_TGT)
    m = load_model(train_set, dev_set, MODEL)
    print('Model loaded!')

    # dev_set.target_sentences = dev_set.target_sentences[100:200]
    # dev_set.source_sentences = dev_set.source_sentences[100:200]

    # translate sentence
    print('\nTranslating . . .\n')
    sample_output = np.random.choice(len(dev_set.target_sentences), 5, False)
    greedy, beam = gen_all(m, dev_set.source_sentences, BSIZE)
    for sample in sample_output:
        print('Target: {}'.format(' '.join(dev_set.target_sentences[sample])))
        print('Greedy: {}'.format(' '.join(greedy[sample])))
        print('Beam search: {}'.format(' '.join(beam[sample])))
        print('----------')

    greedy_score = get_bleu_score(greedy, dev_set.target_sentences)
    beam_score = get_bleu_score(beam, dev_set.target_sentences)

    print('Greedy bleu score: ', greedy_score)
    print('Beam search bleu score: ', beam_score)
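The helpers get_bleu_score and gen_all are not shown in this listing. As orientation only, a minimal get_bleu_score could be built on NLTK's corpus_bleu; the function name and the tokenized inputs are assumptions based on how it is called above, not the project's actual implementation.

from nltk.translate.bleu_score import corpus_bleu

def get_bleu_score(hypotheses, references):
    # both arguments are lists of token lists; corpus_bleu expects one list of
    # reference translations per hypothesis
    return corpus_bleu([[ref] for ref in references], hypotheses)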
    def tfidf_skl(cls, corpus: Corpus):
        if corpus.language == Language.EN:
            stop_words = stopwords.words("english")
        elif corpus.language == Language.DE:
            stop_words = stopwords.words("german")
        else:
            raise UserWarning("No stopwords for language!")

        tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
                                           ngram_range=(cls.min_ngram,
                                                        cls.max_ngram),
                                           min_df=2)
        tfidf_matrix = tfidf_vectorizer.fit_transform(
            [document.text for document in corpus.get_documents(as_list=True)])
        doc_id_lookup = {
            i: document.doc_id
            for i, document in enumerate(corpus.get_documents(as_list=True))
        }

        features = tfidf_vectorizer.get_feature_names()

        keywords = {}
        for i, doc in tqdm(enumerate(tfidf_matrix),
                           desc="Calculating tf-idf",
                           total=tfidf_matrix.shape[0]):
            df = pd.DataFrame(doc.T.todense(),
                              index=features,
                              columns=["tfidf"])
            top_key_words = df.sort_values(by=["tfidf"],
                                           ascending=False)[:cls.top_k]
            keywords[doc_id_lookup[i]] = list(top_key_words.index)

        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.TFIDF_SKL)
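Densifying each tf-idf row into a DataFrame, as above, is simple but memory-heavy for large vocabularies. A hedged alternative sketch that reads the top-k terms directly from each sparse CSR row; the names row, features, and k mirror the variables used above.

import numpy as np

def top_k_terms(row, features, k):
    # row is a 1 x |vocabulary| CSR row taken from the tf-idf matrix
    top = row.indices[np.argsort(-row.data)[:k]]
    return [features[i] for i in top]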
def yearwise_documents(corpus: Corpus,
                       aggregation_func: Callable = len,
                       printing: bool = False,
                       as_dict: bool = False):
    year_bins = defaultdict(list)

    for doc in corpus.get_documents():
        year_bins[doc.date].append(doc)

    result = {
        year: aggregation_func(
            Corpus(source=docs,
                   language=corpus.language,
                   name=f'{corpus.name}_yearwise'))
        for year, docs in year_bins.items() if year is not None
    }
    result = OrderedDict(sorted(result.items()))

    if as_dict:
        return result

    years = []
    counts = []
    for year, count in result.items():
        years.append(year)
        counts.append(count)
        if printing:
            print(f'{year}: {count}')

    # print(years)
    # print(counts)
    return years, counts
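Hypothetical usage of yearwise_documents: with the default aggregation_func=len it yields document counts per year, and any callable that accepts a Corpus works, for example a token count (the document .text attribute is assumed from the other snippets here).

years, doc_counts = yearwise_documents(corpus, printing=True)
tokens_per_year = yearwise_documents(
    corpus,
    aggregation_func=lambda c: sum(len(d.text.split()) for d in c.get_documents()),
    as_dict=True)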
def parse_and_preprocess_src(data_source, corpus_destination, preprocess=True):
    if re.search("bundestag", data_source.lower()):
        name = "bundestag"
        raw_corpus = DataHandler.get_bundestag_speeches(directory=data_source)
    elif re.search("sustainability", data_source.lower()):
        name = "sustainability"
        raw_corpus = DataHandler.get_sustainability_data(path=data_source)
    elif re.search("unv1.0-tei", data_source.lower()):
        name = "united_nations"
        raw_corpus = DataHandler.get_un_texts(directory=data_source)
    elif re.search("state_of_the_union", data_source.lower()):
        name = "state_of_the_union"
        raw_corpus = DataHandler.get_state_of_the_union(directory=data_source)
    else:
        name = "abstracts"
        raw_corpus = DataHandler.get_abstracts(path=data_source)

    language = raw_corpus[0].language
    print('loaded', len(raw_corpus), 'documents')
    if preprocess:
        Preprocessor.preprocess(raw_corpus, language=language)
        print('preprocessed', len(raw_corpus), 'documents')
    corpus = Corpus(source=raw_corpus, language=language, name=name)
    print('parsed', len(corpus.get_documents(as_list=True)),
          'documents to a Corpus')
    corpus.save_corpus(corpus_destination)
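Hypothetical call (both paths are placeholders): the source path is matched against the regexes above, so a directory name containing "bundestag" is routed to DataHandler.get_bundestag_speeches before preprocessing and saving.

parse_and_preprocess_src("data/bundestag_speeches",
                         "data/corpora/bundestag_corpus.json")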
    def single_rank_pke(cls, corpus: Corpus):
        # define the set of valid Part-of-Speeches
        pos = {'NOUN', 'PROPN', 'ADJ'}
        # 1. create a SingleRank extractor.
        extractor = pke.unsupervised.SingleRank()

        if corpus.language == Language.DE:
            lan = "de"
        else:
            lan = "en"

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating SingleRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            # 3. select the longest sequences of nouns and adjectives as candidates.
            extractor.candidate_selection(pos=pos)

            # 4. weight the candidates using the sum of their words' scores that are
            #    computed using random walk. In the graph, nodes are words of
            #    certain part-of-speech (nouns and adjectives) that are connected if
            #    they occur in a window of 10 words.
            extractor.candidate_weighting(window=10, pos=pos)

            # 5. get the 10-highest scored candidates as keyphrases
            # keyphrases = extractor.get_n_best(n=top_k)
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases}, keyword_type=KeywordType.SINGLE_RANK_PKE)
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.SINGLE_RANK_PKE)
Example #7
def main(train_src_file, train_tgt_file, dev_src_file, dev_tgt_file,
         model_file, num_epochs, embeddings_init=None, seed=0):
    print('reading train corpus ...')
    train_set = Corpus(train_src_file, train_tgt_file)
    # assert()
    print('reading dev corpus ...')
    dev_set = Corpus(dev_src_file, dev_tgt_file)

    # test_set = Corpus(test_src_file)

    print('Initializing neural machine translator with attention:')
    # src_vocab_size, tgt_vocab_size, tgt_idx2word, word_d, gru_d, gru_layers
    encoder_decoder = nmt_dynet_attention(
        len(train_set.source_word2idx), len(train_set.target_word2idx),
        train_set.source_word2idx, train_set.source_idx2word,
        train_set.target_word2idx, train_set.target_idx2word, 50, 50, 2)

    trainer = SimpleSGDTrainer(encoder_decoder.model)

    sample_output = np.random.choice(len(dev_set.target_sentences), 5, False)
    losses = []
    best_bleu_score = 0
    for epoch in range(num_epochs):
        print('Starting epoch', epoch)
        # shuffle the training data
        combined = list(zip(train_set.source_sentences, train_set.target_sentences))
        random.shuffle(combined)
        train_set.source_sentences[:], train_set.target_sentences[:] = zip(*combined)

        print('Training . . .')
        sentences_processed = 0
        for src_sentence, tgt_sentence in zip(train_set.source_sentences, train_set.target_sentences):
            loss = encoder_decoder.get_loss(src_sentence, tgt_sentence)
            loss_value = loss.value()
            loss.backward()
            trainer.update()
            sentences_processed += 1
            if sentences_processed % 4000 == 0:
                print('sentences processed: ', sentences_processed)

        # Accumulate average losses over training to plot
        val_loss = get_val_set_loss(encoder_decoder, dev_set)
        print('Validation loss this epoch', val_loss)
        losses.append(val_loss)

        print('Translating . . .')
        translated_sentences = encoder_decoder.translate_all(dev_set.source_sentences)

        print('translating {} source sentences...'.format(len(sample_output)))
        for sample in sample_output:
            print('Target: {}\nTranslation: {}\n'.format(
                ' '.join(dev_set.target_sentences[sample]),
                ' '.join(translated_sentences[sample])))

        bleu_score = get_bleu_score(translated_sentences, dev_set.target_sentences)
        print('bleu score: ', bleu_score)
        if bleu_score > best_bleu_score:
            best_bleu_score = bleu_score
            # save the model
            encoder_decoder.save(model_file)

    print('best bleu score: ', best_bleu_score)
def cleaning_authors(config, overwrite=False):
    corpus_names = [
        "bundestag_corpus",
        # "sustainability_corpus",
        # "abstract_corpus"
    ]
    languages = [Language.DE, Language.EN, Language.EN]
    wlc = 0
    m_a = 0
    s_a = 0
    for i, corpus_name in enumerate(corpus_names):
        corpus = Corpus(source=config["corpora"][corpus_name],
                        language=languages[i],
                        name=corpus_name)
        # corpus = DataHandler.load_corpus(config["corpora"][corpus_name])
        for d in corpus.get_documents():
            if d.author:
                if isinstance(d.author, float) and np.isnan(d.author):
                    d.author = None
                else:
                    if corpus_name == "bundestag_corpus":
                        authors = [d.author]
                    elif corpus_name == "sustainability_corpus":
                        if isinstance(d.author, str):
                            authors = [a.strip() for a in d.author.split(',')]
                            authors = [
                                f'{j}. {i}'
                                for i, j in zip(authors[::2], authors[1::2])
                            ]
                        else:
                            authors = d.author
                    else:
                        if d.language != "English":
                            wlc += 1
                            continue
                        if isinstance(d.author, str):
                            authors = [a.strip() for a in d.author.split(',')]
                            authors = [
                                f'{j}. {i}'
                                for i, j in zip(authors[::2], authors[1::2])
                            ]
                        else:
                            authors = d.author
                        if len(authors) > 1:
                            m_a += 1
                            print(d.author, authors)
                        else:
                            s_a += 1
                    d.author = authors

        if not overwrite:
            os.rename(src=config["corpora"][corpus_name],
                      dst=create_new_filepath_uncleaned(
                          config["corpora"][corpus_name]))

        corpus.save_corpus(config["corpora"][corpus_name])
    print(wlc, m_a, s_a)
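Worked example of the author re-pairing used above, runnable on its own: a raw field like "Miller, A, Smith, B" (an illustrative value, not taken from the corpora) splits into ["Miller", "A", "Smith", "B"], and zipping the even and odd positions rebuilds initial-plus-surname strings.

raw = "Miller, A, Smith, B"
parts = [a.strip() for a in raw.split(',')]
print([f'{j}. {i}' for i, j in zip(parts[::2], parts[1::2])])
# -> ['A. Miller', 'B. Smith']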
def count_non_years(corpus: Corpus):
    without_year = [d for d in corpus.get_documents() if d.date is None]
    print(
        len([
            d.date for d in corpus.get_documents()
            if d.date and len(str(d.date)) != 4
        ]))
    with_year = [d for d in corpus.get_documents() if d.date]
    print(f'{len(without_year)} / {len(with_year)}')
def build_word2vec():

    print("Tokens: ", Corpus(CORPUS_FILE).count_tokens())

    w2v = Word2Vec(Corpus(CORPUS_FILE),
                   size=100,
                   window=5,
                   min_count=1,
                   workers=4)
    w2v.save(MODEL_FILE)
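Once MODEL_FILE has been written, the trained embeddings can be queried again via gensim; a hedged usage sketch, where the query word is only illustrative.

from gensim.models import Word2Vec

w2v = Word2Vec.load(MODEL_FILE)
print(w2v.wv.most_similar("climate", topn=5))  # "climate" is a placeholder query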
Example #11
def main(args):
    # Set the random seed manually for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
    else:
        print("Note that our pre-trained models require CUDA to evaluate.")

    ###########################################################################
    # Load the models
    ###########################################################################

    ae_args, gan_args, idx2word, autoencoder, gan_gen, gan_disc \
        = load_models(args.ae_args, args.gan_args, args.vocab_file,
                      args.ae_model, args.g_model, args.d_model)

    ###########################################################################
    # Generation code
    ###########################################################################

    # Generate sentences
    corpus = Corpus(args.data_path, args.dict_file, vocab_size=len(idx2word))

    source, _ = next(BatchGen(corpus.get_chunks(size=2), args.ngenerations))
    prev_sent = [
        decode_idx(corpus.dictionary, sent) for sent in source.tolist()
    ]
    source = Variable(source, volatile=True)
    sentences = generate(autoencoder,
                         gan_gen,
                         inp=source,
                         vocab=idx2word,
                         sample=args.sample,
                         maxlen=args.maxlen)

    if not args.noprint:
        print("\nSentence generations:\n")
        for prev, sent in zip(prev_sent, sentences):
            print(prev)
            print("    ", sent)
            print("")
    with open(args.outf, "w") as f:
        f.write("Sentence generations:\n\n")
        for prev, sent in zip(prev_sent, sentences):
            f.write(prev + '\n')
            f.write("-> " + sent + '\n\n')
Example #12
def main():
    # prepare corpus
    corpus = Corpus(args.data_file, args.dict_file, vocab_size=args.vocab_size)

    # dumping vocabulary
    with open(os.path.join(out_dir, 'vocab.json'), 'w') as f:
        json.dump(corpus.dictionary.word2idx, f)

    # save arguments
    ntokens = len(corpus.dictionary.word2idx)
    args.ntokens = ntokens
    with open(os.path.join(out_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    log.info('[Data Loaded.]')

    autoencoder = AutoEncoder()

    if args.split:
        train, valid = corpus.get_data(split=args.split)
        valid = batchify(valid, args.batch_size, shuffle=False)
    else:
        train = corpus.get_data()

    for epoch in range(1, args.epochs + 1):
        # shuffle train data in each epoch
        batches = batchify(train, args.batch_size, shuffle=True)

        global_iters = 0
        start_time = datetime.now()

        for i, batch in enumerate(batches):
            loss = autoencoder.update(batch)
            if i % args.log_interval == 0 and i > 0:
                log.info(('[Epoch {} {}/{} Loss {:.5f} ETA {}]').format(
                    epoch, i, len(batches), loss,
                    str((datetime.now() - start_time) / (i + 1) *
                        (len(batches) - i - 1)).split('.')[0]))

            global_iters += 1
            if global_iters % 100 == 0:
                autoencoder.anneal()

        if args.split:
            word_acc, sent_acc = autoencoder.evaluate(valid)
            msg = 'Epoch {} word acc: {} | sent acc: {}'.format(
                epoch, word_acc, sent_acc)
            log.warn(msg)
        autoencoder.save(out_dir, 'autoencoder_model_{}.pt'.format(epoch))
def eval_acc(net, device, b_size=10):
    corpus = Corpus('mr', test=True)
    sum_all = 0
    sum_count = 0
    A_E = 1
    for e in range(A_E):
        i = 0
        b = 0
        A_B = int(len(corpus.xs) // b_size)

        while i >= 0:
            x, y = batchify(corpus, i, b_size)
            if x is None:
                i = 1
                break
            x, y = x.to(device), y.to(device)
            out = net(x)
            sum_all += (1 - (y ^ torch.argmax(out, dim=-1))).sum().to(
                torch.device('cpu'))
            sum_count += y.shape[0]
            i += b_size
            b += 1
            # if (b % 50 == 0):
            #     print('E: {}/{} | B: {}/{}'.format(e, A_E, b, A_B))
    return float(sum_all) / float(sum_count)
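The batchify helper is defined elsewhere in that project; a plausible minimal version for the way it is called above, assuming (as an illustration only) that corpus.xs holds pre-encoded index tensors and corpus.ys the integer labels.

import torch
from torch.nn.utils.rnn import pad_sequence

def batchify(corpus, start, b_size):
    # returns (None, None) once the corpus is exhausted, mirroring the check above
    if start >= len(corpus.xs):
        return None, None
    xs = corpus.xs[start:start + b_size]
    ys = corpus.ys[start:start + b_size]
    x = pad_sequence(xs, batch_first=True)  # pad to the longest sequence in the batch
    y = torch.tensor(ys, dtype=torch.long)
    return x, y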
def cleaning_punctuation(config, overwrite=False):
    corpus_names = [
        "bundestag_corpus", "sustainability_corpus", "abstract_corpus"
    ]
    languages = [Language.DE, Language.EN, Language.EN]
    for i, corpus_name in enumerate(corpus_names):
        corpus = Corpus(source=config["corpora"][corpus_name],
                        language=languages[i],
                        name=corpus_name)
        remove_punctuation(corpus)

        if not overwrite:
            os.rename(src=config["corpora"][corpus_name],
                      dst=create_new_filepath_uncleaned(
                          config["corpora"][corpus_name]))

        corpus.save_corpus(config["corpora"][corpus_name])
def evaluate(
    model_path,
    corpus_path,
    pairs_path,
    batch_size=100,
):

    model = torch.load(model_path)
    model = model.cuda()
    model.eval()

    corpus = Corpus([tuple([corpus_path, os.path.dirname(corpus_path)])])
    pairs_batch_loader = FileLoader(
        [tuple([pairs_path, os.path.dirname(pairs_path)])], batch_size)

    code = []
    nl = []

    for data in tqdm.tqdm(pairs_batch_loader):
        data = list(map(corpus.get, data))
        batch = (make_batch(model.embedding_layer, data[0][0]),
                 make_batch(model.embedding_layer, data[1][0]))
        batch = [x.cuda() for x in batch]
        batch = (Variable(batch[0],
                          volatile=True), Variable(batch[1], volatile=True))

        # embed code and NL
        repr_left = model(batch[0])
        repr_right = model(batch[1])
        # accumulate for evaluation
        code.extend(repr_left.cpu().data.numpy())
        nl.extend(repr_right.cpu().data.numpy())

    code = np.array(code)
    nl = np.array(nl)

    sim_mat = cosine_similarity(nl, code)
    ans_locs = location_of_correct(sim_mat)

    summary = {}
    mr = np.mean(ans_locs)
    mrr = get_mrr(ans_locs)
    summary["mrr"] = mrr

    cutoffs = [1, 5, 10]
    fracs = []

    for c in cutoffs:
        frac = get_fraction_correct_at(ans_locs, c)
        fracs.append(frac)
    print("Num obs: {}".format(code.shape[0]))
    print("Mean Rank: {}".format(mr))
    print("MRR: {}".format(mrr))

    for c, f in zip(cutoffs, fracs):
        print("Fraction Correct@{}: {}".format(c, f))
        summary["success@{}".format(c)] = f
    return summary
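location_of_correct, get_mrr, and get_fraction_correct_at are helpers not shown here. A minimal sketch of how they could work on the NL-vs-code cosine-similarity matrix, assuming the matching code snippet for query i sits in column i:

import numpy as np

def location_of_correct(sim_mat):
    # 0-based rank of the diagonal entry within each row, by descending similarity
    order = np.argsort(-sim_mat, axis=1)
    return np.array([int(np.where(order[i] == i)[0][0])
                     for i in range(sim_mat.shape[0])])

def get_mrr(ranks):
    return float(np.mean(1.0 / (ranks + 1)))

def get_fraction_correct_at(ranks, cutoff):
    return float(np.mean(ranks < cutoff))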
Example #16
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-a',
                        '--algorithm',
                        help='Algorithm to use like rake or tfidf',
                        default="rake")
    parser.add_argument('-c',
                        '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['state_of_the_union'])
    parser.add_argument('-t',
                        '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    # TODO: remove this hard-coded list and use the actual args['corpora']
    chosen_corpora = [
        # 'state_of_the_union',
        'bundestag',
        'abstract',
        'sustainability'
    ]  # args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["bundestag_corpus"], "bundestag",
                     Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"], "abstract",
                     Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]
    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus_without_text(modify_path(path_meta.path))
    def topical_page_rank_pke(cls, corpus: Corpus):
        # define the set of valid Part-of-Speeches
        pos = {'NOUN', 'PROPN', 'ADJ'}
        # define the grammar for selecting the keyphrase candidates
        grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

        # 1. create a TopicalPageRank extractor.
        extractor = pke.unsupervised.TopicalPageRank()

        if corpus.language == Language.DE:
            lan = "de"
        else:
            lan = "en"

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating Topical PageRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            # 3. select the noun phrases as keyphrase candidates.
            extractor.candidate_selection(grammar=grammar)

            # 4. weight the keyphrase candidates using Single Topical PageRank.
            #    Builds a word-graph in which edges connecting two words occurring
            #    in a window are weighted by co-occurrence counts.
            extractor.candidate_weighting(
                window=10, pos=pos,
                lda_model='path/to/lda_model')  # todo: find model

            # 5. get the 10-highest scored candidates as keyphrases
            # keyphrases = extractor.get_n_best(n=top_k)
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases},
            #                        keyword_type=KeywordType.TOPICAL_PAGE_RANK_PKE)
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.TOPICAL_PAGE_RANK_PKE)
Example #18
def encode(content, word_delimiter="|", tag_delimiter="/", num_step=60):
    # Create corpus instance
    corpus = Corpus(word_delimiter=word_delimiter, tag_delimiter=tag_delimiter)

    # Add text to corpus
    corpus.add_text(content)

    # Create index for character and tag
    char_index = index_builder(constant.CHARACTER_LIST,
                               constant.CHAR_START_INDEX)
    tag_index = index_builder(constant.TAG_LIST, constant.TAG_START_INDEX)

    # Generate input
    inb = InputBuilder(corpus, char_index, tag_index, num_step, y_one_hot=False)

    # Display encoded content
    np.set_printoptions(threshold=np.inf)
    print("[Input]")
    print(inb.x)
    print("[Label]")
    print(inb.y)
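index_builder comes from the surrounding project; a plausible minimal version simply maps each symbol to an integer id starting at a given offset, with ids below the offset presumably reserved for padding or unknown symbols.

def index_builder(symbols, start_index):
    # e.g. index_builder(['a', 'b'], 2) -> {'a': 2, 'b': 3}
    return {symbol: i for i, symbol in enumerate(symbols, start=start_index)}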
    def yake_pke(cls, corpus: Corpus):
        # 1. create a YAKE extractor.
        extractor = pke.unsupervised.YAKE()

        if corpus.language == Language.DE:
            lan = "de"
            stop_list = stopwords.words('german')
        else:
            lan = "en"
            stop_list = stopwords.words('english')

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating YAKE"):
            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            # 3. select {1-3}-grams not containing punctuation marks and not
            #    beginning/ending with a stopword as candidates.
            extractor.candidate_selection(n=3, stoplist=stop_list)

            # 4. weight the candidates using YAKE weighting scheme, a window (in
            #    words) for computing left/right contexts can be specified.
            window = 2
            extractor.candidate_weighting(window=window,
                                          stoplist=stop_list,
                                          use_stems=True)

            # 5. get the 10-highest scored candidates as keyphrases.
            #    redundant keyphrases are removed from the output using levenshtein
            #    distance and a threshold.
            threshold = 0.8
            # keyphrases = extractor.get_n_best(n=top_k, threshold=threshold)
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases}, keyword_type=KeywordType.YAKE_PKE)
            keywords[document.doc_id] = extractor.get_n_best(
                n=cls.top_k, threshold=threshold)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.YAKE_PKE)
    def position_rank_pke(cls, corpus: Corpus):
        # define the set of valid Part-of-Speeches
        pos = {'NOUN', 'PROPN', 'ADJ'}
        # define the grammar for selecting the keyphrase candidates
        grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

        # 1. create a PositionRank extractor.
        extractor = pke.unsupervised.PositionRank()

        if corpus.language == Language.DE:
            lan = "de"
        else:
            lan = "en"

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating PositionRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            # 3. select the noun phrases up to 3 words as keyphrase candidates.
            extractor.candidate_selection(grammar=grammar,
                                          maximum_word_number=3)

            # 4. weight the candidates using the sum of their words' scores that are
            #    computed using random walk biased with the position of the words
            #    in the document. In the graph, nodes are words (nouns and
            #    adjectives only) that are connected if they occur in a window of
            #    10 words.
            extractor.candidate_weighting(window=10, pos=pos)

            # 5. get the 10-highest scored candidates as keyphrases
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases}, keyword_type=KeywordType.POSITION_RANK_PKE)
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.POSITION_RANK_PKE)
def main():
    parser = argparse.ArgumentParser(
        description='Extracts keywords for given algorithm on given corpora')
    parser.add_argument('-c',
                        '--corpora',
                        help='Corpora to annotate as list',
                        nargs='+',
                        default=['bundestag', 'abstract'])
    parser.add_argument('-t',
                        '--translate',
                        help='Translate keywords',
                        action='store_true')
    args = vars(parser.parse_args())

    config = ConfigLoader.get_config()

    chosen_corpora = args['corpora']

    PathMetaData = namedtuple('PathMetaData', 'path corpus_name language')
    paths_and_meta_data = [
        PathMetaData(config["corpora"]["state_of_the_union_corpus"],
                     "state_of_the_union", Language.EN),
        PathMetaData(config["corpora"]["bundestag_corpus"], "bundestag",
                     Language.DE),
        PathMetaData(config["corpora"]["abstract_corpus"], "abstract",
                     Language.EN),
        PathMetaData(config["corpora"]["sustainability_corpus"],
                     "sustainability", Language.EN),
        PathMetaData(config["corpora"]["united_nations_corpus"],
                     "united_nations", Language.EN)
    ]

    paths_and_meta_data = [
        path_meta for path_meta in paths_and_meta_data
        if path_meta.corpus_name in chosen_corpora
    ]

    print(f'Yearwise of {chosen_corpora}')

    corpora = [
        Corpus(source=path_meta.path,
               name=path_meta.corpus_name,
               language=path_meta.language)
        for path_meta in paths_and_meta_data
    ]

    corpora = [corpus.year_wise_pseudo_documents() for corpus in corpora]

    for corpus, path_meta in zip(corpora, paths_and_meta_data):
        corpus.save_corpus(modify_path(path_meta.path))
        corpus.save_corpus_without_text(
            modify_path(path_meta.path, without_text=True))
    def topic_rank_pke(cls, corpus: Corpus):
        # define the set of valid Part-of-Speeches
        pos = {'NOUN', 'PROPN', 'ADJ'}

        # 1. create a TopicRank extractor.
        extractor = pke.unsupervised.TopicRank()

        if corpus.language == Language.DE:
            lan = "de"
            stop_list = stopwords.words('german')
        else:
            lan = "en"
            stop_list = stopwords.words('english')

        # extend the stoplist with punctuation and PTB-style bracket tokens once,
        # outside the document loop, so it does not keep growing per document
        stop_list += list(string.punctuation)
        stop_list += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating TopicRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            extractor.candidate_selection(pos=pos, stoplist=stop_list)

            # 4. build topics by grouping candidates with HAC (average linkage,
            #    threshold of 1/4 of shared stems). Weight the topics using random
            #    walk, and select the first occurring candidate from each topic.
            extractor.candidate_weighting(threshold=0.74, method='average')

            # 5. get the 10-highest scored candidates as keyphrases
            # keyphrases = extractor.get_n_best(n=top_k)
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases}, keyword_type=KeywordType.TOPIC_RANK_PKE)
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.TOPIC_RANK_PKE)
Example #23
def build_phrase_model():

    phrase_list = load_phrases()

    phrases = Phrases(Corpus(CORPUS_FILE))
    bigrams = Phraser(phrases)

    bigrams.save(MODEL_FILE)

    years = Corpus(CORPUS_FILE).get_years()
    authors = Corpus(CORPUS_FILE).get_authors()

    with open(OUT_FILE, "w") as f:
        for i, line in tqdm(enumerate(bigrams[Corpus(CORPUS_FILE)])):

            line = remove_under(line)
            line = check_phrase_list(phrase_list, line)

            line = [authors[i]] + line
            line = [years[i]] + line

            f.write("{}\n".format(" ".join(remove_under(line))))
    def multipartite_rank_pke(cls, corpus: Corpus):
        # define the set of valid Part-of-Speeches
        pos = {'NOUN', 'PROPN', 'ADJ'}

        # 1. create a MultipartiteRank extractor.
        extractor = pke.unsupervised.MultipartiteRank()

        if corpus.language == "German":
            lan = "de"
            stop_list = stopwords.words('german')
        else:
            lan = "en"
            stop_list = stopwords.words('english')

        # extend the stoplist with punctuation and PTB-style bracket tokens once,
        # outside the document loop, so it does not keep growing per document
        stop_list += list(string.punctuation)
        stop_list += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating MultipartiteRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            extractor.candidate_selection(pos=pos, stoplist=stop_list)

            # 4. build the Multipartite graph and rank candidates using random walk,
            #    alpha controls the weight adjustment mechanism, see TopicRank for
            #    threshold/method parameters.
            extractor.candidate_weighting(alpha=1.1,
                                          threshold=0.74,
                                          method='average')

            # 5. get the 10-highest scored candidates as keyphrases
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.MULTIPARTITE_RANK_PKE)
Example #25
def main(args):

    ###########################################################################
    # Load the models
    ###########################################################################

    model_args, idx2word, autoencoder, inverter, gan_gen, gan_disc = \
        load_models(args.load_path)

    # Set the random seed manually for reproducibility.
    random.seed(model_args['seed'])
    np.random.seed(model_args['seed'])
    torch.manual_seed(model_args['seed'])
    if torch.cuda.is_available():
        torch.cuda.manual_seed(model_args['seed'])
    else:
        print("Note that our pre-trained models require CUDA to evaluate.")

    ###########################################################################
    # Load data
    ###########################################################################

    corpus = Corpus(model_args['data_path'],
                    maxlen=model_args['maxlen'],
                    vocab_size=model_args['vocab_size'],
                    lowercase=model_args['lowercase'])
    if args.test:
        eval_batch_size = 1
        test_data = batchify(corpus.test, eval_batch_size, shuffle=False)
    else:
        train_data = batchify(corpus.train, model_args['batch_size'], shuffle=True)

    print("Loaded data!")

    ###########################################################################
    # Perturbations
    ###########################################################################

    ring_rng = np.linspace(0., 1., 100)
    n_rng = len(test_data) if args.test else len(train_data)

    for idx in range(n_rng):
        data_batch = test_data[idx] if args.test else train_data[idx]

        for l, r in zip(ring_rng, ring_rng[1:]):

            flg = perturb(data_batch, autoencoder, idx2word,
                          model_args['sample'], model_args['maxlen'],
                          left=l, right=r, n_samples=5, epoch=idx,
                          gpu=model_args['cuda'])
            if flg: break
    def tfidf_pke(cls, corpus: Corpus):
        stop_list = list(string.punctuation)
        # 1. create a TfIdf extractor.
        extractor = pke.unsupervised.TfIdf()
        # 2. load the content of the document.

        if corpus.language == Language.DE:
            lan = "de"
        else:
            lan = "en"
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating TF-IDF PKE"):
            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")
            # 3. select {1-3}-grams not containing punctuation marks as candidates.
            # must link spacy languages to language code
            extractor.candidate_selection(n=3, stoplist=stop_list)

            # pke.compute_document_frequency(input_dir='/path/to/collection/of/documents/',
            #                                output_file='output.tsv.gz',
            #                                extension='xml',
            #                                language='en',
            #                                normalization="lemmatization",
            #                                stoplist=stop_list)
            #
            # # 4. weight the candidates using a `tf` x `idf`
            # df = pke.load_document_frequency_file(input_file='output.tsv.gz')
            #
            # extractor.candidate_weighting(df=df)
            extractor.candidate_weighting()
            # 5. get the 10-highest scored candidates as keyphrases
            # keyphrases = extractor.get_n_best(n=top_k)
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases}, keyword_type=KeywordType.TFIDF_PKE)
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.TFIDF_PKE)
def cleaning_un(config, overwrite=True):
    corpus = Corpus(source=config["corpora"]["united_nations_corpus"],
                    language=Language.EN,
                    name="united_nations_corpus")
    corpus = Corpus(source=[d for d in corpus.get_documents() if d.date],
                    language=corpus.language,
                    name=corpus.name)
    print("1", len(corpus))
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))

    if not overwrite:
        os.rename(src=config["corpora"]["united_nations_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["united_nations_corpus"]))

    corpus.save_corpus(config["corpora"]["united_nations_corpus"])
def cleaning_bundestag(config, overwrite=True):
    corpus = Corpus(source=config["corpora"]["bundestag_corpus"],
                    language=Language.DE,
                    name="bundestag_corpus")
    # corpus = DataHandler.load_corpus(config["corpora"]["bundestag_corpus"])
    corpus = Corpus(source=[d for d in corpus.get_documents() if d.date],
                    language=corpus.language,
                    name=corpus.name)
    print("1", len(corpus))
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))

    if not overwrite:
        os.rename(src=config["corpora"]["bundestag_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["bundestag_corpus"]))

    corpus.save_corpus(config["corpora"]["bundestag_corpus"])
def cleaning_abstracts(config, overwrite=True):
    corpus = Corpus(source=config["corpora"]["abstract_corpus"],
                    language=Language.EN,
                    name="abstract_corpus")
    # corpus = DataHandler.load_corpus(config["corpora"]["abstract_corpus"])
    print("1", len(corpus))
    corpus = Corpus([
        d for d in corpus.get_documents()
        if d.date and len(str(d.date)) == 4 and d.date.isnumeric()
    ],
                    name=corpus.name,
                    language=Language.EN)
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))

    if not overwrite:
        os.rename(src=config["corpora"]["abstract_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["abstract_corpus"]))

    corpus.save_corpus(config["corpora"]["abstract_corpus"])
Example #30
def data_generator(args):
    file, testfile, valfile = getattr(observations, args.dataset)('data/')
    # Just replace <eos> with another unusual character here (one that does
    # not occur in PTB)
    file = file.replace('<eos>', chr(255))
    testfile = testfile.replace('<eos>', chr(255))
    valfile = valfile.replace('<eos>', chr(255))
    file_len = len(file)
    valfile_len = len(valfile)
    testfile_len = len(testfile)

    ############################################################
    # Use the following if you want to pickle the loaded data

    pickle_name = "{0}.corpus".format(args.dataset)
    if os.path.exists(pickle_name):
        print("Loading cached data...")
        corpus = pickle.load(open(pickle_name, 'rb'))
    else:
        corpus = Corpus(file + " " + valfile + " " + testfile)
        pickle.dump(corpus, open(pickle_name, 'wb'))
    ############################################################

    return file, file_len, valfile, valfile_len, testfile, testfile_len, corpus