    def tfidf_skl(cls, corpus: Corpus):
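        """Rank terms per document with scikit-learn's TfidfVectorizer and assign
        the top_k highest-weighted terms as KeywordType.TFIDF_SKL keywords."""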
        if corpus.language == Language.EN:
            stop_words = stopwords.words("english")
        elif corpus.language == Language.DE:
            stop_words = stopwords.words("german")
        else:
            raise UserWarning("No stopwords for language!")

        tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
                                           ngram_range=(cls.min_nrgam,
                                                        cls.max_ngram),
                                           min_df=2)
        documents = corpus.get_documents(as_list=True)
        tfidf_matrix = tfidf_vectorizer.fit_transform(
            [document.text for document in documents])
        doc_id_lookup = {
            i: document.doc_id
            for i, document in enumerate(documents)
        }

        # `get_feature_names` was removed in newer scikit-learn releases;
        # prefer `get_feature_names_out` and fall back for older versions.
        try:
            features = tfidf_vectorizer.get_feature_names_out()
        except AttributeError:
            features = tfidf_vectorizer.get_feature_names()

        keywords = {}
        for i, doc in tqdm(enumerate(tfidf_matrix),
                           desc="Calculating tf-idf",
                           total=tfidf_matrix.shape[0]):
            df = pd.DataFrame(doc.T.todense(),
                              index=features,
                              columns=["tfidf"])
            top_key_words = df.sort_values(by=["tfidf"],
                                           ascending=False)[:cls.top_k]
            keywords[doc_id_lookup[i]] = list(top_key_words.index)

        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.TFIDF_SKL)
def count_non_years(corpus: Corpus):
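    """Print the number of documents whose date is not a plain 4-digit year and
    the ratio 'documents without a date / documents with a date'."""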
    without_year = [d for d in corpus.get_documents() if d.date is None]
    print(
        len([
            d.date for d in corpus.get_documents()
            if d.date and len(str(d.date)) != 4
        ]))
    with_year = [d for d in corpus.get_documents() if d.date]
    print(f'{len(without_year)} / {len(with_year)}')
    def single_rank_pke(cls, corpus: Corpus):
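        """Extract keyphrases with pke's SingleRank and assign the top_k best
        candidates per document as KeywordType.SINGLE_RANK_PKE keywords."""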
        # define the set of valid part-of-speech tags
        pos = {'NOUN', 'PROPN', 'ADJ'}
        # 1. create a SingleRank extractor.
        extractor = pke.unsupervised.SingleRank()

        if corpus.language == Language.DE:
            lan = "de"
        else:
            lan = "en"

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating SingleRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            # 3. select the longest sequences of nouns and adjectives as candidates.
            extractor.candidate_selection(pos=pos)

            # 4. weight the candidates using the sum of their words' scores, which
            #    are computed using a random walk. In the graph, nodes are words of
            #    a certain part-of-speech (nouns and adjectives) that are connected
            #    if they occur within a window of 10 words.
            extractor.candidate_weighting(window=10, pos=pos)

            # 5. get the 10-highest scored candidates as keyphrases
            # keyphrases = extractor.get_n_best(n=top_k)
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases}, keyword_type=KeywordType.SINGLE_RANK_PKE)
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.SINGLE_RANK_PKE)
    def text_rank_pke(cls, corpus: Corpus):
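        """Extract keyphrases with pke's TextRank and assign the top_k best
        candidates per document as KeywordType.TEXT_RANK_PKE keywords."""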
        # define the set of valid part-of-speech tags
        pos = {'NOUN', 'PROPN', 'ADJ'}
        # 1. create a TextRank extractor.
        extractor = pke.unsupervised.TextRank()

        if corpus.language == Language.DE:
            lan = "de"
        else:
            lan = "en"

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating TextRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            # 3. build the graph representation of the document and rank the words.
            #    Keyphrase candidates are composed from the 33-percent
            #    highest-ranked words.
            extractor.candidate_weighting(window=2, pos=pos, top_percent=0.33)

            # 4. get the 10-highest scored candidates as keyphrases
            # keyphrases = extractor.get_n_best(n=top_k)

            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)

        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.TEXT_RANK_PKE)
def parse_and_preprocess_src(data_source, corpus_destination, preprocess=True):
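    """Load the raw corpus matching data_source, optionally preprocess it, wrap
    it in a Corpus and save the result to corpus_destination."""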
    if re.search("bundestag", data_source.lower()):
        name = "bundestag"
        raw_corpus = DataHandler.get_bundestag_speeches(directory=data_source)
    elif re.search("sustainability", data_source.lower()):
        name = "sustainability"
        raw_corpus = DataHandler.get_sustainability_data(path=data_source)
    elif re.search("unv1.0-tei", data_source.lower()):
        name = "united_nations"
        raw_corpus = DataHandler.get_un_texts(directory=data_source)
    elif re.search("state_of_the_union", data_source.lower()):
        name = "state_of_the_union"
        raw_corpus = DataHandler.get_state_of_the_union(directory=data_source)
    else:
        name = "abstracts"
        raw_corpus = DataHandler.get_abstracts(path=data_source)

    language = raw_corpus[0].language
    print('loaded', len(raw_corpus), 'documents')
    if preprocess:
        Preprocessor.preprocess(raw_corpus, language=language)
        print('preprocessed', len(raw_corpus), 'documents')
    corpus = Corpus(source=raw_corpus, language=language, name=name)
    print('parsed', len(corpus.get_documents(as_list=True)),
          'documents to a Corpus')
    corpus.save_corpus(corpus_destination)
def yearwise_documents(corpus: Corpus,
                       aggregation_func: Callable = len,
                       printing: bool = False,
                       as_dict: bool = False):
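    """Group documents by year, apply aggregation_func to each yearly sub-corpus
    and return either a {year: value} dict or two aligned (years, values) lists."""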
    year_bins = defaultdict(list)

    for doc in corpus.get_documents():
        year_bins[doc.date].append(doc)

    result = {
        year: aggregation_func(
            Corpus(source=docs,
                   language=corpus.language,
                   name=f'{corpus.name}_yearwise'))
        for year, docs in year_bins.items() if year is not None
    }
    result = OrderedDict(sorted(result.items()))

    if as_dict:
        return result

    years = []
    counts = []
    for year, count in result.items():
        years.append(year)
        counts.append(count)
        if printing:
            print(f'{year}: {count}')

    # print(years)
    # print(counts)
    return years, counts
def cleaning_un(config, overwrite=True):
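    """Drop documents without a date from the United Nations corpus, cast the
    remaining dates to int and save the cleaned corpus (optionally keeping the
    uncleaned file under a new name)."""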
    corpus = Corpus(source=config["corpora"]["united_nations_corpus"],
                    language=Language.DE,
                    name="united_nations_corpus")
    corpus = Corpus(source=[d for d in corpus.get_documents() if d.date],
                    language=corpus.language,
                    name=corpus.name)
    print("1", len(corpus))
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))

    if not overwrite:
        os.rename(src=config["corpora"]["united_nations_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["united_nations_corpus"]))

    corpus.save_corpus(config["corpora"]["united_nations_corpus"])
def cleaning_authors(config, overwrite=False):
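    """Normalise the author field of the configured corpora: NaN values become
    None and comma-separated author strings are split and paired into single
    author entries; the cleaned corpora are saved afterwards."""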
    corpus_names = [
        "bundestag_corpus",
        # "sustainability_corpus",
        # "abstract_corpus"
    ]
    languages = [Language.DE, Language.EN, Language.EN]
    wlc = 0
    m_a = 0
    s_a = 0
    for i, corpus_name in enumerate(corpus_names):
        corpus = Corpus(source=config["corpora"][corpus_name],
                        language=languages[i],
                        name=corpus_name)
        # corpus = DataHandler.load_corpus(config["corpora"][corpus_name])
        for d in corpus.get_documents():
            if d.author:
                if isinstance(d.author, float) and np.isnan(d.author):
                    d.author = None
                else:
                    if corpus_name == "bundestag_corpus":
                        authors = [d.author]
                    elif corpus_name == "sustainability_corpus":
                        if isinstance(d.author, str):
                            authors = [a.strip() for a in d.author.split(',')]
                            authors = [
                                f'{j}. {i}'
                                for i, j in zip(authors[::2], authors[1::2])
                            ]
                        else:
                            authors = d.author
                    else:
                        if d.language != "English":
                            wlc += 1
                            continue
                        if isinstance(d.author, str):
                            authors = [a.strip() for a in d.author.split(',')]
                            authors = [
                                f'{j}. {i}'
                                for i, j in zip(authors[::2], authors[1::2])
                            ]
                        else:
                            authors = d.author
                        if len(authors) > 1:
                            m_a += 1
                            print(d.author, authors)
                        else:
                            s_a += 1
                    d.author = authors

        if not overwrite:
            os.rename(src=config["corpora"][corpus_name],
                      dst=create_new_filepath_uncleaned(
                          config["corpora"][corpus_name]))

        corpus.save_corpus(config["corpora"][corpus_name])
    print(wlc, m_a, s_a)
def cleaning_bundestag(config, overwrite=True):
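    """Drop documents without a date from the Bundestag corpus, cast the
    remaining dates to int and save the cleaned corpus."""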
    corpus = Corpus(source=config["corpora"]["bundestag_corpus"],
                    language=Language.DE,
                    name="bundestag_corpus")
    # corpus = DataHandler.load_corpus(config["corpora"]["bundestag_corpus"])
    corpus = Corpus(source=[d for d in corpus.get_documents() if d.date],
                    language=corpus.language,
                    name=corpus.name)
    print("1", len(corpus))
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))

    if not overwrite:
        os.rename(src=config["corpora"]["bundestag_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["bundestag_corpus"]))

    corpus.save_corpus(config["corpora"]["bundestag_corpus"])
def cleaning_abstracts(config, overwrite=True):
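    """Keep only abstracts whose date is a 4-digit year, cast those dates to int
    and save the cleaned corpus."""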
    corpus = Corpus(source=config["corpora"]["abstract_corpus"],
                    language=Language.EN,
                    name="abstract_corpus")
    # corpus = DataHandler.load_corpus(config["corpora"]["abstract_corpus"])
    print("1", len(corpus))
    corpus = Corpus([
        d for d in corpus.get_documents()
        if d.date and len(str(d.date)) == 4 and str(d.date).isnumeric()
    ],
                    name=corpus.name,
                    language=Language.EN)
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))

    if not overwrite:
        os.rename(src=config["corpora"]["abstract_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["abstract_corpus"]))

    corpus.save_corpus(config["corpora"]["abstract_corpus"])
    def topical_page_rank_pke(cls, corpus: Corpus):
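        """Extract keyphrases with pke's TopicalPageRank and assign the top_k best
        candidates per document as KeywordType.TOPICAL_PAGE_RANK_PKE keywords."""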
        # define the set of valid part-of-speech tags
        pos = {'NOUN', 'PROPN', 'ADJ'}
        # define the grammar for selecting the keyphrase candidates
        grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

        # 1. create a TopicalPageRank extractor.
        extractor = pke.unsupervised.TopicalPageRank()

        if corpus.language == Language.DE:
            lan = "de"
        else:
            lan = "en"

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating Topical PageRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            # 3. select the noun phrases as keyphrase candidates.
            extractor.candidate_selection(grammar=grammar)

            # 4. weight the keyphrase candidates using Single Topical PageRank.
            #    Builds a word-graph in which edges connecting two words occurring
            #    in a window are weighted by co-occurrence counts.
            extractor.candidate_weighting(
                window=10, pos=pos,
                lda_model='path/to/lda_model')  # todo: find model

            # 5. get the 10-highest scored candidates as keyphrases
            # keyphrases = extractor.get_n_best(n=top_k)
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases},
            #                        keyword_type=KeywordType.TOPICAL_PAGE_RANK_PKE)
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.TOPICAL_PAGE_RANK_PKE)
    def yake_pke(cls, corpus: Corpus):
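        """Extract keyphrases with pke's YAKE and assign the top_k best
        candidates per document as KeywordType.YAKE_PKE keywords."""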
        # 1. create a YAKE extractor.
        extractor = pke.unsupervised.YAKE()

        if corpus.language == Language.DE:
            lan = "de"
            stop_list = stopwords.words('german')
        else:
            lan = "en"
            stop_list = stopwords.words('english')

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating YAKE"):
            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            # 3. select {1-3}-grams not containing punctuation marks and not
            #    beginning/ending with a stopword as candidates.
            extractor.candidate_selection(n=3, stoplist=stop_list)

            # 4. weight the candidates using YAKE weighting scheme, a window (in
            #    words) for computing left/right contexts can be specified.
            window = 2
            extractor.candidate_weighting(window=window,
                                          stoplist=stop_list,
                                          use_stems=True)

            # 5. get the 10-highest scored candidates as keyphrases.
            #    Redundant keyphrases are removed from the output using Levenshtein
            #    distance and a threshold.
            threshold = 0.8
            # keyphrases = extractor.get_n_best(n=top_k, threshold=threshold)
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases}, keyword_type=KeywordType.YAKE_PKE)
            keywords[document.doc_id] = extractor.get_n_best(
                n=cls.top_k, threshold=threshold)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.YAKE_PKE)
    def position_rank_pke(cls, corpus: Corpus):
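        """Extract keyphrases with pke's PositionRank and assign the top_k best
        candidates per document as KeywordType.POSITION_RANK_PKE keywords."""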
        # define the set of valid part-of-speech tags
        pos = {'NOUN', 'PROPN', 'ADJ'}
        # define the grammar for selecting the keyphrase candidates
        grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

        # 1. create a PositionRank extractor.
        extractor = pke.unsupervised.PositionRank()

        if corpus.language == Language.DE:
            lan = "de"
        else:
            lan = "en"

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating PositionRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            # 3. select the noun phrases up to 3 words as keyphrase candidates.
            extractor.candidate_selection(grammar=grammar,
                                          maximum_word_number=3)

            # 4. weight the candidates using the sum of their words' scores, which
            #    are computed using a random walk biased by the position of the
            #    words in the document. In the graph, nodes are words (nouns and
            #    adjectives only) that are connected if they occur within a window
            #    of 10 words.
            extractor.candidate_weighting(window=10, pos=pos)

            # 5. get the 10-highest scored candidates as keyphrases
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases}, keyword_type=KeywordType.POSITION_RANK_PKE)
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.POSITION_RANK_PKE)
    def topic_rank_pke(cls, corpus: Corpus):
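        """Extract keyphrases with pke's TopicRank and assign the top_k best
        candidates per document as KeywordType.TOPIC_RANK_PKE keywords."""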
        # define the set of valid part-of-speech tags
        pos = {'NOUN', 'PROPN', 'ADJ'}

        # 1. create a TopicRank extractor.
        extractor = pke.unsupervised.TopicRank()

        if corpus.language == Language.DE:
            lan = "de"
            stop_list = stopwords.words('german')
        else:
            lan = "en"
            stop_list = stopwords.words('english')

        # extend the stoplist once with punctuation and PTB-style bracket tokens
        stop_list += list(string.punctuation)
        stop_list += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating TopicRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            extractor.candidate_selection(pos=pos, stoplist=stop_list)

            # 4. build topics by grouping candidates with HAC (average linkage,
            #    threshold of 1/4 of shared stems). Weight the topics using random
            #    walk, and select the first occurring candidate from each topic.
            extractor.candidate_weighting(threshold=0.74, method='average')

            # 5. get the 10-highest scored candidates as keyphrases
            # keyphrases = extractor.get_n_best(n=top_k)
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases}, keyword_type=KeywordType.TOPIC_RANK_PKE)
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.TOPIC_RANK_PKE)
    def multipartite_rank_pke(cls, corpus: Corpus):
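        """Extract keyphrases with pke's MultipartiteRank and assign the top_k
        best candidates per document as KeywordType.MULTIPARTITE_RANK_PKE keywords."""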
        # define the set of valid part-of-speech tags
        pos = {'NOUN', 'PROPN', 'ADJ'}

        # 1. create a MultipartiteRank extractor.
        extractor = pke.unsupervised.MultipartiteRank()

        if corpus.language == Language.DE:
            lan = "de"
            stop_list = stopwords.words('german')
        else:
            lan = "en"
            stop_list = stopwords.words('english')

        # extend the stoplist once with punctuation and PTB-style bracket tokens
        stop_list += list(string.punctuation)
        stop_list += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

        # 2. load the content of the document.
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating MultipartiteRank"):

            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")

            extractor.candidate_selection(pos=pos, stoplist=stop_list)

            # 4. build the Multipartite graph and rank candidates using random walk,
            #    alpha controls the weight adjustment mechanism, see TopicRank for
            #    threshold/method parameters.
            extractor.candidate_weighting(alpha=1.1,
                                          threshold=0.74,
                                          method='average')

            # 5. get the 10-highest scored candidates as keyphrases
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.MULTIPARTITE_RANK_PKE)
    def tfidf_pke(cls, corpus: Corpus):
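        """Extract keyphrases with pke's TfIdf model and assign the top_k best
        candidates per document as KeywordType.TFIDF_PKE keywords."""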
        stop_list = list(string.punctuation)
        # 1. create a TfIdf extractor.
        extractor = pke.unsupervised.TfIdf()
        # 2. load the content of the document.

        if corpus.language == Language.DE:
            lan = "de"
        else:
            lan = "en"
        keywords = {}
        for document in tqdm(corpus.get_documents(as_list=True),
                             desc="Calculating TF-IDF PKE"):
            extractor.load_document(input=document.text,
                                    language=lan,
                                    normalization="lemmatization")
            # 3. select {1-3}-grams not containing punctuation marks as candidates.
            # note: a spaCy model must be linked to the language code used above
            extractor.candidate_selection(n=3, stoplist=stop_list)

            # pke.compute_document_frequency(input_dir='/path/to/collection/of/documents/',
            #                                output_file='output.tsv.gz',
            #                                extension='xml',
            #                                language='en',
            #                                normalization="lemmatization",
            #                                stoplist=stop_list)
            #
            # # 4. weight the candidates using a `tf` x `idf`
            # df = pke.load_document_frequency_file(input_file='output.tsv.gz')
            #
            # extractor.candidate_weighting(df=df)
            extractor.candidate_weighting()
            # 5. get the 10-highest scored candidates as keyphrases
            # keyphrases = extractor.get_n_best(n=top_k)
            # corpus.assign_keywords(keywords={document.doc_id: keyphrases}, keyword_type=KeywordType.TFIDF_PKE)
            keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
        corpus.assign_keywords(keywords=keywords,
                               keyword_type=KeywordType.TFIDF_PKE)
def main():
    # load configuration parameters from config file
    config = ConfigLoader.get_config()

    # corpus = Corpus(source=config["corpora"]["abstract_corpus"], language=Language.EN, name="abstract_corpus")
    # corpus = Corpus(source=config["corpora"]["bundestag_corpus"], language=Language.DE, name="bundestag_corpus")
    corpus = Corpus(source=config["corpora"]["sustainability_corpus"],
                    language=Language.EN,
                    name="sustainability_corpus")

    # print(len(corpus))
    # test = DocumentsFilter.filter(corpus, has_tags=['test'])
    # print(set([x.tags for x in test]))
    # print(len(test))
    #
    # exit(0)

    corpus = corpus.get_n_documents_as_corpus(n=100)

    # build yearwise pseudo documents

    pseudo_corpus = corpus.year_wise_pseudo_documents()
    # extract keywords
    KeyPhraseExtractor.tfidf_skl(corpus=pseudo_corpus)
    print([d.keywords for d in pseudo_corpus.get_documents()])

    KeyPhraseExtractor.rake(corpus=corpus)
    print([d.keywords for d in corpus.get_documents()])
    # key_words_post = Document.group_keywords_year_wise(corpus)
    # key_words_pre = Document.transform_pseudo_docs_keywords_to_dict(KeyPhraseExtractor.rake(documents=pseudo_corpus))

    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_post_group, 10))
    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_pre_group, 10))
    # format: {year -> list of keywords}

    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])

    counter = 0
    for doc in corpus.get_documents():
        for keyword in doc.keywords:
            if counter > 100:
                break
            kwt.translate(keyword)
            print(keyword)
            counter += 1
        break

    print('extracting keywords with rake ...')
    rake_keywords = KeyPhraseExtractor.rake(corpus=corpus.get_documents()[0])
    rake_keywords_keys = list(rake_keywords.keys())
    print('rake keywords dict keys:', rake_keywords_keys)

    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])
    list_of_keywords = []

    for k in rake_keywords[rake_keywords_keys[0]]:
        kw = Keyword(german_translation=k, keyword_type=KeywordType.RAKE)
        kwt.translate(kw)
        list_of_keywords.append(kw)
        print('{} \t {} \t\t\t {}'.format(kw.source_language,
                                          kw.english_translation,
                                          kw.german_translation))
def remove_punctuation(corpus: Corpus):
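    """Remove every character except letters (including German umlauts and ß),
    hyphens, whitespace and the sentence marks .!? from each document's text,
    then collapse runs of spaces."""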
    for d in corpus.get_documents():
        res = re.sub(r"[^a-zA-ZäöüÖÄÜß\-\s.!?]", '', d.text)
        res = re.sub(r" +", ' ', res)
        d.text = res