def tfidf_skl(cls, corpus: Corpus):
    # pick the stopword list matching the corpus language
    if corpus.language == Language.EN:
        stop_words = stopwords.words("english")
    elif corpus.language == Language.DE:
        stop_words = stopwords.words("german")
    else:
        raise UserWarning("No stopwords for language!")

    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
                                       ngram_range=(cls.min_nrgam, cls.max_ngram),
                                       min_df=2)
    tfidf_matrix = tfidf_vectorizer.fit_transform(
        [document.text for document in corpus.get_documents(as_list=True)])
    doc_id_lookup = {
        i: document.doc_id
        for i, document in enumerate(corpus.get_documents(as_list=True))
    }
    # note: get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out()
    features = tfidf_vectorizer.get_feature_names()
    keywords = {}
    for i, doc in tqdm(enumerate(tfidf_matrix),
                       desc="Calculating tf-idf",
                       total=tfidf_matrix.shape[0]):
        # rank this document's terms by their tf-idf score and keep the top k
        df = pd.DataFrame(doc.T.todense(), index=features, columns=["tfidf"])
        top_key_words = df.sort_values(by=["tfidf"], ascending=False)[:cls.top_k]
        keywords[doc_id_lookup[i]] = list(top_key_words.index)
    corpus.assign_keywords(keywords=keywords, keyword_type=KeywordType.TFIDF_SKL)
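# Illustrative sketch, not part of the original pipeline: the loop above densifies
# every tf-idf row into a pandas DataFrame before sorting. The helper below shows
# the same top-k selection on a toy corpus using only the sparse row and numpy.
# The function name, the example texts, and the top_k default are assumptions made
# for demonstration.
def _tfidf_topk_sketch(texts=("solar energy policy", "energy policy debate"), top_k=3):
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    matrix = vectorizer.fit_transform(list(texts))
    features = vectorizer.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
    top_terms = {}
    for row_index in range(matrix.shape[0]):
        # read the scores of one document straight from the sparse matrix row
        row = matrix.getrow(row_index).toarray().ravel()
        ranked = np.argsort(row)[::-1][:top_k]
        top_terms[row_index] = [features[j] for j in ranked if row[j] > 0]
    return top_terms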
def count_non_years(corpus: Corpus):
    # documents without any date at all
    without_year = [d for d in corpus.get_documents() if d.date is None]
    # dated documents whose date is not a plain four-digit year
    print(len([
        d.date for d in corpus.get_documents()
        if d.date and len(str(d.date)) != 4
    ]))
    with_year = [d for d in corpus.get_documents() if d.date]
    print(f'{len(without_year)} / {len(with_year)}')
def single_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # 1. create a SingleRank extractor.
    extractor = pke.unsupervised.SingleRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating SingleRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")
        # 3. select the longest sequences of nouns and adjectives as candidates.
        extractor.candidate_selection(pos=pos)
        # 4. weight the candidates using the sum of their words' scores, which are
        #    computed via random walk. In the graph, nodes are words of certain
        #    parts of speech (nouns and adjectives) that are connected if they
        #    occur within a window of 10 words.
        extractor.candidate_weighting(window=10, pos=pos)
        # 5. get the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.SINGLE_RANK_PKE)
def text_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # 1. create a TextRank extractor.
    extractor = pke.unsupervised.TextRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating TextRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")
        # 3. build the graph representation of the document and rank the words.
        #    Keyphrase candidates are composed from the 33 percent
        #    highest-ranked words.
        extractor.candidate_weighting(window=2, pos=pos, top_percent=0.33)
        # 4. get the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TEXT_RANK_PKE)
def parse_and_preprocess_src(data_source, corpus_destination, preprocess=True):
    if re.search("bundestag", data_source.lower()):
        name = "bundestag"
        raw_corpus = DataHandler.get_bundestag_speeches(directory=data_source)
    elif re.search("sustainability", data_source.lower()):
        name = "sustainability"
        raw_corpus = DataHandler.get_sustainability_data(path=data_source)
    elif re.search("unv1.0-tei", data_source.lower()):
        name = "united_nations"
        raw_corpus = DataHandler.get_un_texts(directory=data_source)
    elif re.search("state_of_the_union", data_source.lower()):
        name = "state_of_the_union"
        raw_corpus = DataHandler.get_state_of_the_union(directory=data_source)
    else:
        name = "abstracts"
        raw_corpus = DataHandler.get_abstracts(path=data_source)
    language = raw_corpus[0].language
    print('loaded', len(raw_corpus), 'documents')

    if preprocess:
        Preprocessor.preprocess(raw_corpus, language=language)
        print('preprocessed', len(raw_corpus), 'documents')

    corpus = Corpus(source=raw_corpus, language=language, name=name)
    print('parsed', len(corpus.get_documents(as_list=True)), 'documents to a Corpus')
    corpus.save_corpus(corpus_destination)
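# Hypothetical usage of parse_and_preprocess_src (the paths below are placeholders,
# not actual files of this repository). The data_source string is matched against
# the known source names ("bundestag", "sustainability", "unv1.0-tei",
# "state_of_the_union"); anything else is treated as the abstracts dataset.
#
#     parse_and_preprocess_src(data_source="data/bundestag_speeches",
#                              corpus_destination="data/bundestag_corpus.json")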
def yearwise_documents(corpus: Corpus,
                       aggregation_func: Callable = len,
                       printing: bool = False,
                       as_dict: bool = False):
    year_bins = defaultdict(list)
    for doc in corpus.get_documents():
        year_bins[doc.date].append(doc)
    result = {
        year: aggregation_func(
            Corpus(source=docs,
                   language=corpus.language,
                   name=f'{corpus.name}_yearwise'))
        for year, docs in year_bins.items() if year is not None
    }
    result = OrderedDict(sorted(result.items()))
    if as_dict:
        return result

    years = []
    counts = []
    for year, count in result.items():
        years.append(year)
        counts.append(count)
        if printing:
            print(f'{year}: {count}')
    # print(years)
    # print(counts)
    return years, counts
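# Hedged usage sketch for yearwise_documents: given an already loaded Corpus
# (assumed here), the default aggregation (len) yields document counts per year,
# which can be plotted directly. matplotlib is an assumption of this sketch, not
# an import of the module above.
def _plot_documents_per_year_sketch(corpus: Corpus):
    import matplotlib.pyplot as plt

    years, counts = yearwise_documents(corpus, aggregation_func=len)
    plt.bar(years, counts)
    plt.xlabel("year")
    plt.ylabel("documents")
    plt.title(f"Documents per year: {corpus.name}")
    plt.show()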
def cleaning_un(config, overwrite=True):
    corpus = Corpus(source=config["corpora"]["united_nations_corpus"],
                    language=Language.DE,
                    name="united_nations_corpus")
    corpus = Corpus(source=[d for d in corpus.get_documents() if d.date],
                    language=corpus.language,
                    name=corpus.name)
    print("1", len(corpus))
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))
    if not overwrite:
        os.rename(src=config["corpora"]["united_nations_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["united_nations_corpus"]))
    corpus.save_corpus(config["corpora"]["united_nations_corpus"])
def cleaning_authors(config, overwrite=False):
    corpus_names = [
        "bundestag_corpus",
        # "sustainability_corpus",
        # "abstract_corpus"
    ]
    languages = [Language.DE, Language.EN, Language.EN]
    wlc = 0  # documents skipped because of a non-English language
    m_a = 0  # documents with multiple authors
    s_a = 0  # documents with a single author
    for i, corpus_name in enumerate(corpus_names):
        corpus = Corpus(source=config["corpora"][corpus_name],
                        language=languages[i],
                        name=corpus_name)
        # corpus = DataHandler.load_corpus(config["corpora"][corpus_name])
        for d in corpus.get_documents():
            if d.author:
                if isinstance(d.author, float) and np.isnan(d.author):
                    d.author = None
                else:
                    if corpus_name == "bundestag_corpus":
                        authors = [d.author]
                    elif corpus_name == "sustainability_corpus":
                        if isinstance(d.author, str):
                            # recombine "Last, F." pairs into "F. Last"
                            authors = [a.strip() for a in d.author.split(',')]
                            authors = [
                                f'{j}. {i}'
                                for i, j in zip(authors[::2], authors[1::2])
                            ]
                        else:
                            authors = d.author
                    else:
                        if d.language != "English":
                            wlc += 1
                            continue
                        if isinstance(d.author, str):
                            authors = [a.strip() for a in d.author.split(',')]
                            authors = [
                                f'{j}. {i}'
                                for i, j in zip(authors[::2], authors[1::2])
                            ]
                        else:
                            authors = d.author
                    if len(authors) > 1:
                        m_a += 1
                        print(d.author, authors)
                    else:
                        s_a += 1
                    d.author = authors
        if not overwrite:
            os.rename(src=config["corpora"][corpus_name],
                      dst=create_new_filepath_uncleaned(
                          config["corpora"][corpus_name]))
        corpus.save_corpus(config["corpora"][corpus_name])
    print(wlc, m_a, s_a)
def cleaning_bundestag(config, overwrite=True):
    corpus = Corpus(source=config["corpora"]["bundestag_corpus"],
                    language=Language.DE,
                    name="bundestag_corpus")
    # corpus = DataHandler.load_corpus(config["corpora"]["bundestag_corpus"])
    corpus = Corpus(source=[d for d in corpus.get_documents() if d.date],
                    language=corpus.language,
                    name=corpus.name)
    print("1", len(corpus))
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))
    if not overwrite:
        os.rename(src=config["corpora"]["bundestag_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["bundestag_corpus"]))
    corpus.save_corpus(config["corpora"]["bundestag_corpus"])
def cleaning_abstracts(config, overwrite=True):
    corpus = Corpus(source=config["corpora"]["abstract_corpus"],
                    language=Language.EN,
                    name="abstract_corpus")
    # corpus = DataHandler.load_corpus(config["corpora"]["abstract_corpus"])
    print("1", len(corpus))
    corpus = Corpus([
        d for d in corpus.get_documents()
        if d.date and len(str(d.date)) == 4 and d.date.isnumeric()
    ],
                    name=corpus.name,
                    language=Language.EN)
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))
    if not overwrite:
        os.rename(src=config["corpora"]["abstract_corpus"],
                  dst=create_new_filepath_uncleaned(
                      config["corpora"]["abstract_corpus"]))
    corpus.save_corpus(config["corpora"]["abstract_corpus"])
def topical_page_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # define the grammar for selecting the keyphrase candidates
    grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
    # 1. create a TopicalPageRank extractor.
    extractor = pke.unsupervised.TopicalPageRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating Topical PageRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")
        # 3. select the noun phrases as keyphrase candidates.
        extractor.candidate_selection(grammar=grammar)
        # 4. weight the keyphrase candidates using Single Topical PageRank.
        #    Builds a word graph in which edges connecting two words occurring
        #    in a window are weighted by co-occurrence counts.
        extractor.candidate_weighting(window=10,
                                      pos=pos,
                                      lda_model='path/to/lda_model')  # todo: find model
        # 5. get the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TOPICAL_PAGE_RANK_PKE)
def yake_pke(cls, corpus: Corpus):
    # 1. create a YAKE extractor.
    extractor = pke.unsupervised.YAKE()
    if corpus.language == Language.DE:
        lan = "de"
        stop_list = stopwords.words('german')
    else:
        lan = "en"
        stop_list = stopwords.words('english')
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating YAKE"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")
        # 3. select {1-3}-grams not containing punctuation marks and not
        #    beginning/ending with a stopword as candidates.
        extractor.candidate_selection(n=3, stoplist=stop_list)
        # 4. weight the candidates using the YAKE weighting scheme; a window (in
        #    words) for computing left/right contexts can be specified.
        window = 2
        extractor.candidate_weighting(window=window,
                                      stoplist=stop_list,
                                      use_stems=True)
        # 5. get the top_k highest-scored candidates as keyphrases. Redundant
        #    keyphrases are removed from the output using Levenshtein distance
        #    and a threshold.
        threshold = 0.8
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k,
                                                         threshold=threshold)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.YAKE_PKE)
def position_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # define the grammar for selecting the keyphrase candidates
    grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
    # 1. create a PositionRank extractor.
    extractor = pke.unsupervised.PositionRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating PositionRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")
        # 3. select the noun phrases up to 3 words as keyphrase candidates.
        extractor.candidate_selection(grammar=grammar, maximum_word_number=3)
        # 4. weight the candidates using the sum of their words' scores, which are
        #    computed via a random walk biased by the position of the words in the
        #    document. In the graph, nodes are words (nouns and adjectives only)
        #    that are connected if they occur within a window of 10 words.
        extractor.candidate_weighting(window=10, pos=pos)
        # 5. get the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.POSITION_RANK_PKE)
def topic_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # 1. create a TopicRank extractor.
    extractor = pke.unsupervised.TopicRank()
    if corpus.language == Language.DE:
        lan = "de"
        stop_list = stopwords.words('german')
    else:
        lan = "en"
        stop_list = stopwords.words('english')
    # extend the stoplist once, not once per document
    stop_list += list(string.punctuation)
    stop_list += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating TopicRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")
        # 3. select the longest sequences of nouns and adjectives, excluding
        #    stoplisted words, as candidates.
        extractor.candidate_selection(pos=pos, stoplist=stop_list)
        # 4. build topics by grouping candidates with HAC (average linkage,
        #    threshold of 1/4 of shared stems). Weight the topics using random
        #    walk, and select the first occurring candidate from each topic.
        extractor.candidate_weighting(threshold=0.74, method='average')
        # 5. get the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TOPIC_RANK_PKE)
def multipartite_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # 1. create a MultipartiteRank extractor.
    extractor = pke.unsupervised.MultipartiteRank()
    if corpus.language == Language.DE:
        lan = "de"
        stop_list = stopwords.words('german')
    else:
        lan = "en"
        stop_list = stopwords.words('english')
    # extend the stoplist once, not once per document
    stop_list += list(string.punctuation)
    stop_list += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating MultipartiteRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")
        # 3. select the longest sequences of nouns and adjectives, excluding
        #    stoplisted words, as candidates.
        extractor.candidate_selection(pos=pos, stoplist=stop_list)
        # 4. build the Multipartite graph and rank candidates using random walk;
        #    alpha controls the weight adjustment mechanism, see TopicRank for
        #    threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.74,
                                      method='average')
        # 5. get the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.MULTIPARTITE_RANK_PKE)
def tfidf_pke(cls, corpus: Corpus):
    stop_list = list(string.punctuation)
    # 1. create a TfIdf extractor.
    extractor = pke.unsupervised.TfIdf()
    # must link spacy languages to language code
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating TF-IDF PKE"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text,
                                language=lan,
                                normalization="lemmatization")
        # 3. select {1-3}-grams not containing punctuation marks as candidates.
        extractor.candidate_selection(n=3, stoplist=stop_list)
        # 4. weight the candidates using tf x idf; a document frequency file can
        #    be precomputed over the collection and passed in:
        # pke.compute_document_frequency(input_dir='/path/to/collection/of/documents/',
        #                                output_file='output.tsv.gz',
        #                                extension='xml',
        #                                language='en',
        #                                normalization="lemmatization",
        #                                stoplist=stop_list)
        # df = pke.load_document_frequency_file(input_file='output.tsv.gz')
        # extractor.candidate_weighting(df=df)
        extractor.candidate_weighting()
        # 5. get the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TFIDF_PKE)
def main():
    # load configuration parameters from config file
    config = ConfigLoader.get_config()

    # corpus = Corpus(source=config["corpora"]["abstract_corpus"], language=Language.EN, name="abstract_corpus")
    # corpus = Corpus(source=config["corpora"]["bundestag_corpus"], language=Language.DE, name="bundestag_corpus")
    corpus = Corpus(source=config["corpora"]["sustainability_corpus"],
                    language=Language.EN,
                    name="sustainability_corpus")

    # print(len(corpus))
    # test = DocumentsFilter.filter(corpus, has_tags=['test'])
    # print(set([x.tags for x in test]))
    # print(len(test))
    # exit(0)

    corpus = corpus.get_n_documents_as_corpus(n=100)

    # build yearwise pseudo documents
    pseudo_corpus = corpus.year_wise_pseudo_documents()

    # extract keywords
    KeyPhraseExtractor.tfidf_skl(corpus=pseudo_corpus)
    print([d.keywords for d in pseudo_corpus.get_documents()])

    KeyPhraseExtractor.rake(corpus=corpus)
    print([d.keywords for d in corpus.get_documents()])

    # key_words_post = Document.group_keywords_year_wise(corpus)
    # key_words_pre = Document.transform_pseudo_docs_keywords_to_dict(KeyPhraseExtractor.rake(documents=pseudo_corpus))
    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_post_group, 10))
    # print(KeyPhraseExtractor.get_top_k_keywords(key_words_pre_group, 10))
    # format: {year -> list of keywords}

    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])
    counter = 0
    for doc in corpus.get_documents():
        for keyword in doc.keywords:
            if counter > 100:
                break
            kwt.translate(keyword)
            print(keyword)
            counter += 1
        break

    print('extracting keywords with rake ...')
    rake_keywords = KeyPhraseExtractor.rake(corpus=corpus.get_documents()[0])
    rake_keywords_keys = list(rake_keywords.keys())
    print('rake keywords dict keys:', rake_keywords_keys)

    kwt = KeywordTranslator(cache_file=config["translator"]["cache_file"])
    list_of_keywords = []
    for k in rake_keywords[rake_keywords_keys[0]]:
        kw = Keyword(german_translation=k, keyword_type=KeywordType.RAKE)
        kwt.translate(kw)
        list_of_keywords.append(kw)
        print('{} \t {} \t\t\t {}'.format(kw.source_language,
                                          kw.english_translation,
                                          kw.german_translation))
def remove_punctuation(corpus: Corpus):
    for d in corpus.get_documents():
        res = re.sub(r"[^a-zA-ZäöüÖÄÜß\-\s.!?]", '', d.text)
        res = re.sub(r" +", ' ', res)
        d.text = res
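# Small self-check for the regex used in remove_punctuation (an illustrative
# assumption, not original test code): letters including umlauts, hyphens,
# whitespace, and sentence-final punctuation survive; digits, quotes, and other
# symbols are dropped, and repeated spaces collapse to one.
def _remove_punctuation_example():
    sample = 'Die   "Energie-Wende" kostet (angeblich) 5 Mrd. Euro!'
    cleaned = re.sub(r"[^a-zA-ZäöüÖÄÜß\-\s.!?]", '', sample)
    cleaned = re.sub(r" +", ' ', cleaned)
    return cleaned  # -> 'Die Energie-Wende kostet angeblich Mrd. Euro!'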