def tfidf_skl(cls, corpus: Corpus):
    if corpus.language == Language.EN:
        stop_words = stopwords.words("english")
    elif corpus.language == Language.DE:
        stop_words = stopwords.words("german")
    else:
        raise ValueError("No stopwords for language!")
    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
                                       ngram_range=(cls.min_ngram, cls.max_ngram),
                                       min_df=2)
    documents = corpus.get_documents(as_list=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform(
        [document.text for document in documents])
    doc_id_lookup = {i: document.doc_id
                     for i, document in enumerate(documents)}
    # get_feature_names_out() on scikit-learn >= 1.0
    features = tfidf_vectorizer.get_feature_names()
    keywords = {}
    for i, doc in tqdm(enumerate(tfidf_matrix), desc="Calculating tf-idf",
                       total=tfidf_matrix.shape[0]):
        # densify one document column, rank its terms by tf-idf weight
        df = pd.DataFrame(doc.T.todense(), index=features, columns=["tfidf"])
        top_key_words = df.sort_values(by=["tfidf"], ascending=False)[:cls.top_k]
        keywords[doc_id_lookup[i]] = list(top_key_words.index)
    corpus.assign_keywords(keywords=keywords, keyword_type=KeywordType.TFIDF_SKL)

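# A leaner per-document top-k is possible without building a pandas DataFrame
# per row — a sketch, assuming scikit-learn >= 1.0 (get_feature_names_out);
# `texts` is a hypothetical list of document strings:
#
#   import numpy as np
#   from sklearn.feature_extraction.text import TfidfVectorizer
#
#   vec = TfidfVectorizer()
#   matrix = vec.fit_transform(texts)
#   terms = np.array(vec.get_feature_names_out())
#   row = matrix.getrow(0).toarray().ravel()   # densify one row, not the matrix
#   top_terms = terms[np.argsort(row)[::-1][:10]]
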
def single_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # 1. create a SingleRank extractor.
    extractor = pke.unsupervised.SingleRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating SingleRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text, language=lan,
                                normalization="lemmatization")
        # 3. select the longest sequences of nouns and adjectives as candidates.
        extractor.candidate_selection(pos=pos)
        # 4. weight the candidates using the sum of their words' scores, which
        #    are computed via random walk. In the graph, nodes are words of
        #    certain parts of speech (nouns and adjectives) that are connected
        #    if they occur in a window of 10 words.
        extractor.candidate_weighting(window=10, pos=pos)
        # 5. keep the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.SINGLE_RANK_PKE)

def text_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # 1. create a TextRank extractor.
    extractor = pke.unsupervised.TextRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating TextRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text, language=lan,
                                normalization="lemmatization")
        # 3. build the graph representation of the document and rank the words.
        #    Keyphrase candidates are composed from the 33-percent
        #    highest-ranked words.
        extractor.candidate_weighting(window=2, pos=pos, top_percent=0.33)
        # 4. keep the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TEXT_RANK_PKE)

def topical_page_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # define the grammar for selecting the keyphrase candidates
    grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
    # 1. create a TopicalPageRank extractor.
    extractor = pke.unsupervised.TopicalPageRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating Topical PageRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text, language=lan,
                                normalization="lemmatization")
        # 3. select the noun phrases as keyphrase candidates.
        extractor.candidate_selection(grammar=grammar)
        # 4. weight the keyphrase candidates using Single Topical PageRank.
        #    Builds a word graph in which edges connecting two words occurring
        #    in a window are weighted by co-occurrence counts.
        extractor.candidate_weighting(window=10, pos=pos,
                                      lda_model='path/to/lda_model')  # todo: find model
        # 5. keep the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TOPICAL_PAGE_RANK_PKE)

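# The `lda_model` path above is still a placeholder (see the todo). pke can
# train one itself; a sketch under the assumption that pke.utils.compute_lda_model
# has roughly this signature in the installed version (verify before relying on it):
#
#   from pke.utils import compute_lda_model
#   compute_lda_model(input_dir='path/to/collection/',   # hypothetical path
#                     output_file='lda_model.pickle.gz',
#                     n_topics=500, extension='xml',
#                     language='en', normalization='lemmatization')
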
def position_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # define the grammar for selecting the keyphrase candidates
    grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
    # 1. create a PositionRank extractor.
    extractor = pke.unsupervised.PositionRank()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating PositionRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text, language=lan,
                                normalization="lemmatization")
        # 3. select noun phrases of up to 3 words as keyphrase candidates.
        extractor.candidate_selection(grammar=grammar, maximum_word_number=3)
        # 4. weight the candidates using the sum of their words' scores, which
        #    are computed via a random walk biased by the position of the words
        #    in the document. In the graph, nodes are words (nouns and
        #    adjectives only) that are connected if they occur in a window of
        #    10 words.
        extractor.candidate_weighting(window=10, pos=pos)
        # 5. keep the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.POSITION_RANK_PKE)

def yake_pke(cls, corpus: Corpus):
    # 1. create a YAKE extractor.
    extractor = pke.unsupervised.YAKE()
    if corpus.language == Language.DE:
        lan = "de"
        stop_list = stopwords.words('german')
    else:
        lan = "en"
        stop_list = stopwords.words('english')
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating YAKE"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text, language=lan,
                                normalization="lemmatization")
        # 3. select {1-3}-grams not containing punctuation marks and not
        #    beginning/ending with a stopword as candidates.
        extractor.candidate_selection(n=3, stoplist=stop_list)
        # 4. weight the candidates using the YAKE weighting scheme; a window
        #    (in words) for computing left/right contexts can be specified.
        window = 2
        extractor.candidate_weighting(window=window, stoplist=stop_list,
                                      use_stems=True)
        # 5. get the top_k highest-scored candidates as keyphrases.
        #    Redundant keyphrases are removed from the output using Levenshtein
        #    distance and a threshold.
        threshold = 0.8
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k,
                                                         threshold=threshold)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.YAKE_PKE)

def multipartite_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # 1. create a MultipartiteRank extractor.
    extractor = pke.unsupervised.MultipartiteRank()
    if corpus.language == Language.DE:
        lan = "de"
        stop_list = stopwords.words('german')
    else:
        lan = "en"
        stop_list = stopwords.words('english')
    # extend the stoplist once, outside the loop, so it does not grow on
    # every iteration
    stop_list += list(string.punctuation)
    stop_list += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating MultipartiteRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text, language=lan,
                                normalization="lemmatization")
        # 3. select the candidates, filtered by the stoplist.
        extractor.candidate_selection(pos=pos, stoplist=stop_list)
        # 4. build the multipartite graph and rank candidates using random
        #    walk; alpha controls the weight adjustment mechanism, see
        #    TopicRank for the threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1, threshold=0.74,
                                      method='average')
        # 5. keep the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.MULTIPARTITE_RANK_PKE)

def topic_rank_pke(cls, corpus: Corpus):
    # define the set of valid parts of speech
    pos = {'NOUN', 'PROPN', 'ADJ'}
    # 1. create a TopicRank extractor.
    extractor = pke.unsupervised.TopicRank()
    if corpus.language == Language.DE:
        lan = "de"
        stop_list = stopwords.words('german')
    else:
        lan = "en"
        stop_list = stopwords.words('english')
    # extend the stoplist once, outside the loop, so it does not grow on
    # every iteration
    stop_list += list(string.punctuation)
    stop_list += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating TopicRank"):
        # 2. load the content of the document.
        extractor.load_document(input=document.text, language=lan,
                                normalization="lemmatization")
        # 3. select the candidates, filtered by the stoplist.
        extractor.candidate_selection(pos=pos, stoplist=stop_list)
        # 4. build topics by grouping candidates with HAC (average linkage,
        #    threshold of 1/4 of shared stems). Weight the topics using random
        #    walk, and select the first occurring candidate from each topic.
        extractor.candidate_weighting(threshold=0.74, method='average')
        # 5. keep the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TOPIC_RANK_PKE)

def tfidf_pke(cls, corpus: Corpus):
    stop_list = list(string.punctuation)
    # 1. create a TfIdf extractor.
    extractor = pke.unsupervised.TfIdf()
    if corpus.language == Language.DE:
        lan = "de"
    else:
        lan = "en"
    keywords = {}
    for document in tqdm(corpus.get_documents(as_list=True),
                         desc="Calculating TF-IDF PKE"):
        # 2. load the content of the document.
        #    must link spaCy language models to the language code
        extractor.load_document(input=document.text, language=lan,
                                normalization="lemmatization")
        # 3. select {1-3}-grams not containing punctuation marks as candidates.
        extractor.candidate_selection(n=3, stoplist=stop_list)
        # 4. weight the candidates using tf x idf. Document frequencies for
        #    this corpus could be precomputed and passed in, e.g.:
        #       pke.compute_document_frequency(
        #           input_dir='/path/to/collection/of/documents/',
        #           output_file='output.tsv.gz', extension='xml',
        #           language=lan, normalization="lemmatization",
        #           stoplist=stop_list)
        #       df = pke.load_document_frequency_file(input_file='output.tsv.gz')
        #       extractor.candidate_weighting(df=df)
        #    Without a df argument, pke falls back to its bundled default counts.
        extractor.candidate_weighting()
        # 5. keep the top_k highest-scored candidates as keyphrases.
        keywords[document.doc_id] = extractor.get_n_best(n=cls.top_k)
    corpus.assign_keywords(keywords=keywords,
                           keyword_type=KeywordType.TFIDF_PKE)

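# Hypothetical usage of the extractors above. Corpus, Language, and KeywordType
# come from this project; the enclosing class name `KeywordExtractor`, the
# Corpus construction, and the `.keywords` accessor are assumptions for
# illustration only:
#
#   corpus = Corpus(...)                  # project-specific loading
#   KeywordExtractor.tfidf_skl(corpus)    # assumed enclosing class name
#   KeywordExtractor.yake_pke(corpus)
#   first_doc = corpus.get_documents(as_list=True)[0]
#   print(first_doc.keywords)             # hypothetical accessor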