from collections import defaultdict

import spacy.lang.en.stop_words
from lemmagen3 import Lemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


def extract_keywords(target_word, word_clustered_data, max_df, topn):
    lemmatizer = Lemmatizer('en')

    # group sentences by cluster label and join each cluster into one document
    l_sent_clust_dict = defaultdict(list)
    sent_clust_dict = defaultdict(list)
    for i, row in word_clustered_data.iterrows():
        l_sent_clust_dict[row['label']].append(row['sentence'])
    for label, sents in l_sent_clust_dict.items():
        sent_clust_dict[label] = " ".join(sents)

    # combine the spaCy and NLTK English stop word lists
    stop1 = list(spacy.lang.en.stop_words.STOP_WORDS)
    stop2 = stopwords.words('english')
    stop = list(set(stop1 + stop2))

    labels, clusters = list(sent_clust_dict.keys()), list(sent_clust_dict.values())

    # fit a unigram/bigram tf-idf model over all cluster documents
    tfidf_transformer = TfidfVectorizer(smooth_idf=True,
                                        use_idf=True,
                                        ngram_range=(1, 2),
                                        max_df=max_df,
                                        stop_words=stop,
                                        max_features=10000)
    tfidf_transformer.fit(clusters)
    # get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2
    feature_names = tfidf_transformer.get_feature_names_out()

    keyword_clusters = {}
    for label, cluster in zip(labels, clusters):
        # generate the tf-idf vector for this cluster
        tf_idf_vector = tfidf_transformer.transform([cluster])

        # sort the tf-idf entries by descending score
        coo = tf_idf_vector.tocoo()
        tuples = zip(coo.col, coo.data)
        sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

        # extract more candidates than needed so that filtering still leaves topn;
        # extract_topn_from_vector is a helper assumed to be defined elsewhere
        # (a sketch is given below)
        keywords = extract_topn_from_vector(feature_names, sorted_items, topn * 5)
        keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)
        keywords = [x[0] for x in keywords]

        # filter unigrams that appear in bigrams and remove duplicates
        all_bigrams = " ".join([kw for kw in keywords if len(kw.split()) == 2])
        already_in = set()
        filtered_keywords = []
        for kw in keywords:
            if len(kw.split()) == 1 and kw in all_bigrams:
                continue
            if len(kw.split()) == 1:
                kw = lemmatizer.lemmatize(kw)
            if kw not in already_in and kw != target_word:
                filtered_keywords.append(kw)
                already_in.add(kw)

        keyword_clusters[label] = filtered_keywords[:topn]

    return keyword_clusters
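The helper extract_topn_from_vector is not shown above; the sketch below gives one plausible implementation (mapping the highest-scoring tf-idf indices back to feature names) together with a hypothetical call to extract_keywords. The toy DataFrame, its 'sentence'/'label' columns, and the parameter values are illustrative assumptions, not part of the original code.

# A minimal sketch, assuming extract_topn_from_vector simply maps the
# top-scoring tf-idf indices back to their feature names; the real helper
# may differ.
import pandas as pd


def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Return {feature_name: score} for the topn highest-scoring items."""
    results = {}
    for idx, score in sorted_items[:topn]:
        results[feature_names[idx]] = round(score, 3)
    return results


# Hypothetical input: one row per sentence, with the cluster id in 'label'.
# (NLTK stop words must be available: nltk.download('stopwords'))
word_clustered_data = pd.DataFrame({
    'sentence': ["The mouse chased the cheese.",
                 "A wireless mouse needs batteries.",
                 "The mouse hid from the cat."],
    'label': [0, 1, 0],
})
keyword_clusters = extract_keywords('mouse', word_clustered_data,
                                    max_df=0.8, topn=5)
print(keyword_clusters)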
from typing import Callable

from lemmagen3 import Lemmatizer

# imports as in Orange3-text's preprocessing module (paths may vary by version)
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import BaseNormalizer


class LemmagenLemmatizer(BaseNormalizer):
    name = 'Lemmagen Lemmatizer'
    lemmagen_languages = {
        "Bulgarian": "bg",
        "Croatian": "hr",
        "Czech": "cs",
        "English": "en",
        "Estonian": "et",
        "Farsi/Persian": "fa",
        "French": "fr",
        "German": "de",
        "Hungarian": "hu",
        "Italian": "it",
        "Macedonian": "mk",
        "Polish": "pl",
        "Romanian": "ro",
        "Russian": "ru",
        "Serbian": "sr",
        "Slovak": "sk",
        "Slovenian": "sl",
        "Spanish": "es",
        "Ukrainian": "uk"
    }

    def __init__(self, language='English'):
        super().__init__()
        self.language = language
        self.lemmatizer = None

    def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
        # the lemmagen3 lemmatizer is not picklable, so create it on call
        # and discard it afterwards
        self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language])
        output_corpus = super().__call__(corpus, callback)
        self.lemmatizer = None
        return output_corpus

    def normalizer(self, token):
        assert self.lemmatizer is not None
        t = self.lemmatizer.lemmatize(token)
        # Lemmagen sometimes returns an empty string; in that case
        # return the original token
        return t if t else token
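A minimal usage sketch for the class above, assuming the Orange3-text API (orangecontrib.text): Corpus.from_file, WordPunctTokenizer, and the bundled 'book-excerpts' dataset are assumptions based on that library, not part of the snippet.

# A usage sketch, assuming the Orange3-text API; dataset name and
# tokenizer choice are illustrative.
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import WordPunctTokenizer

corpus = Corpus.from_file('book-excerpts')

# tokenize first, then replace each token with its lemma
corpus = WordPunctTokenizer()(corpus)
lemmatized = LemmagenLemmatizer(language='English')(corpus)

print(lemmatized.tokens[0][:10])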
from lemmagen3 import Lemmatizer

print(Lemmatizer.list_supported_languages())

a = Lemmatizer('en')
word = 'cats'
print('{}->{}'.format(word, a.lemmatize(word)))

b = Lemmatizer('sl')
word = 'ljudje'
print('{}->{}'.format(word, b.lemmatize(word)))
from lemmagen3 import Lemmatizer

# first, list all supported languages
print(Lemmatizer.list_supported_languages())

# then, create a few lemmatizer objects using ISO 639-1 language codes
# (English, Slovene and Russian)
lem_en = Lemmatizer('en')
lem_sl = Lemmatizer('sl')
lem_ru = Lemmatizer('ru')

# now lemmatize one word in each language
print(lem_en.lemmatize('cats'))
print(lem_sl.lemmatize('je'))
print(lem_ru.lemmatize('коты'))