from collections import defaultdict

from lemmagen3 import Lemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS


def extract_keywords(target_word, word_clustered_data, max_df, topn):
    lemmatizer = Lemmatizer('en')
    l_sent_clust_dict = defaultdict(list)
    sent_clust_dict = defaultdict(list)
    for i, row in word_clustered_data.iterrows():
        l_sent_clust_dict[row['label']].append(row['sentence'])

    for label, sents in l_sent_clust_dict.items():
        sent_clust_dict[label] = " ".join(sents)

    stop1 = list(STOP_WORDS)
    stop2 = stopwords.words('english')
    stop = list(set(stop1 + stop2))

    labels, clusters = list(sent_clust_dict.keys()), list(
        sent_clust_dict.values())

    tfidf_transformer = TfidfVectorizer(smooth_idf=True,
                                        use_idf=True,
                                        ngram_range=(1, 2),
                                        max_df=max_df,
                                        stop_words=stop,
                                        max_features=10000)
    tfidf_transformer.fit(clusters)
    feature_names = tfidf_transformer.get_feature_names_out()

    keyword_clusters = {}
    for label, cluster in zip(labels, clusters):
        # build the tf-idf vector for this cluster's concatenated sentences
        tf_idf_vector = tfidf_transformer.transform([cluster])
        # sort the tf-idf entries by descending score
        coo = tf_idf_vector.tocoo()
        sorted_items = sorted(zip(coo.col, coo.data),
                              key=lambda x: (x[1], x[0]), reverse=True)
        # over-fetch candidates so the filtering below still leaves enough;
        # extract_topn_from_vector is a helper assumed to be defined
        # elsewhere in this project
        keywords = extract_topn_from_vector(feature_names, sorted_items,
                                            topn * 5)
        keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)
        keywords = [x[0] for x in keywords]
        # filter out unigrams that also appear inside a bigram (substring
        # check against the joined bigram string) and remove duplicates
        all_bigrams = " ".join([kw for kw in keywords if len(kw.split()) == 2])
        already_in = set()
        filtered_keywords = []
        for kw in keywords:
            if len(kw.split()) == 1 and kw in all_bigrams:
                continue
            if len(kw.split()) == 1:
                kw = lemmatizer.lemmatize(kw)
            if kw not in already_in and kw != target_word:
                filtered_keywords.append(kw)
                already_in.add(kw)

        keyword_clusters[label] = filtered_keywords[:topn]

    return keyword_clusters
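
# Usage sketch (illustrative, not from the original source): assumes pandas
# is available and that the extract_topn_from_vector helper is defined.
import pandas as pd

# hypothetical toy input: one sentence per row, tagged with a cluster label
df = pd.DataFrame({
    'label': [0, 0, 1],
    'sentence': [
        'The bank approved the loan application.',
        'Interest rates at the bank rose again.',
        'We sat on the grassy bank of the river.',
    ],
})

# top 3 keywords per cluster; max_df=0.95 drops terms shared by nearly all clusters
keyword_clusters = extract_keywords('bank', df, max_df=0.95, topn=3)
print(keyword_clusters)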
Example #2
from typing import Callable

from lemmagen3 import Lemmatizer

# BaseNormalizer and Corpus come from the Orange3 text add-on; the exact
# import paths below are an assumption and may vary between versions
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess.normalize import BaseNormalizer


class LemmagenLemmatizer(BaseNormalizer):
    name = 'Lemmagen Lemmatizer'
    lemmagen_languages = {
        "Bulgarian": "bg",
        "Croatian": "hr",
        "Czech": "cs",
        "English": "en",
        "Estonian": "et",
        "Farsi/Persian": "fa",
        "French": "fr",
        "German": "de",
        "Hungarian": "hu",
        "Italian": "it",
        "Macedonian": "mk",
        "Polish": "pl",
        "Romanian": "ro",
        "Russian": "ru",
        "Serbian": "sr",
        "Slovak": "sk",
        "Slovenian": "sl",
        "Spanish": "es",
        "Ukrainian": "uk"
    }

    def __init__(self, language='English'):
        super().__init__()
        self.language = language
        self.lemmatizer = None

    def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
        # the lemmagen3 Lemmatizer is not picklable, so create it per call
        # and discard it afterwards
        self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language])
        output_corpus = super().__call__(corpus, callback)
        self.lemmatizer = None
        return output_corpus

    def normalizer(self, token):
        assert self.lemmatizer is not None
        t = self.lemmatizer.lemmatize(token)
        # Lemmagen sometimes returns an empty string; fall back to the
        # original token in that case
        return t if t else token
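
# The per-call construction above exists because the native Lemmatizer
# object cannot be serialized. A minimal sketch of the limitation
# (assuming lemmagen3 is installed; the exact exception type may vary):
import pickle

try:
    pickle.dumps(Lemmatizer('en'))
except Exception as err:
    print('not picklable:', err)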
Example #3
from lemmagen3 import Lemmatizer

# print the supported ISO 639-1 language codes
print(Lemmatizer.list_supported_languages())

# English
a = Lemmatizer('en')
word = 'cats'
print('{}->{}'.format(word, a.lemmatize(word)))

# Slovene
b = Lemmatizer('sl')
word = 'ljudje'
print('{}->{}'.format(word, b.lemmatize(word)))
Example #4
from lemmagen3 import Lemmatizer

# first, list all supported languages
print(Lemmatizer.list_supported_languages())

# then, create a few lemmatizer objects using ISO 639-1 language codes
# (English, Slovene, and Russian)

lem_en = Lemmatizer('en')
lem_sl = Lemmatizer('sl')
lem_ru = Lemmatizer('ru')

# now lemmatize one word in each of the three languages
print(lem_en.lemmatize('cats'))
print(lem_sl.lemmatize('je'))
print(lem_ru.lemmatize('коты'))
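
# Since each Lemmatizer instance is bound to a single language, a small
# guard against unsupported codes can help; a minimal sketch using only the
# calls shown above (make_lemmatizer is a hypothetical helper):
def make_lemmatizer(code):
    if code not in Lemmatizer.list_supported_languages():
        raise ValueError('unsupported language code: {}'.format(code))
    return Lemmatizer(code)

print(make_lemmatizer('en').lemmatize('cats'))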