Ejemplo n.º 1
0
def create_trigram_finder(tokenized_docs, should_filter=False):
    """Build a trigram collocation finder from tokenized documents.

    Args:
        tokenized_docs: Collection of documents handed to
            ``TrigramCollocationFinder.from_documents``.
        should_filter: When true, every document is first passed through
            ``trigram_prep``; otherwise the documents are used as given.

    Returns:
        The object returned by ``TrigramCollocationFinder.from_documents``.
    """
    documents = (
        [trigram_prep(doc) for doc in tokenized_docs]
        if should_filter
        else tokenized_docs
    )
    return TrigramCollocationFinder.from_documents(documents)
Ejemplo n.º 2
0
def get_top_trigrams(corpus, top_n=100):
    """Return the ``top_n`` best trigrams of ``corpus`` scored by raw frequency.

    Every item of ``corpus`` is split with ``.split()`` and treated as one
    tokenized document.
    """
    tokenized_docs = [text.split() for text in corpus]
    trigram_finder = TrigramCollocationFinder.from_documents(tokenized_docs)
    frequency_scorer = TrigramAssocMeasures().raw_freq
    return trigram_finder.nbest(frequency_scorer, top_n)
Ejemplo n.º 3
0
def print_top_trig_collocs(word, pd_series, tokenizer, frac_corpus = 0.1, stopwords = gen_stop_words):
    """Print every top-ranked trigram collocation that contains ``word``.

    Args:
        word: Token to look for inside each trigram tuple.
        pd_series: Object exposing ``to_list()`` and ``len()``; each element
            is tokenized as one document.
        tokenizer: Object whose ``tokenize`` method turns an element into
            a token sequence.
        frac_corpus: Multiplied by ``len(pd_series)`` and rounded to give the
            minimum frequency passed to ``apply_freq_filter``.
        stopwords: Accepted for interface compatibility; not used by the body.

    Returns:
        None. Matching trigrams are printed, one per line.
    """
    tokenized_docs = [tokenizer.tokenize(entry) for entry in pd_series.to_list()]
    finder = TrigramCollocationFinder.from_documents(tokenized_docs)

    min_count = round(frac_corpus * len(pd_series))
    finder.apply_freq_filter(min_count)

    # At most 100000 trigrams, scored by likelihood ratio, are inspected.
    ranked = finder.nbest(trigram_measures.likelihood_ratio, 100000)
    for candidate in ranked:
        if word not in candidate:
            continue
        print(candidate)
Ejemplo n.º 4
0
def retrieve_top_trigrams_collocations(corpus, top=5, measure='pmi'):
    """Return the ``top`` best trigrams of ``corpus`` under the chosen measure.

    Args:
        corpus: Iterable of items; each is split with ``.split()`` into one
            tokenized document.
        top: Number of trigrams requested from ``nbest``.
        measure: ``'pmi'`` to score with ``TrigramAssocMeasures.pmi`` or
            ``'frequency'`` to score with ``TrigramAssocMeasures.raw_freq``.

    Returns:
        The result of ``finder.nbest`` for the selected scoring function.

    Raises:
        ValueError: If ``measure`` is neither ``'pmi'`` nor ``'frequency'``.
            The finder is built before this check is made.
    """
    tokenized_docs = [text.split() for text in corpus]
    finder = TrigramCollocationFinder.from_documents(tokenized_docs)
    measures = TrigramAssocMeasures()

    if measure == 'pmi':
        return finder.nbest(measures.pmi, top)
    if measure == 'frequency':
        return finder.nbest(measures.raw_freq, top)
    raise ValueError('Type of measure is unknown!')
Ejemplo n.º 5
0
def compute_ngrams_count(text_corpus, out_p, n=20):
    """Count the most common 1- to 4-grams of a corpus and plot them.

    Args:
        text_corpus: Iterable of documents, each an iterable of sentences
            that are tokenized with ``word_tokenize``.
        out_p: Passed through unchanged to ``util.plot_bar_chart_grid``.
        n: Number of most common n-grams kept for each n-gram order.

    Returns:
        None. The counts are handed to ``util.plot_bar_chart_grid``.

    Note:
        Stopwords and the custom punctuation tokens are removed only for the
        unigram counts; bigrams, trigrams and quadgrams are built from the
        unfiltered sentence tokens.
    """
    print("Compute ngrams count...")
    list_of_tokens = [
        word_tokenize(sentence)
        for document in text_corpus
        for sentence in document
    ]

    # Unigram
    tokens = util.flatten_one_level(list_of_tokens)
    custom_sw = [".", "[", "]", ","]
    # A set gives constant-time membership tests in the per-token filter below.
    sw = set(stopwords.words("english") + custom_sw)
    tokens = [w for w in tokens if w not in sw]
    word_fd = FreqDist(tokens)
    uni_mc = word_fd.most_common(n)

    # Bigram
    bi = BigramCollocationFinder.from_documents(list_of_tokens)
    bi_mc = bi.ngram_fd.most_common(n)

    # Trigram
    tri = TrigramCollocationFinder.from_documents(list_of_tokens)
    tri_mc = tri.ngram_fd.most_common(n)

    # Quadgram
    quad = QuadgramCollocationFinder.from_documents(list_of_tokens)
    quad_mc = quad.ngram_fd.most_common(n)

    # Plot
    data = [uni_mc, bi_mc, tri_mc, quad_mc]
    x = []
    y = []
    for i, most_common in enumerate(data):
        if i == 0:
            # Unigram keys are plain tokens, not tuples.
            labels = [ngram for ngram, _ in most_common]
        else:
            labels = [" ".join(ngram) for ngram, _ in most_common]
        counts = [count for _, count in most_common]
        # Reverse so the least common entry comes first in each series.
        x.append(labels[::-1])
        y.append(counts[::-1])
    title = ["Unigram", "Bigram", "Trigram", "Quadgram"]
    sup_title = "ngrams count"
    util.plot_bar_chart_grid(x, y, 1, len(data), title, sup_title, out_p, sup_title_font_size=16,
        tick_font_size=14, title_font_size=14, h_size=5, w_size=5, rotate=True)
# Script fragment: `get_top_ngrams` and `norm_alice` are not defined here and
# must already be in scope. The bare call and the bare `nbest(...)` expressions
# below do not assign or print their results.
get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10)

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

# Each item of norm_alice is split with .split() into one tokenized document.
finder = BigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
bigram_measures = BigramAssocMeasures()
# Top 10 bigrams by raw frequency, then by PMI.
finder.nbest(bigram_measures.raw_freq, 10)
finder.nbest(bigram_measures.pmi, 10)

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

# `finder` is rebound here; the bigram finder above is no longer reachable.
finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
trigram_measures = TrigramAssocMeasures()
# Top 10 trigrams by raw frequency, then by PMI.
finder.nbest(trigram_measures.raw_freq, 10)
finder.nbest(trigram_measures.pmi, 10)

toy_text = """
Elephants are large mammals of the family Elephantidae 
and the order Proboscidea. Two species are traditionally recognised, 
the African elephant and the Asian elephant. Elephants are scattered 
throughout sub-Saharan Africa, South Asia, and Southeast Asia. Male 
African elephants are the largest extant terrestrial animals. All 
elephants have a long trunk used for many purposes, 
particularly breathing, lifting water and grasping objects. Their 
incisors grow into tusks, which can serve as weapons and as tools 
for moving objects and digging. Elephants' large ear flaps help 
to control their body temperature. Their pillar-like legs can 
Ejemplo n.º 7
0
def find_trigrams(sentences, n_ngrams):
    """Return the ``n_ngrams`` best trigrams scored by likelihood ratio.

    ``sentences`` is passed directly to
    ``TrigramCollocationFinder.from_documents``.
    """
    finder = TrigramCollocationFinder.from_documents(sentences)
    return finder.nbest(TrigramAssocMeasures.likelihood_ratio, n_ngrams)
Ejemplo n.º 8
0
# Script fragment: `get_data` is not defined here and must already be in scope.
corpus, category = get_data()

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

# Each corpus item is split with .split() into one tokenized document.
finder = BigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
bigram_measures = BigramAssocMeasures()

# Single-argument print(...) prints the same on Python 2 and Python 3.
print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

# `finder` is rebound here to the trigram finder.
finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
trigram_measures = TrigramAssocMeasures()

print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print get_top_ngrams(corpus, ngram_val=2, limit=10)
from feature_extractors import build_feature_matrix
import networkx
import numpy as np
import matplotlib
from normalization import normalize_corpus

norm = normalize_corpus(corpus)
# construct weighted document term matrix
vec, dt_matrix = build_feature_matrix(norm, feature_type='tfidf')
# Script fragment: `norm_alice` is not defined here and must already be in
# scope. The bare `nbest(...)` expressions do not assign or print their results.
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

# Each item of norm_alice is split with .split() into one tokenized document.
finder = BigramCollocationFinder.from_documents([item.split() 
                                                for item 
                                                in norm_alice])
bigram_measures = BigramAssocMeasures()                                                
# Top 10 bigrams by raw frequency, then by PMI.
finder.nbest(bigram_measures.raw_freq, 10)
finder.nbest(bigram_measures.pmi, 10)   

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

# `finder` is rebound here; the bigram finder above is no longer reachable.
finder = TrigramCollocationFinder.from_documents([item.split() 
                                                for item 
                                                in norm_alice])
trigram_measures = TrigramAssocMeasures()                                                
# Top 10 trigrams by raw frequency, then by PMI.
finder.nbest(trigram_measures.raw_freq, 10)
finder.nbest(trigram_measures.pmi, 10)  


toy_text = """
Elephants are large mammals of the family Elephantidae 
and the order Proboscidea. Two species are traditionally recognised, 
the African elephant and the Asian elephant. Elephants are scattered 
throughout sub-Saharan Africa, South Asia, and Southeast Asia. Male 
African elephants are the largest extant terrestrial animals. All 
elephants have a long trunk used for many purposes, 
particularly breathing, lifting water and grasping objects. Their 
incisors grow into tusks, which can serve as weapons and as tools 
Ejemplo n.º 10
0
def _top_entries(tuples, num, token_filter, min_chars, max_chars):
    """Rank ``tuples`` with ``df_top`` and return its ``'entry'`` column as a list.

    Returns an empty list when ``df_top`` returns ``None``.
    """
    frame = df_top(tuples=tuples,
                   num=num,
                   token_filter=token_filter,
                   min_char=min_chars,
                   max_char=max_chars)
    if frame is None:
        return []
    return frame['entry'].tolist()


def process(text: str,
            num_1_grams: int = 100,
            num_2_grams: int = 100,
            num_3_grams: int = 100,
            num_4_grams: int = 100,
            min_chars: int = 3,
            max_chars: int = 30) -> list:
    """Extract keywords from text sources.

    Args:
        text: Raw text, split into sentences by ``get_sentences``.
        num_1_grams: Number of 1-grams requested from ``df_top``; a falsy
            value skips 1-grams entirely.
        num_2_grams: Same, for 2-grams.
        num_3_grams: Same, for 3-grams.
        num_4_grams: Same, for 4-grams.
        min_chars: Forwarded to ``df_top`` as ``min_char``.
        max_chars: Forwarded to ``df_top`` as ``max_char``.

    Returns:
        A list of four lists, holding the selected 1-, 2-, 3- and 4-gram
        entries in that order. A slot is empty when its n-gram order was
        skipped or ``df_top`` returned ``None``.

    Raises:
        AssertionError: If no sentences remain after de-duplication (only
            when assertions are enabled).
    """

    # Find all sentences in the text
    sents = get_sentences(text)

    # Drop every sentence that occurs more than once; all copies are removed,
    # not just the repeats.
    sent_counter = collections.Counter(sents)
    sents = [sent for sent in sents if sent_counter[sent] == 1]

    # Tokenize each sentence
    sents = [RE_TOKEN.split(sent) for sent in sents]

    # Filter out non-alphabetic tokens and convert to lowercase
    sents = [[token.lower() for token in sent if is_alpha(token)]
             for sent in sents]

    # We look at two variants of the input sentences
    # a. For 1-grams, we remove all stopwords, short tokens, and possessives
    # b. For 2-grams and longer, we want to keep stopwords and short tokens as
    #    these might provide some information in relation to other words
    sents_a = [[RE_POSS.sub('', t) for t in sent if filter_token(t)]
               for sent in sents]
    sents_b = sents

    # Both lists have one entry per sentence, so this checks that at least one
    # sentence survived, not that any tokens did.
    assert len(sents_a) > 0 and len(sents_b), 'Not enough words'

    counter = collections.Counter(
        token for sent in sents_a for token in sent)

    res = [[], [], [], []]

    if num_1_grams:
        # Represent tokens using a tuple with only one element to match the
        # format of the other ngrams with n > 1
        tuples = [((token, ), count) for token, count in counter.items()]
        res[0] = _top_entries(tuples, num_1_grams, filter_1_grams,
                              min_chars, max_chars)

    if num_2_grams:
        bigrams = BigramCollocationFinder.from_documents(sents_b)
        tuples = bigrams.score_ngrams(BigramAssocMeasures.raw_freq)
        res[1] = _top_entries(tuples, num_2_grams, filter_2_grams,
                              min_chars, max_chars)

    if num_3_grams:
        trigrams = TrigramCollocationFinder.from_documents(sents_b)
        tuples = trigrams.score_ngrams(TrigramAssocMeasures.raw_freq)
        res[2] = _top_entries(tuples, num_3_grams, filter_3_grams,
                              min_chars, max_chars)

    if num_4_grams:
        quadgrams = QuadgramCollocationFinder.from_documents(sents_b)
        tuples = quadgrams.score_ngrams(QuadgramAssocMeasures.raw_freq)
        res[3] = _top_entries(tuples, num_4_grams, filter_4_grams,
                              min_chars, max_chars)

    return res