Example #1
def calc_hapax_index(text: List[str]) -> float:
    """
    Вычисление Гапакс-индекса

    Описание:
        Гапакс - слово, встретившееся в тексте только один раз
        Гапаксы того или иного автора нередко используют для атрибуции ему некоторого другого произведения,
        где встречаются такие слова

    Ссылки:
        https://ru.wikipedia.org/wiki/Гапакс
        https://en.wikipedia.org/wiki/Hapax_legomenon

    Аргументы:
        text (list[str]): Список слов

    Вывод:
        float: Значение индекса
    """
    n_words = len(text)
    n_lexemes = len(set(text))
    num = 100 * log10(n_words)
    freqs = FreqDist(text)
    hapaxes = len(freqs.hapaxes())
    den = 1 - (safe_divide(hapaxes, n_lexemes))
    hapax_index = safe_divide(num, den)
    return hapax_index
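
The snippet above relies on imports (log10, FreqDist, List) and a safe_divide helper defined elsewhere in its module. A minimal, self-contained sketch of the same Honoré-style computation, with explicit zero checks standing in for safe_divide:

from math import log10
from nltk import FreqDist

def hapax_index(tokens):
    # hapax index: 100 * log10(N) / (1 - V1/V), guarded against division by zero
    n_words = len(tokens)
    n_lexemes = len(set(tokens))
    n_hapaxes = len(FreqDist(tokens).hapaxes())
    den = 1 - (n_hapaxes / n_lexemes) if n_lexemes else 0.0
    return 100 * log10(n_words) / den if den else 0.0

print(hapax_index("to be or not to be that is the question".split()))  # 400.0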
Example #2
def hapaxes(entrada):
    '''
    Hapaxes are words that occur only once in the text.
    This function returns a list of those words.
    '''
    fdist = FreqDist(entrada)
    return fdist.hapaxes()
def load_book_features(file_name):
    with open(file_name, 'r') as file_handler:
        text = file_handler.read()

    morph = pymorphy2.MorphAnalyzer()

    sentence_list = sent_tokenize(text)

    usual_book_words = []
    sentences_length_dist = []
    words_length_dist = []
    pron_dist = []
    conj_dist = []

    for sentence in sentence_list:
        if sentence != ".":
            pron_count = 0
            conj_count = 0
            sentence_words = re.findall(r"[\w]+", sentence)
            sentences_length_dist.append(len(sentence_words))

            for word in sentence_words:
                words_length_dist.append(len(word))
                if word in NOMINATIVE_PRONOUNS:
                    pron_count += 1
                if morph.parse(word)[0].tag.POS == 'CONJ':
                    conj_count += 1
                if word not in STOPWORDS:
                    usual_book_words.append(word)

            conj_dist.append(conj_count)
            pron_dist.append(pron_count)

    sentence_length_freq_dist = FreqDist(sentences_length_dist)
    sentences_length_dist = [sentence_length_freq_dist.freq(i) for i in range(1, RANGE + 1)]
    sentences_length_dist.append(1 - sum(sentences_length_dist))

    words_length_freq_dist = FreqDist(words_length_dist)
    words_length_dist = [words_length_freq_dist.freq(i) for i in range(1, RANGE + 1)]
    words_length_dist.append(1 - sum(words_length_dist))

    pron_freq_dist = FreqDist(pron_dist)
    pron_dist = [pron_freq_dist.freq(i) for i in range(0, RANGE + 1)]
    pron_dist.append(1 - sum(pron_dist))

    conj_freq_dist = FreqDist(conj_dist)
    conj_dist = [conj_freq_dist.freq(i) for i in range(0, RANGE + 1)]
    conj_dist.append(1 - sum(conj_dist))

    words_freq_dist = FreqDist(usual_book_words)

    num_unique_words = len(words_freq_dist.keys())
    num_total_words = len(usual_book_words)

    hapax = len(words_freq_dist.hapaxes()) / num_unique_words
    dis = len([item for item in words_freq_dist if words_freq_dist[item] == 2]) / num_unique_words
    richness = num_unique_words / num_total_words

    return [hapax, dis, richness, *sentences_length_dist, *words_length_dist, *pron_dist, *conj_dist]
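
The last few lines of load_book_features reduce the word frequency distribution to three lexical features. A minimal, self-contained sketch of just that part, with a toy token list standing in for usual_book_words:

from nltk import FreqDist

tokens = "the cat sat on the mat and the dog sat too".split()
freqs = FreqDist(tokens)

num_unique_words = len(freqs)
num_total_words = len(tokens)

hapax = len(freqs.hapaxes()) / num_unique_words                    # share of words seen exactly once
dis = len([w for w in freqs if freqs[w] == 2]) / num_unique_words  # share of words seen exactly twice
richness = num_unique_words / num_total_words                      # type/token ratio

print(hapax, dis, richness)  # 0.75 0.125 0.727...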
Example #4
 def hapaxes(self, words=False, filtrate=False):
     '''Method that extracts the words that occur only once in the text'''
     if not words:
         # look them up among the lemmas
         res = self._vocab
     else:
         res = FreqDist(self.words(filtrate=filtrate))
     return res.hapaxes()
 def replaceUnique(self):
     """ Replaces unique words with the UNK label """
     word_frequencies = FreqDist([word for (word, _) in self.tagged_sents])
     self.lexicon_size = len(word_frequencies)
     hap = set(word_frequencies.hapaxes())
     res = [(UNK, tag) if word in hap else (word, tag)
            for (word, tag) in self.tagged_sents]
     self.tagged_sents = res
def processText(text):
    print("Processing...")
    text = '<s> ' + text
    text = text.replace('\r\n', ' </s>\r\n<s> ')
    text = ' </s>\r\n'.join(text.rsplit(' </s>\r\n<s> ', 1))
    textTkns = nltk.word_tokenize(text)
    textFD = FreqDist(textTkns)
    textSingles = textFD.hapaxes()
    for word in textSingles:
        text = text.replace(" " + word + " ", " <UNK> ")
    print("Done")
    return text
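
The str.replace pass in processText only matches space-delimited occurrences, so a hapax that sits next to punctuation in the raw string can be skipped. A token-level variant of the same <UNK> substitution, as a minimal sketch:

from nltk import FreqDist

tokens = "the cat sat on the mat".split()
hapaxes = set(FreqDist(tokens).hapaxes())  # set lookup keeps the membership test O(1)
replaced = ["<UNK>" if tok in hapaxes else tok for tok in tokens]
print(replaced)  # ['the', '<UNK>', '<UNK>', '<UNK>', 'the', '<UNK>']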
Example #7
def make_vocabs(normalized_data, is_pickle=True):
    """
	Fungsi untuk melakukan pembuatan bow/vocabulary

	Proses pembuatan vocabs

	vocabs ini digunakan untuk membentuk feature vector dari normalized data
	beberapa perlakukan untuk membentuk vocabs, di antarnya:
	(1) hapus hapax: kata yang hanya muncul sekali dari seluruh corpus
	(2) seleksi hanya kata kerja
	(3) hapus hapax dan gunakan hanya kata dengan panjang > 2 karakter

	return:
	all_words = vocabs/bow hasil

	paramater:
	normalized_data = data text yang sudah dilakukan preprocessing/normalisasi
	"""

    all_words = [
        word for sentence in normalized_data for word in sentence.split()
    ]

    fd = FreqDist(all_words)  # build the FreqDist object before deduplicating with set()

    all_words = list(sorted(set(all_words)))
    print('initial number of features:\t\t', len(all_words))

    # (1)
    hapaxes = fd.hapaxes()
    # all_words = [word for word in all_words if word not in hapaxes]

    # (2)
    # with open('../experiment/pos_tag_indo.pkl', 'rb') as file:
    #     jj = pickle.load(file)
    # all_words_adj = [word for word in all_words if word in jj]
    # all_words = all_words_adj

    all_words = [
        word for word in all_words if len(word) > 2 and word not in hapaxes
    ]

    file_path = os.getcwd() + '/data/dinamics/vocabs.pkl'

    if is_pickle:
        with open(file_path, 'wb') as data:
            pickle.dump(all_words, data)

    return all_words
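
A small usage sketch, assuming make_vocabs above is in scope; is_pickle=False skips writing vocabs.pkl, so no data/dinamics/ directory is needed:

# Hypothetical input: a list of already-normalized sentences, one per document.
normalized_data = [
    "saya suka makan nasi goreng",
    "saya suka minum teh",
    "dia suka makan nasi",
]
vocab = make_vocabs(normalized_data, is_pickle=False)
print(vocab)  # hapaxes and words of 2 characters or fewer have been removed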
Example #8
def preprocess(train_data):

    all_words = []
    for article in train_data:
        article_words = list(set(article.split(',')))
        for word in article_words:
            all_words.append(word)

    dist = FreqDist(all_words)
    least_frequent_words = dist.hapaxes()
    for word in least_frequent_words:
        if word in dist:
            del dist[word]

    vocab = set(dist.keys())
    return vocab
Example #9
 def train_finder(self, all_listings):
     """
     Train the product identification algorithm with example data.
     """
     logging.info("Start training of recognizer for product: {0}"
                  .format(self.product_id))
     self.classifier = None
     
     #select example listings for the finder's product
     listings, n_pos, n_neg = self.filter_trainig_samples(all_listings)
     logging.info("Number listings: {l}, positive: {p}, negative: {n}; "
                  "features: {f}"
                  .format(l=len(listings), p=n_pos, n=n_neg,
                          f=self.n_features))
     if len(listings) < 30:
         logging.warn("Product {0}. Can't compute classifier. "
                      "Too few listings."
                      .format(self.product_id))
         return
     elif n_pos < 10:
         logging.warn("Product {0}. Can't compute classifier. "
                      "Too few positive listings."
                      .format(self.product_id))
         return
     elif n_neg < 10:
         logging.warn("Product {0}. Can't compute classifier. "
                      "Too few negative listings."
                      .format(self.product_id))
         return
     
     #Create list of most common words, and put it into feature extractor
     #TODO: remove stop-words
     self.feature_extractor = FeatureExtractor()
     word_freqs = FreqDist()
     for _, listing in listings.iterrows():
         words = self.feature_extractor.extract_words(listing)
         word_freqs.update(words)
     common_words = [w for w, _ in word_freqs.most_common(self.n_features)]
     self.feature_extractor = FeatureExtractor(common_words)
     logging.debug("Number individual words: {0}; hapaxes: {1}"
                   .format(len(word_freqs), len(word_freqs.hapaxes())))
     logging.debug("Most common words: {}".format(
         [w for w, _ in word_freqs.most_common(100)]))
     
     #Train the classifier
     train_set = self.create_labeled_features(listings)
     self.classifier = nltk.NaiveBayesClassifier.train(train_set)
     self.classifier.show_most_informative_features(20)
Example #10
def getUniqueWords():
    raw_text_all = ''

    for csv in allCsvs:
        filename = 'csvs/' + csv + '.csv'
        df = pd.read_csv(filename, index_col=0)
        for line in df['Lines']:
            raw_text_all += line + '\n'

    tokens = word_tokenize(raw_text_all)
    text = nltk.Text(tokens)
    # text.collocations()

    fdist = FreqDist(text)
    unique = fdist.hapaxes()
    sort_unique = sorted(unique)
    print(sort_unique)
def get_news_features(headline, text):

    nlp = es_core_news_md.load()

    ## headline ##
    headline = re.sub(r"http\S+", "", headline)
    headline = re.sub(r"http", "", headline)
    headline = re.sub(r"@\S+", "", headline)
    headline = re.sub("\n", " ", headline)
    headline = re.sub(r"(?<!\n)\n(?!\n)", " ", headline)
    headline = headline.replace(r"*NUMBER*", "número")
    headline = headline.replace(r"*PHONE*", "número")
    headline = headline.replace(r"*EMAIL*", "email")
    headline = headline.replace(r"*URL*", "url")
    headline_lower = headline.lower()
    doc_h = nlp(headline_lower)

    list_tokens_h = []
    list_tags_h = []

    for sentence_h in doc_h.sents:
        for token in sentence_h:
            list_tokens_h.append(token.text)

    fdist_h = FreqDist(list_tokens_h)
    syllables_h = get_nsyllables(headline)
    words_h = len(list_tokens_h)

    # headline complexity features
    avg_word_size_h = round(
        sum(len(word) for word in list_tokens_h) / words_h, 2)
    avg_syllables_word_h = round(syllables_h / words_h, 2)
    unique_words_h = round((len(fdist_h.hapaxes()) / words_h) * 100, 2)
    mltd_h = round(ld.mtld(list_tokens_h), 2)
    ttr_h = round(ld.ttr(list_tokens_h) * 100, 2)

    ## text content##
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"http", "", text)
    text = re.sub("\n", " ", text)
    text = text.replace(r"*NUMBER*", "número")
    text = text.replace(r"*PHONE*", "número")
    text = text.replace(r"*EMAIL*", "email")
    text = text.replace(r"*URL*", "url")

    # to later calculate upper case letters ratio
    alph = list(filter(str.isalpha, text))
    text_lower = text.lower()
    doc = nlp(text_lower)

    list_tokens = []
    list_pos = []
    list_tag = []
    list_entities = []
    sents = 0

    for entity in doc.ents:
        list_entities.append(entity.label_)

    for sentence in doc.sents:
        sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)

    # Calculate entities, pos, tag, freq, syllables, words and quotes
    entities = len(list_entities)
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)
    syllables = get_nsyllables(text)
    words = len(list_tokens)
    quotes = n_tag['PUNCT__PunctType=Quot']

    # complexity features
    avg_word_sentence = round(words / sents, 2)
    avg_word_size = round(sum(len(word) for word in list_tokens) / words, 2)
    avg_syllables_word = round(syllables / words, 2)
    unique_words = round((len(fdist.hapaxes()) / words) * 100, 2)
    ttr = round(ld.ttr(list_tokens) * 100, 2)

    # readability spanish test
    huerta_score = round(
        206.84 - (60 * avg_syllables_word) - (1.02 * avg_word_sentence), 2)
    szigriszt_score = round(
        206.835 - ((62.3 * syllables) / words) - (words / sents), 2)

    # stylometric features
    mltd = round(ld.mtld(list_tokens), 2)
    upper_case_ratio = round(sum(map(str.isupper, alph)) / len(alph) * 100, 2)
    entity_ratio = round((entities / words) * 100, 2)
    quotes_ratio = round((quotes / words) * 100, 2)
    propn_ratio = round((n_pos['PROPN'] / words) * 100, 2)
    noun_ratio = round((n_pos['NOUN'] / words) * 100, 2)
    pron_ratio = round((n_pos['PRON'] / words) * 100, 2)
    adp_ratio = round((n_pos['ADP'] / words) * 100, 2)
    det_ratio = round((n_pos['DET'] / words) * 100, 2)
    punct_ratio = round((n_pos['PUNCT'] / words) * 100, 2)
    verb_ratio = round((n_pos['VERB'] / words) * 100, 2)
    adv_ratio = round((n_pos['ADV'] / words) * 100, 2)
    sym_ratio = round((n_tag['SYM'] / words) * 100, 2)

    # create df_features
    df_features = pd.DataFrame({
        'text': text_lower,
        'headline': headline_lower,
        'words_h': words_h,
        'word_size_h': [avg_word_size_h],
        'avg_syllables_word_h': [avg_syllables_word_h],
        'unique_words_h': [unique_words_h],
        'ttr_h': ttr_h,
        'mltd_h': [mltd_h],
        'sents': sents,
        'words': words,
        'avg_words_sent': [avg_word_sentence],
        'avg_word_size': [avg_word_size],
        'avg_syllables_word': avg_syllables_word,
        'unique_words': [unique_words],
        'ttr': [ttr],
        'huerta_score': [huerta_score],
        'szigriszt_score': [szigriszt_score],
        'mltd': [mltd],
        'upper_case_ratio': [upper_case_ratio],
        'entity_ratio': [entity_ratio],
        'quotes': quotes,
        'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio],
        'noun_ratio': [noun_ratio],
        'pron_ratio': [pron_ratio],
        'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio],
        'punct_ratio': [punct_ratio],
        'verb_ratio': [verb_ratio],
        'adv_ratio': [adv_ratio],
        'sym_ratio': [sym_ratio]
    })

    return df_features
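
The unique_words feature above is simply the percentage of tokens that occur exactly once; a minimal standalone sketch of that single feature:

from nltk import FreqDist

tokens = "el gato duerme y el perro ladra".split()
fdist = FreqDist(tokens)
unique_words = round((len(fdist.hapaxes()) / len(tokens)) * 100, 2)
print(unique_words)  # 71.43 -> 5 of the 7 tokens are hapaxes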
Example #12
    # 	print('text_stemmed\n', text_stemmed)
    # 	print()

    # print('texts_normalized', texts_normalized)
    # print()

    all_words = [
        word for sentence in texts_normalized for word in sentence.split()
    ]

    from nltk import FreqDist
    fd = FreqDist(all_words)  # build the FreqDist object before deduplicating with set()

    all_words = list(sorted(set(all_words)))

    hapaxes = fd.hapaxes()

    # print('hapaxes', hapaxes)

    all_words = [
        word for word in all_words if len(word) > 2 and word not in hapaxes
    ]

    # print('features:')
    # print(len(all_words), all_words)

    from vectorizers import binary_vectorizer, count_vectorizer, tfidf_vectorizer
    # biner = binary_vectorizer(texts_normalized, all_words)
    # count = count_vectorizer(texts_normalized, all_words)
    # tfidf = tfidf_vectorizer(texts_normalized, all_words)
Example #13
for word in gutenberg.words('austen-persuasion.txt'):
    fd[word] += 1
'''
'''
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))
print fd.N()
print fd.B()
for word in sorted(fd.keys()):
    print word,fd[word]
'''

# text1.concordance("monstrous")
# text4.dispersion_plot(["citizens","democracy","freedom","duties","America"])
# text3.generate()  not supported in NLTK 3


# print len(text3) / len(set(text3))

def lexical_diversity(text):
    return len(text) / len(set(text))


def percentage(count, total):
    return 100 * count / total

fdist1 = FreqDist(text1)
print(fdist1)
print(fdist1.most_common(50))
# fdist1.plot(50, cumulative=True)
print(len(fdist1.hapaxes()))


lexical_diversity(text4)
percentage(text4.count('a'), len(text4))

# Simple statistics
from nltk import FreqDist
# Counting Words Appearing in a Text (a frequency distribution)
fdist1 = FreqDist(text4)
fdist1
vocabulary1 = fdist1.keys()  # list of all the distinct types in the text
vocabulary1[:3]  # look at first 3

#words that occur only once, called hapaxes
fdist1.hapaxes()[:20]

# Words that meet a condition, are long for example
V = set(text4)
long_words = [w for w in V if len(w) > 15]
sorted(long_words)

#finding words that characterize a text, relatively long, and occur frequently
fdist = FreqDist(text4)
sorted([w for w in set(text4) if len(w) > 7 and fdist[w] > 7])

# Collocations and Bigrams.
# A collocation is a sequence of words that occur together unusually often.
# Built in collocations function
text4.collocations()
#print(books.fileids())
text1 = books.words(fileids=['essays_and_wisdom_of_the_ancients.txt'])
text2 = books.words(fileids=['new_atlantis.txt'])
text3 = books.words(fileids=['novum_organum.txt'])
text4 = books.words(fileids=['of_gardens.txt'])
text5 = books.words(fileids=['shakespeare.txt'])
text6 = books.words(fileids=['the_advancement_of_learning.txt'])

fdist1 = FreqDist(text1)
fdist2 = FreqDist(text2)
fdist3 = FreqDist(text3)
fdist4 = FreqDist(text4)
fdist5 = FreqDist(text5)
fdist6 = FreqDist(text6)

shakespeare_list = fdist5.hapaxes()
bacon_list1 = fdist1.hapaxes()
bacon_list2 = fdist2.hapaxes()
bacon_list3 = fdist3.hapaxes()
bacon_list4 = fdist4.hapaxes()
bacon_list6 = fdist6.hapaxes()

bacon_set = set(bacon_list1 + bacon_list2 + bacon_list3 + bacon_list4 + bacon_list6)
bacon_list = []

for i in bacon_set:
	bacon_list.append(i)

shake_numless = []
bacon_numless = []
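
The same kind of hapax comparison can be sketched with NLTK's built-in Gutenberg corpus instead of the custom books corpus used above (assuming the corpus data has been downloaded):

from nltk import FreqDist
from nltk.corpus import gutenberg

# hapax vocabularies of two Austen novels and of Moby Dick
austen1 = set(FreqDist(gutenberg.words('austen-emma.txt')).hapaxes())
austen2 = set(FreqDist(gutenberg.words('austen-persuasion.txt')).hapaxes())
melville = set(FreqDist(gutenberg.words('melville-moby_dick.txt')).hapaxes())

austen_set = austen1 | austen2     # analogous to bacon_set above

print(len(austen_set - melville))  # hapaxes never used in Moby Dick
print(len(austen_set & melville))  # hapaxes the two vocabularies share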
Example #16
print(stopwords)

nostopwords = [word for word in allTokens if word not in stopwords]

# NLTK.Text
text = nltk.Text(nostopwords)

# Collocations (words that frequently appear together);
# note that Text.collocations() prints its results directly and returns None
text.collocations()

# count
print(text.count('inlet'))

# words appearing in similar contexts (similar() also prints directly)
text.similar('ship')

text.dispersion_plot(['north', 'south', 'east', 'west'])

text.dispersion_plot(['ship', 'dock', 'boat', 'canoe', 'steamboat'])

# Frequency distributions!

from nltk import FreqDist
fdist = FreqDist(text)

print(fdist.hapaxes())  # words that occur only once

print(fdist.most_common(50))

fdist.plot(30)
Example #17
    return len(set(text)) / len(text)


# List
nltk.book.sent1

# Frequency Distributions
from nltk import FreqDist
fdist1 = FreqDist(text1)

# Como generar descriptores del texto. Las palabras más repetidas
# Frequency Distribution Plot cumulative.
fdist1.plot(50, cumulative=True)

# Las palabras menos repetidas
fdist1.hapaxes()

# Long Words using len(w)
# "the set of all w such that w is an element of V (the vocabulary) and w has property P".
V = text1
long_words = [w for w in V if len(w) > 15]
sorted(long_words)

# Long words and frecuenty words to typifing text
fdist5 = FreqDist(nltk.book.text5)
sorted(w for w in set(nltk.book.text5) if len(w) > 7 and fdist5[w] > 7)

# Bigrams
from nltk.util import bigrams
list(bigrams(['more', 'is', 'said', 'than', 'done']))
def part1():
    # Files: input (read as a single string mode) and output files(write mode)
    inf = open("microblog2011.txt").read()
    outa = open("microblog2011_tokenized.txt", 'w')
    outb = open("Tokens.txt", 'w')

    # Initializing Tweet Tokenizer and writing it in the output file.
    tknzr = TweetTokenizer()
    a = tknzr.tokenize(inf)
    outa.writelines(str(a))

    # How many tokens did you find in the corpus? How many types (unique tokens) did you have? What is the type/token ratio for the corpus?
    print('Total number of tokens found in the corpus: ' + str(len(a)))
    print('Total number of unique Token Types did we have: ' +
          str(len(set(a))))
    print('Type/Token Ratio for a (Tokenized Original) (Lexical Diversity): ' +
          str(len(set(a)) / len(a)))

    # For each token, print the token and its frequency in a file called Tokens.txt
    # (from the most frequent to the least frequent) and include the first 100 lines in your report.
    fdist1 = FreqDist(a)
    outb.write(str(fdist1.most_common(10000000)))

    # How many tokens appeared only once in the corpus?
    print('Total number of tokens found in the corpus only once: ' +
          str(len(fdist1.hapaxes())))

    # From the list of tokens, extract only words, by excluding punctuation and other symbols.
    # How many words did you find?
    # List the top 100 most frequent words in your report, with their frequencies.
    # What is the type/token ratio when you use only word tokens (called lexical diversity)
    b = []
    for tokens in a:
        a_withoutsymbols = strip_all_entities(strip_links(tokens))
        b.append(a_withoutsymbols)
    print('After stripping the tokens of all symbols, words found: ' +
          str(len(b)))
    fdist2 = FreqDist(b)
    print('The top 100 most common tokens with their frequencies: ' +
          str(fdist2.most_common(100)))
    print('Type/Token Ratio for b (only words)(Lexical Diversity): ' +
          str(len(set(b)) / len(b)))

    # From the list of words, exclude stopwords. List the top 100 most frequent words and their frequencies.
    # You can use this list of stopwords (or any other that you consider adequate, or NLTK stopwords [recommended!]).
    stop_words = set(stopwords.words('english'))
    filtered_sentence = []
    for word in b:
        if word not in stop_words:
            filtered_sentence.append(word)
    fdist3 = FreqDist(filtered_sentence)
    print('The top 100 most common tokens with their frequencies: ' +
          str(fdist3.most_common(100)))
    print(
        'Type/Token Ratio for filtered_sentence (stopwords)(only words)(Lexical Diversity): '
        + str(len(set(filtered_sentence)) / len(filtered_sentence)))

    # Compute all the pairs of two consecutive words (excluding stopwords and punctuation).
    # List the most frequent 100 pairs and their frequencies in your report.
    # Also compute the type/token ratio when you use only word tokens without stopwords (called lexical density)?
    bigram_list = list(bigrams(filtered_sentence))
    fdist4 = FreqDist(bigram_list)
    print("The top 100 most common tokens with their frequencies: " +
          str(fdist4.most_common(100)))
    print(
        'Type/Token Ratio for filtered_sentence (stopwords)(only words)(Lexical Diversity): '
        + str(len(set(bigram_list)) / len(bigram_list)))

    # Extract multi-word expressions (composed of two or more words, so that the meaning of the expression
    # is more than the composition of the meanings of its words).
    # Use NLTK and Python (explain how).
    # List the most frequent 100 expressions extracted.
    token = filtered_sentence
    mwe = range_ngrams(token, ngramRange=(1, 6))
    fdist5 = FreqDist(mwe)
    print('The top 100 most common tokens with their frequencies: ' +
          str(fdist5.most_common(100)))

    # Closing both output files.
    outa.close()
    outb.close()
Example #19
class Analyzer(object):
    def __init__(self, text):
        self.text = text
        self.token_counts = FreqDist(text)
    
    def numberOfTokens(self):
        # returns number of tokens in the text
        return len(self.text)
    
    def vocabulary(self):
        # returns a list of the vocabulary of the text sorted alphabetically.
        return sorted(set(self.text))
    
    def vocabularySize(self):
        # returns the size of the vocabulary
        return len(self.vocabulary())
    
    def lexicalRichness(self):
        # returns the lexical richness of the text
        return self.numberOfTokens() / self.vocabularySize()
    
    def hapaxes(self):
        # returns all hapaxes of the text
        return self.token_counts.hapaxes()
    
    def numberOfHapaxes(self):
        # returns the number of hapaxes in the text
        return len(self.hapaxes())
    
    def avWordLength(self):
        # returns the average word length of the text
        sum = 0
        for word in self.token_counts:
            sum = sum + len(word)
        return (int(sum / self.vocabularySize()))

    def topSuffixes(self):
        # returns the 10 most frequent 2-letter suffixes in words
        # restrict to words of length 5 or more
        freq = {}
        listsuf = []
        for word in self.vocabulary():
            if len(word) >= 5:
                if word[-2:] in freq:
                    freq[word[-2:]] = freq[word[-2:]] + 1
                else:
                    freq[word[-2:]] = 1

        for key, value in sorted(freq.items(), key=lambda x: x[1], reverse=True):
            listsuf.append(key)

        return listsuf[:10]

    def topPrefixes(self):
        # returns the 10 most frequent 2-letter prefixes in words
        # restrict to words of length 5 or more
        freq = {}
        listpre = []
        for word in self.vocabulary():
            if len(word) >= 5:
                if word[:2] in freq:
                    freq[word[:2]] = freq[word[:2]] + 1
                else:
                    freq[word[:2]] = 1

        for key, value in sorted(freq.items(), key=lambda x: x[1], reverse=True):
            listpre.append(key)

        return listpre[:10]
    
    def tokensTypical(self):
        # returns first 5 tokens of the (alphabetically sorted) vocabulary
        # that contain both often seen prefixes and suffixes in the text. Hint: use topPrefixes()
        # and topSuffixes() methods
        toppre = self.topPrefixes()
        topsuf = self.topSuffixes()
        listtoken = []
        for token in self.vocabulary():
            if token[:2] in toppre and token[-2:] in topsuf:
                listtoken.append(token)
        return (listtoken[:5])
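
A hypothetical usage of the Analyzer class above; it expects an already tokenized text (a list of words), and FreqDist is assumed to be imported in the surrounding module:

from nltk import word_tokenize

sample = word_tokenize("The quick brown fox jumps over the lazy dog. The dog sleeps.")
analyzer = Analyzer(sample)
print(analyzer.numberOfTokens())    # tokens, including punctuation
print(analyzer.vocabularySize())
print(analyzer.numberOfHapaxes())
print(analyzer.topSuffixes())       # 2-letter suffixes of words with >= 5 characters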
Example #20
fd = FreqDist(brown.words())

# Find the most frequent words in a text:
# http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
import operator
max(fd.items(), key=operator.itemgetter(1))
sorted(fd.items(), key=operator.itemgetter(1), reverse=True)[:10]
# Or use the wrapper function
fd.most_common(10)

# plot the most frequent words
fd.plot(10)
fd.plot(10, cumulative=True)

# See the words with lowest frequency (these words are called hapaxes)
fd.hapaxes()

# Count all the words
len(text1)
# count unique words
len(set(text1))
# count unique words, irrespective of word case
len(set(w.lower() for w in text1))


# Find the words that are more than 15 characters long
words = set(brown.words())
long_words = [w for w in words if len(w) > 15]

# Words that are more frequent than 7 times and are more than 7 characters long
rare_and_long = sorted(w for w in set(brown.words()) if len(w) > 7 and fd[w] > 7)
tokenized_text = [nltk.word_tokenize(each_case) for each_case in cleaned_text]
tokenized_text = [[
    stemmer.stem(word) for word in each_case if word not in stopwords
] for each_case in tokenized_text]

tot_text = list(chain.from_iterable(tokenized_text))
fdist = FreqDist(tot_text)
wordList = list(fdist.values())
wordArray = np.array(wordList)
print('50% quantile word count of', np.percentile(wordArray, 50))
print(fdist.most_common(30))
#plotting fdist on a cumulative chart
fdist.plot(30, cumulative=True)
#plotting fdist on a non-cumulative chart
fdist.plot(30)
print('seldom appearing words:', fdist.hapaxes())

tfidf_text = []
for each_case in tokenized_text:
    tfidf_text.append(' '.join(word for word in each_case))
#tfidf_text

#create a tfidf vectorizer to convert the text into tfidf
tfidf_vectorizer = TfidfVectorizer(min_df=10, max_df=1.0)
tfidf = tfidf_vectorizer.fit_transform(tfidf_text)

feature_names = tfidf_vectorizer.get_feature_names()

#examining each feature in the document and also their corresponding tfidf
for col in tfidf.nonzero()[1]:
    print(feature_names[col], ' - ', tfidf[0, col], ' - ', tfidf.indices[col])
import nltk
from nltk.corpus import gutenberg  # import the gutenberg corpus
##################################################################
## FreqDist tracks the sample frequencies in a distribution
from nltk import FreqDist  # import the FreqDist class
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # instantiate a frequency distribution over the tokens of the text
print(fd)  # <FreqDist with 6132 samples and 98171 outcomes>; 6132 distinct samples, 98171 tokens
print(type(fd))  # <class 'nltk.probability.FreqDist'>
print(fd['the'])  # 3120; how many times the word occurs; a FreqDist behaves like a dict
print(fd.N())  # 98171; counts word tokens (with repetitions), not letters
print(fd.B())  # 6132; number of bins or unique samples; identical words fall into the same bin
print(len(fd.keys()), type(fd.keys()))  # 6132 <class 'dict_keys'>
print(fd.keys())  # fd.B() only gives the count, this prints the whole vocabulary
print(fd.max())  # the single most frequent word
print(fd.freq('the'))  # 0.03178127960395636; relative frequency, 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] rare words that occur only once
# The most frequent words are mostly function words, and the extremely rare ones (hapaxes) can only be understood from context; neither the most frequent nor the least frequent words tend to characterize a text
for idx, word in enumerate(fd):  # a FreqDist can be iterated with enumerate, in order of first appearance
    if idx == 5: break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen
##################################################################
## Frequency distribution of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]
##################################################################
## Frequency distribution of English characters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha())  # no need to wrap the generator in [] to make it a list
print(fdist.most_common(5))  # [('e', 46949), ('t', 32192), ('a', 29371), ('o', 27617), ('n', 26718)]
print([char for (char, count) in fdist.most_common()])  # the 26 letters ordered by frequency of use
Example #23
        w for w in word_tok.tokenize(text_y) if w.lower() not in stop_words
    ]

    words_n_lemmatized = [ger.lemmatise(w) for w in words_n]
    words_y_lemmatized = [ger.lemmatise(w) for w in words_y]

    fdistn = FreqDist(words_n)
    fdisty = FreqDist(words_y)

    most_common_n = fdistn.most_common(50)
    most_common_y = fdisty.most_common(50)

    print(fdistn[i])
    print(fdisty[i])

    hapax_n = fdistn.hapaxes()
    hapax_y = fdisty.hapaxes()

    list_n = [
        n[0] for n in most_common_n if n[0][0].isupper() and len(n[0]) > 1
    ]
    list_y = [
        n[0] for n in most_common_y if n[0][0].isupper() and len(n[0]) > 1
    ]

    # print(list_n)
    # print(list_y)

    print(set(list_n).difference(list_y))
    print(set(list_y).difference(list_n))
Example #25
class Analyzer(object):
    def __init__(self, path):
        '''reads the file text, creates the list of words (use nltk.word_tokenize to tokenize the text),
            and calculates frequency distribution '''
        with open(path, 'r') as file:
            self.text = word_tokenize(file.read())
        #self.text = word_tokenize(open(path,'r').read()) #TODO the list of words from text file
        self.token_counts = FreqDist(
            self.text)  #TODO frequency distribution of words from text file

    def numberOfTokens(self):
        '''returns number of tokens in the text '''
        return len(self.text)

    def vocabularySize(self):
        '''returns the size of the vocabulary of the text '''
        return len(self.token_counts)

    def lexicalDiversity(self):
        '''returns the lexical diversity of the text '''
        # higher diversity: more distinct words
        return self.numberOfTokens() / self.vocabularySize()

    def getKeywords(self):
        '''return words as possible key words, that are longer than seven characters, that occur more than seven times (sorted alphabetically)'''
        keys = []
        for key, value in self.token_counts.items():
            if len(key) > 7 and value > 7:
                keys.append(key)
        return sorted(keys)

        # model solution: iterate over the types and filter them
        #return sorted([w for w in self.token_counts.keys() if len(w)>7 and self.token_counts[w]>7])

    def numberOfHapaxes(self):
        '''returns the number of hapaxes in the text'''
        return len(self.token_counts.hapaxes())

    def avWordLength(self):
        '''returns the average word length of the text
           (note: the exercise means the average over the distinct words/types, not over all tokens)'''
        # model solution:
        #return sum([len(word) for word in self.token_counts])/len(self.token_counts)

        sumWordLen = 0
        for word in self.token_counts:
            sumWordLen = sumWordLen + len(word)
        return sumWordLen / len(self.token_counts)

    def topSuffixes(self):
        '''returns the 10 most frequent 2-letter suffixes in words
            (restrict to words of length 5 or more)'''

        # model solution:
        #list_of_words = [word for word in self.token_counts if len(word) >=5]
        #suf_dict = FreqDist(suf[-2:] for suf in list_of_words)
        #suf_most_freq =[elem[0] for elem in suf_dict.most_common(10)]
        #return suf_most_freq

        suffixes = []
        for langWord in self.token_counts.keys():
            if len(langWord) >= 5:
                suffixes.append(langWord[-2:])
        return [word for word, count in Counter(suffixes).most_common(10)]

    def topPrefixes(self):
        '''returns the 10 most frequent 2-letter prefixes in words
            (restrict to words of length 5 or more)'''
        prefixes = []
        for langWord in self.token_counts.keys():
            if len(langWord) >= 5:
                prefixes.append(langWord[:2])
        return [word for word, count in Counter(prefixes).most_common(10)]

    def tokensTypical(self):
        """TODO returns first 5 tokens of the (alphabetically sorted) vocabulary 
        that contain both often sccleen prefixes and suffixes in the text. As in topPrefixes()
        and topSuffixes(), Prefixes and Suffixes are 2 characters long."""

        sufixes = self.topSuffixes()
        prefixes = self.topPrefixes()
        return sorted([
            word for word in self.token_counts.keys()
            if word[:2] in prefixes and word[-2:] in sufixes
        ])[:5]
Example #26
from nltk.corpus import stopwords
# Define the language
stoplist = stopwords.words('spanish')

# A sentence with plenty of filler words
test_text = "El a ante con contra desde en un a el la o y puede que no jamón"
# Tokenize the sentence and compare each word against the stopword list;
# keep only the cleaned list
clean_text = [
    word for word in regexp_tokenize(test_text, '\w+')
    if word.lower() not in stoplist
]
print(clean_text)
'''###########################
   #    Remove rare words    #
   ###########################

   Because keeping names or very short/long words does not help
'''

from nltk import FreqDist
# Compute the frequency distribution of the words, so that
# infrequent words, i.e. rare words, can be removed.
frecuencia_distancia = FreqDist(tokens)
raras = frecuencia_distancia.hapaxes()
limpieza_raras = [word for word in tokens if word not in raras]

print(tokens)
print(limpieza_raras)
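
A self-contained version of the same rare-word removal; here tokens is built with regexp_tokenize instead of coming from earlier in the script:

from nltk import FreqDist
from nltk.tokenize import regexp_tokenize

tokens = regexp_tokenize("el perro ladra y el gato duerme y el perro corre", r'\w+')
fdist = FreqDist(tokens)
raras = fdist.hapaxes()  # words that occur only once
limpio = [word for word in tokens if word not in raras]
print(limpio)  # only the words that occur at least twice remain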
Example #27
import nltk
from nltk import FreqDist
from nltk.corpus import brown
from nltk.corpus import inaugural
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import math

brown_freq = FreqDist(brown.words())
print(brown_freq.most_common(10))
print(brown_freq["mother"])

for word, count in brown_freq.most_common(10):
    print("{} ~ {}".format(word, round(brown_freq.freq(word), 2)))

print(sorted(brown_freq.hapaxes(), key=lambda w: len(w), reverse=True)[:20])

cats = ['mystery', 'adventure']
cfd = nltk.ConditionalFreqDist((genre, word.lower()) for genre in cats
                               for word in brown.words(categories=genre))
print(cfd)

for cond in cfd.conditions():
    print(cond)
    print(cfd[cond].most_common(20))
    print()

for cond in cfd.conditions():
    print("mother in {} - {} - {}".format(cond, cfd[cond]["mother"],
                                          round(cfd[cond].freq("mother"), 4)))
Example #29
from nltk.book import text1
from nltk.book import text4
from nltk import FreqDist
import nltk
Freq_Dist = FreqDist(text1)
print(Freq_Dist)
print(Freq_Dist.most_common(10))
print(Freq_Dist['his'])
Freq_Dist.plot(50, cumulative=False)
Freq_Dist.plot(50, cumulative=True)
Freq_Dist.hapaxes()
Once_happend = Freq_Dist.hapaxes()
print(Once_happend)
print(100 * text4.count('america') / len(text4))

Value_set = set(text1)
long_words = [words for words in Value_set if len(words) > 17]
print(sorted(long_words))
my_text = ["Here", "are", "some", "words", "that", "are", "in", "a", "list"]
vocab = sorted(set(my_text))
print(vocab)
word_freq = nltk.FreqDist(my_text)
print(word_freq.most_common(5))