Example #1
import os
import unicodedata
from string import punctuation

import nltk
from nltk import FreqDist
from nltk.corpus import stopwords


def preprocessingText(f_path, t_path, min_len):
    """
    Reduce the text content to make later analysis easier.
    Pay attention to text quality.

    Parameters:

    f_path  = root folder
    t_path  = list of paths to the stored text files
    min_len = minimum word length to keep
    """
    def elimina_tildes(cadena):
        # strip accent marks (combining characters) from the string
        s = ''.join((c for c in unicodedata.normalize('NFD', cadena)
                     if unicodedata.category(c) != 'Mn'))
        return s

    # stopwords and punctuation for removal
    stop_words = set(stopwords.words('spanish'))
    punctuation_marks = set(punctuation)
    stop_words_punctuation_marks = stop_words.union(punctuation_marks)

    txt_folder = f_path + 'c_texts/'
    if not os.path.isdir(txt_folder):
        os.mkdir(txt_folder)

    for infile in t_path:

        with open(infile, 'r') as f:
            text = f.read()

        # split into tokens and lowercase them
        tokens = nltk.word_tokenize(text)
        tokens = [word.lower() for word in tokens]

        # remove stopwords, punctuation marks and non-alphabetic tokens
        words = [
            word for word in tokens
            if word not in stop_words_punctuation_marks
        ]
        words = [word for word in words if word.isalpha()]

        # keep only words with the minimum length, then extract the hapaxes
        min_words = [word for word in words if len(word) >= min_len]
        fdist = FreqDist(min_words)
        hapaxes = fdist.hapaxes()

        # remove accents and write the hapaxes to the output file
        texto = elimina_tildes(' '.join(hapaxes))

        outfile = str(infile.split('/')[-1])
        with open(txt_folder + outfile, 'w') as w:
            w.write(texto)

def ProcessaArquivo(f):
    """Compute statistics for the given file."""
    print("Processing file %s..." % f)
    corpus = CriaLeitorDeCorpus(arquivo=f)
    tokens = corpus.words()
    print("Number of tokens: %d." % len(tokens))
    alfabeticas = ExtraiAlfabeticas(tokens)
    print("Number of alphabetic tokens: %d." % len(alfabeticas))
    freq = FreqDist(alfabeticas)
    print("Lexical diversity: %.2f%%" % CalculaDiversidadeLexical(freq))
    print("Number of hapaxes: %d.\n\n\n" % len(freq.hapaxes()))
Example #3
def count_words(filename):
    # Reads a file and returns the number of hapaxes (words that occur only once)
    with open(filename, 'r', encoding='utf-8') as infile:
        text = infile.read()

    fdist = FreqDist(word.lower() for word in word_tokenize(text))

    return len(fdist.hapaxes())
Example #4
def replace_with_UNKING(tags_words):
    # words_train is assumed to be a global list of training-set words;
    # rare or unseen words ending in "ing" are mapped to "UNK-ING"
    fdist = FreqDist(words_train)
    hapaxes = set(fdist.hapaxes())
    train_vocab = set(words_train)
    for l in range(len(tags_words)):
        word = tags_words[l][1]
        if word in hapaxes or word not in train_vocab:
            if word.endswith("ing"):
                entry = list(tags_words[l])
                entry[1] = "UNK-ING"
                tags_words[l] = tuple(entry)
    return tags_words


def replace_with_UNKAR(tags_words):
    # rare or unseen lowercase words ending in "ar" are mapped to "UNK-AR"
    fdist = FreqDist(words_train)
    hapaxes = set(fdist.hapaxes())
    train_vocab = set(words_train)
    for l in range(len(tags_words)):
        word = tags_words[l][1]
        if word in hapaxes or word not in train_vocab:
            if word.endswith("ar") and word[0].islower():
                entry = list(tags_words[l])
                entry[1] = "UNK-AR"
                tags_words[l] = tuple(entry)
    return tags_words


def replace_with_UNKCAP(tags_words):
    # rare or unseen capitalized words (except right after "START") are mapped to "UNK-CAP"
    fdist = FreqDist(words_train)
    hapaxes = set(fdist.hapaxes())
    train_vocab = set(words_train)
    for l in range(len(tags_words)):
        word = tags_words[l][1]
        if word in hapaxes or word not in train_vocab:
            if word[0].isupper() and tags_words[l - 1][1] != "START":
                entry = list(tags_words[l])
                entry[1] = "UNK-CAP"
                tags_words[l] = tuple(entry)
    return tags_words
Example #7
def clean_reviews_by_label(files, label):
    """
    Clean reviews of service words for one label and add bigrams.
    """
    # word_matrix - list of all words and bigrams of common words
    # all_words - all unique words and all bigrams (the rarest will be dropped)
    data = {'word_matrix': [], 'all_words': []}
    # temporary list of all words in all reviews and their bigrams
    # (the final list will be shorter because rare words are removed)
    allwords = []
    # create tokenizer
    tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')
    n = len(files)
    for i, filepath in enumerate(files):
        if (i + 1) % 1000 == 0:
            print('{}: {}/{} docs processed'.format(label, i + 1, n))
        # read the review and tokenize it
        with open(filepath) as f:
            bag_words = tokenizer.tokenize(f.read())
        # get the part of speech for each word
        lower_words = get_part_of_speech(bag_words)
        # drop service words
        informative_words = choose_informative_words(lower_words)
        # build the list of important words from the words themselves and their bigrams
        tokens_bigrams_list = list(
            bigrams(informative_words)) + informative_words
        # add the list of words so per-document frequencies can be computed later
        data['word_matrix'].append(tokens_bigrams_list)
        # add the words to the big list of all words in all reviews
        allwords.extend(informative_words)
    # find frequencies for all words
    frequencies = FreqDist(allwords)
    # find words that occur only once (hapaxes)
    hapaxes = frequencies.hapaxes()
    # remove them
    data['all_words'] = list(set(allwords) - set(hapaxes))
    return {label: data}
Example #8
ascii_tokens = []
for token in corpus_tokenized:
    # contains_digits is assumed to be defined earlier in this script
    try:
        token.encode('ascii')  # raises a UnicodeError for non-ASCII tokens
        if not contains_digits(token):
            ascii_tokens.append(token)
    except UnicodeError:
        continue

ascii_tokens_lowered = []
for token in ascii_tokens:
    ascii_tokens_lowered.append(token.lower())

fdist = FreqDist(ascii_tokens)
fdist_lowered = FreqDist(ascii_tokens_lowered)
hapaxes = fdist.hapaxes()
print('Number of hapaxes before trimming: ' + str(len(hapaxes)))

lowered_hapaxes = fdist_lowered.hapaxes()
lowered_hapax_dict = {}
for lowered_hapax in lowered_hapaxes:
    lowered_hapax_dict[lowered_hapax] = True

# A separate list is necessary because removing from hapaxes while looping over it caused a subtle bug
tmp_hapaxes = []
for hapax in hapaxes:
    # Drop hapaxes that are only hapaxes because of capitalization:
    # keep a hapax only if its lowercased form is still a hapax
    if hapax.lower() in lowered_hapax_dict:
        tmp_hapaxes.append(hapax)
hapaxes = tmp_hapaxes
print('Number of hapaxes after trimming: ' + str(len(hapaxes)))

# Tweet a random hapax
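
# The actual tweeting code is not included in this snippet. As a minimal
# sketch (assuming "hapaxes" is the trimmed list built above, and leaving the
# posting step to whatever Twitter client the surrounding project uses), a
# random hapax could be picked like this:
import random

candidate_hapax = random.choice(hapaxes)
print('Candidate hapax to tweet: ' + candidate_hapax)
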
from nltk.corpus import brown
from nltk.probability import FreqDist
import matplotlib.pyplot as plot
import pylab
from math import log

# Get the case-insensitive words from the Brown corpus
case_inses_words = [word.lower() for word in brown.words()]
no_of_tokens = len(case_inses_words)
print("Total no. of tokens in the Brown corpus:", no_of_tokens)

# Pass them to FreqDist to get the frequency distribution
fdist = FreqDist(case_inses_words)
print(fdist)

# Compute the percentage of hapax legomena occurrences and find the longest ones
hapax_legomenas = fdist.hapaxes()  # words that appear just once in the corpus
hapax_legomena_counts = len(hapax_legomenas)  # their count
percentage_of_hapax_legomena = (hapax_legomena_counts / no_of_tokens) * 100
print("Percentage of hapax legomena occurrences:", percentage_of_hapax_legomena)
max_len_hapax_legomena = max([len(word) for word in hapax_legomenas])
print("Longest hapax legomena:", [word for word in hapax_legomenas if len(word) == max_len_hapax_legomena])

# Compute the percentage of dis legomena occurrences and find the longest ones
dis_legomenas = [key for key, value in fdist.items() if value == 2]  # words that occur exactly twice
dis_legomena_counts = len(dis_legomenas) * 2  # each occurs twice, so this is their token count
percentage_of_dis_legomena = (dis_legomena_counts / no_of_tokens) * 100
print("Percentage of dis legomena occurrences:", percentage_of_dis_legomena)
max_len_dis_legomena = max([len(word) for word in dis_legomenas])
print("Longest dis legomena:", [word for word in dis_legomenas if len(word) == max_len_dis_legomena])

# Plot the r vs Nr graph
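
# The plotting code itself is not shown above. As a minimal sketch of one way
# to draw the r vs Nr graph (r = word frequency, Nr = number of words with
# frequency r), reusing the fdist and the "plot" alias from this snippet, on
# log-log axes:
from collections import Counter

r_to_Nr = Counter(fdist.values())   # map each frequency r to the number of words Nr
rs = sorted(r_to_Nr)
Nrs = [r_to_Nr[r] for r in rs]
plot.loglog(rs, Nrs, marker='.', linestyle='none')
plot.xlabel('r (word frequency)')
plot.ylabel('Nr (number of words with frequency r)')
plot.show()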
Example #10
def frequencyDistribution(tokenWords):
    freqWords = FreqDist(tokenWords)
    print(freqWords.most_common(10))
    print(freqWords.hapaxes())
    # cumulative frequency plot of the 30 most common words
    freqWords.plot(30, cumulative=True)
Example #11

print('Frequency distributions')

import nltk
from nltk.probability import FreqDist
from nltk.book import text3  # text3 is the Book of Genesis, used below

print('FreqDist of Batman theme song')
batman_fd = FreqDist(
    nltk.word_tokenize(
        'na na na na na na na na na na na na na na na na Bat Man!'))
print(batman_fd.most_common(5))
print()
input()

print('How about words that only occur once? ("singletons" or "hapaxes")')
print(batman_fd.hapaxes())
print()
input()

print('What are the 50 most common words in Genesis?')
fd3 = FreqDist(text3)
print(fd3.most_common(50))
print()
input()

print('How frequent is "prayed" in Genesis?')
# FreqDist objects are sub-types of dict
print(fd3['prayed'])
print()
Example #12
class StyloDocument(object):

    DEFAULT_AUTHOR = "Unknown"

    def __init__(self, file_content, author=DEFAULT_AUTHOR):
        self.author = author.strip()
        self.raw_content = file_content
        self.file_content = file_content.lower()
        self.tokens = PortugueseTextualProcessing.tokenize(self.file_content)
        self.text = Text(self.tokens)
        self.fdist = FreqDist(self.text)
        self.sentences = sent_tokenize(self.file_content, language='portuguese')
        self.sentence_chars = [len(sent) for sent in self.sentences]
        self.sentence_word_length = [len(sent.split()) for sent in self.sentences]
        self.paragraphs = [p for p in self.file_content.split("\n\n") if len(p) > 0 and not p.isspace()]
        self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]
        self.punctuation = [".", ",", ";", "-", ":"]
        self.ner_entities = ['ABSTRACCAO', 'ACONTECIMENTO', 'COISA', 'LOCAL',
                             'ORGANIZACAO', 'OBRA', 'OUTRO', 'PESSOA', 'TEMPO', 'VALOR']
        self.white_spaces = len(self.file_content.split(' '))

        self.rich_tags = RichTags(PortugueseTextualProcessing.get_rich_tags(self.file_content), len(self.text))
        self.tagged_sentences = PortugueseTextualProcessing.postag(self.tokens)
        self.tagfdist = FreqDist([b for [(a, b)] in self.tagged_sentences])
        self.ner_tags = PortugueseTextualProcessing.ner_chunks(self.tokens)
        self.ner_ftags = FreqDist(self.ner_tags)
        self.spell = SpellChecker(language='pt')
        self.ROUNDING_FACTOR = 4
        self.LINE_BREAKS = ['\n', '\t', '\r']

    def get_tag_count_by_start(self, tag_start):
        count = 0
        for tag in self.tagfdist.keys():
            if tag.startswith(tag_start):
                count += self.tagfdist[tag]
        return count

    def get_class_frequency_by_start(self, tag_start):
        return self.get_tag_count_by_start(tag_start)/self.tagfdist.N()

    def get_total_not_found(self):
        """"The wn is not being reliable so far"""
        nf_tokens = self.get_tokens_by_tag('notfound')
        return len([i for i in nf_tokens if len(wn.synsets(i, lang='por')) == 0])

    def tag_frequency(self, tag):
        return self.tagfdist.freq(tag)

    def entity_frequency(self, tag):
        return self.ner_ftags.freq(tag)

    def get_tokens_by_tag(self, tag):
        return [i[0][0] for i in self.tagged_sentences if i[0][1] == tag]

    def get_long_sentence_freq(self):
        return (len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.LONG_SENTENCE_SIZE]))/len(self.sentences)

    def get_short_sentence_freq(self):
        return (len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.SHORT_SENTENCE_SIZE]))/len(self.sentences)

    def get_long_short_sentence_ratio(self):
        """"RF FOR PAN 15"""
        return len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.LONG_SENTENCE_SIZE])/(len([i for i in self.sentence_word_length if i < PortugueseTextualProcessing.SHORT_SENTENCE_SIZE]))

    def get_sentence_starting_tags_ratio(self, tag):
        count = [i[0][1] for i in self.tagged_sentences].count(tag)
        return count/len(self.sentences)

    def term_per_hundred(self, term):
        """
        term       X
        -----  = ------
          N       100
        """
        return (self.fdist[term] * 100) / self.fdist.N()

    def mean_sentence_len(self):
        return np.mean(self.sentence_word_length)

    def std_sentence_len(self):
        return np.std(self.sentence_word_length)

    def mean_paragraph_len(self):
        return np.mean(self.paragraph_word_length)

    def std_paragraph_len(self):
        return np.std(self.paragraph_word_length)

    def flesh_index(self):
        idx, value = PortugueseTextualProcessing().get_ptBR_flesch_index(self.tokens, self.get_phrases())
        return idx

    def vocabulary(self):
        return [v for v in sorted(set(self.sentences)) if v not in self.punctuation]

    def mean_word_len(self):
        words = set(word_tokenize(self.file_content, language='portuguese'))
        word_chars = [len(word) for word in words]
        return sum(word_chars) / float(len(word_chars))

    def max_word_len(self):
        words = set(word_tokenize(self.file_content, language='portuguese'))
        return max([len(word) for word in words])

    def type_token_ratio(self):
        return (len(set(self.text)) / len(self.text)) * 100

    def unique_words_per_hundred(self):
        return self.type_token_ratio() / 100.0 * 100.0 / len(self.text)

    def document_len(self):
        return sum(self.sentence_chars)

    def get_phrases(self):
        return [i for i in self.file_content.split('.') if i != '']

    def mean_syllables_per_word(self):
        _, syllable_count = PortugueseTextualProcessing().get_syllable_counts(self.tokens)
        return syllable_count/len(self.tokens)

    def characters_frequency(self, character_list):
        return self.frequency([word for word in self.file_content if word in character_list])

    def digits_frequency(self):
        return self.frequency([word for word in self.file_content if word.isdigit()])

    def line_breaks_frequency(self):
        return self.frequency([word for word in self.file_content if word in self.LINE_BREAKS])

    def count_consonant_frequency(self):
        character_list = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w',
                          'y', 'x', 'z']
        return self.frequency([word for word in self.file_content if word in character_list])

    def camel_case_frequency(self):
        return self.frequency([word for word in self.raw_content.split(' ') if word and word[0].isupper() and (len(word) == 1 or word[1].islower())])

    def local_hapax_legommena_frequency(self):
        return (len(self.fdist.hapaxes()))/len(self.text.tokens)

    def collocations_frequency(self, size):
        """words that often appear consecutively in the window_size"""
        return (len(self.text.collocation_list(window_size=size)))/len(self.text.tokens)

    def most_frequent_word_size(self):
        return FreqDist(len(w) for w in self.text).max()

    def mean_frequent_word_size(self):
        return FreqDist(len(w) for w in self.text).most_common(3)[1][0]

    def guiraud_R_measure(self):
        return (len(set(self.text)))/math.sqrt(len(self.text))

    def herdan_C_measure(self):
        # log V(N)/log N
        return (math.log2(len(set(self.text))))/math.log2(len(self.text))

    def herdan_V_measure(self):
        # N ^ C
        return math.pow(len(self.text), self.herdan_C_measure())

    def K_measure(self):
        # log V(N)/log(log(N))
        return (math.log2(len(set(self.text)))) / math.log2(math.log2(len(self.text)))

    def dugast_U_measure(self):
        # log(N)^2 / (log N - log V(N))
        return (math.pow(math.log2(len(self.text)), 2)) / (math.log2(len(self.text)) - math.log2(len(set(self.text))))

    def maas_A_measure(self):
        # a^2 = (log N - log V(N)) / log(N)^2
        return math.sqrt((math.log2(len(self.text)) - math.log2(len(set(self.text))))
                         / math.pow(math.log2(len(self.text)), 2))

    def LN_measure(self):
        # (1 - V(N)^2) / (V(N)^2 * log N)
        return (1 - math.pow(len(set(self.text)), 2)) / (math.pow(len(set(self.text)), 2) * math.log2(len(self.text)))

    def honores_H_measure(self):
        return (len(self.fdist.hapaxes()))/len(set(self.text))

    def spell_miss_check_frequency(self):
        return self.frequency(self.spell.unknown(self.text))

    def noun_phrases(self):
        return PortugueseTextualProcessing().get_number_of_noun_phrases(self.tokens) / len(self.text)

    def verb_phrases(self):
        return self.frequency(PortugueseTextualProcessing().get_number_of_verb_phrases(self.file_content))

    def monosyllables(self):
        return PortugueseTextualProcessing().get_monosyllable_counts(self.tokens) / len(self.text)

    def repeated_words_frequency(self):
        repeated_words = list(filter(lambda x: x[1] >= 2, FreqDist(PortugueseTextualProcessing().remove_stopwords(self.tokens)).items()))
        return self.frequency(repeated_words)

    def stop_word_freq(self):
        clean_words = PortugueseTextualProcessing().remove_stopwords(self.tokens)
        return (len(self.tokens) - len(clean_words)) / len(self.text)

    def get_logical_operator_frequency(self):
        return self.frequency([token for token in self.tokens if token in PortugueseTextualProcessing.LOGICAL_OPERATORS])

    def get_tags_freq(self, tags):
        count = 0
        for tag in tags:
            count += self.get_tag_count_by_start(tag)
        return count/len(self.tokens)

    def find_quotes(self):
        """Improve this method to retrieve quotes based on Patterns and special words
        egs: p.43;  segundo (autor, ano)
        """
        return self.characters_frequency(['“', '”'])

    def frequency(self, input_values):
        return len(input_values) / len(self.text)

    @classmethod
    def csv_header(cls):
        return (
            ['DiversidadeLexica', 'TamanhoMedioDasPalavras', 'TamanhoMedioSentencas', 'StdevSentencas', 'TamanhoMedioParagrafos',
             'StdevTamParagrafos', 'FrequenciaDeParagrafos','FrequenciaPalavrasDuplicadas', 'MediaSilabasPorPalavra',

             'Monossilabas',

             'Ponto','Virgulas', 'Exclamacoes', 'DoisPontos', 'Citacoes', 'QuebrasDeLinha', 'Digitos',

             'Adjetivos', 'Adverbios','Artigos', 'Substantivos', 'Preposicoes', 'Verbos','VerbosPtcp', 'Conjuncoes',
             'Pronomes', 'PronomesPorPreposicao','TermosNaoTageados', 'PalavrasDeConteudo', 'PalavrasFuncionais',
             'FrasesNominais', 'FrasesVerbais', 'GenMasc', 'GenFem', 'SemGenero', 'Singular', 'Plural',

             'PrimeiraPessoa', 'TerceiraPessoa','Passado','Presente','Futuro',

             'TotalEntidadesNomeadas', 'EntAbstracao', 'EntAcontecimento', 'EntCoisa', 'EntLocal', 'EntOrganizacao',
             'EntObra', 'EntOutro', 'EntPessoa', 'EntTempo', 'EntValor',

             'GuiraudR', 'HerdanC', 'HerdanV', 'MedidaK', 'DugastU', 'MaasA', 'HonoresH',

             'PalavrasErroOrtografico', 'HapaxLegomenaLocal', 'PalavrasComunsTam2', 'PalavrasComunsTam3', 'PalavrasComunsTam4',
             'StopWords', 'BRFleshIndex', 'OperadoresLogicos', 'PalavrasCapitalizadas',

             'Author']
        )

    def csv_output(self):
        # TODO: Separate features into syntactical, lexical and so on..
        # 69 features + 1 class
        return "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},'{}'".format(

            # Text style features - 10
            round(self.type_token_ratio(), self.ROUNDING_FACTOR),
            round(self.mean_word_len(), self.ROUNDING_FACTOR),
            round(self.mean_sentence_len(), self.ROUNDING_FACTOR),
            round(self.std_sentence_len(), self.ROUNDING_FACTOR),
            round(self.mean_paragraph_len(), self.ROUNDING_FACTOR),
            round(self.std_paragraph_len(), self.ROUNDING_FACTOR),
            len(self.paragraphs) / len(self.text),
            round(self.repeated_words_frequency(), self.ROUNDING_FACTOR),
            self.mean_syllables_per_word(),
            self.monosyllables(),

            # Term count features - 7
            self.term_per_hundred('.'),
            self.term_per_hundred(','),
            self.term_per_hundred('!'),
            self.term_per_hundred(':'),
            self.find_quotes(),
            self.line_breaks_frequency(),
            self.digits_frequency(),

            #POSTAG Features - 24
            self.tag_frequency('ADJ'),
            self.tag_frequency('ADV'),
            self.tag_frequency('ART'),
            self.tag_frequency('N'),
            self.tag_frequency('PREP'),
            self.tag_frequency('PCP'),  # participle verbs
            self.get_class_frequency_by_start('V'),
            self.get_class_frequency_by_start('K'),  # conjunctions
            self.get_class_frequency_by_start('PRO'),
            self.get_class_frequency_by_start('PRO')/self.tag_frequency('PREP'),  # ratio used in French texts
            self.tag_frequency('notfound'),
            self.get_tags_freq(PortugueseTextualProcessing.CONTENT_TAGS),
            self.get_tags_freq(PortugueseTextualProcessing.FUNCTIONAL_TAGS),
            round(self.noun_phrases(), self.ROUNDING_FACTOR),
            round(self.verb_phrases(), self.ROUNDING_FACTOR),
            self.rich_tags.get_male(),
            self.rich_tags.get_female(),
            self.rich_tags.get_unspecified_gender(),
            self.rich_tags.get_singular(),
            self.rich_tags.get_plural(),
            self.rich_tags.get_first_person(),
            self.rich_tags.get_third_person(),
            self.rich_tags.get_past_tense(),
            self.rich_tags.get_present_tense(),
            self.rich_tags.get_future_tense(),


            #NER Features - 11
            round(len(self.ner_tags) / len(self.tokens), self.ROUNDING_FACTOR),
            self.entity_frequency('ABSTRACCAO'),
            self.entity_frequency('ACONTECIMENTO'),
            self.entity_frequency('COISA'),
            self.entity_frequency('LOCAL'),
            self.entity_frequency('ORGANIZACAO'),
            self.entity_frequency('OBRA'),
            self.entity_frequency('OUTRO'),
            self.entity_frequency('PESSOA'),
            self.entity_frequency('TEMPO'),
            self.entity_frequency('VALOR'),

            # Vocabulary diversity features - 7
            round(self.guiraud_R_measure(), self.ROUNDING_FACTOR),
            round(self.herdan_C_measure(), self.ROUNDING_FACTOR),
            round(self.herdan_V_measure(), self.ROUNDING_FACTOR),
            round(self.K_measure(), self.ROUNDING_FACTOR),
            round(self.dugast_U_measure(), self.ROUNDING_FACTOR),
            round(self.maas_A_measure(), self.ROUNDING_FACTOR),
            round(self.honores_H_measure(), self.ROUNDING_FACTOR),

            # Misc Features - 9
            self.spell_miss_check_frequency(),
            round(self.local_hapax_legommena_frequency(), self.ROUNDING_FACTOR),
            self.collocations_frequency(2),
            self.collocations_frequency(3),
            self.collocations_frequency(4),
            round(self.stop_word_freq(), self.ROUNDING_FACTOR),
            self.flesh_index(),
            self.get_logical_operator_frequency(),
            self.camel_case_frequency(),

            self.author,
        )

    def legacy_features(self):
        """Remove features that are here for future reference"""
        # self.count_characters_frequency(['a']),
        # self.count_characters_frequency(['e']),
        # self.count_characters_frequency(['i']),
        # self.count_characters_frequency(['o']),
        # self.count_characters_frequency(['u']),
        # self.count_consonant_frequency(),
        # self.mean_frequent_word_size(),
        # self.max_word_len(),
        # self.document_len(),
        # round(self.LN_measure(), 8)
        pass
stemmed_list_porter = [porter.stem(t) for t in token_union]
stemmed_list_lancaster = [lancaster.stem(t) for t in token_union]
stemmed_list_snowball = [snowball.stem(t) for t in token_union]

df = pandas.DataFrame(
    data={
        'original_token': original_token_list,
        'porter': stemmed_list_porter,
        'lancaster': stemmed_list_lancaster,
        'snowball': stemmed_list_snowball
    })

# Task 4
freq_dist = FreqDist(casual_tokenize(file_str))
most_common = freq_dist.most_common(10)
hapaxes = freq_dist.hapaxes()

freq_dist.plot()
# Note, first of all, that most words are hapaxes because the corpus is small.
# For the same reason, increasingly visible 'jumps' appear as we get closer to y=1.
# If we had a gigantic corpus, the plot would look roughly linear.
# The plot shows that words are either very frequent or hapaxes, which again indicates a small corpus.
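
# A minimal sketch (using the Brown corpus as a stand-in, since it ships with
# NLTK) of how the hapax proportion shrinks as the corpus grows, which is the
# effect described in the comments above:
from nltk.corpus import brown
from nltk.probability import FreqDist

brown_words = [w.lower() for w in brown.words()]
for size in (1000, 10000, 100000, len(brown_words)):
    dist = FreqDist(brown_words[:size])
    ratio = len(dist.hapaxes()) / len(dist)
    print('{} tokens -> {:.1%} of word types are hapaxes'.format(size, ratio))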

# Task 5
pos = pos_tag(casual_tokenize(file_str))
pos_dict = {tag: [] for (word, tag) in pos}

for (word, tag) in pos:
    pos_dict[tag].append(word)

pos_tag_fd = FreqDist(tag for (word, tag) in pos).most_common()
' '.join(['Monty' ,'Python']) #join list to str
'Monty Python'.split()        #split str to list
import matplotlib.pyplot as plt
import pandas as pd
from nltk.probability import FreqDist
fdist_moby = FreqDist(moby_dick)  #frequency distribution
fdist_bible = FreqDist(bible)
fdist_chat = FreqDist(chat)

print(fdist_moby.most_common(10))
fdist_moby.plot(50, cumulative=True); plt.show()
print(fdist_bible.most_common(10))
print(fdist_chat.most_common(10))

print(len(fdist_moby.hapaxes()))  #words that occur once only
long_words = [word for word in moby_dick if len(word) >= 15]  #long words
long_words = sorted(set([w for w in chat if len(w) > 6 and
                         fdist_chat[w] > 7]))  #long words with more conditions:
#word length and number of occurrences via FreqDist
from nltk import *
print(list(bigrams(['more', 'is', 'said', 'than', 'done'])))

print('; '.join(chat.collocation_list()))  #common bigrams

fmoby = FreqDist([len(w) for w in moby_dick])  #freq dist of word lengths!
print(fmoby.items())  #keys-values: word length, number of words with that length

#page 44 for more functions!
#page 45 for word comparison operators
       "González volvieron a estallar las redes con un beso “No se logró ni funcionó porque él no quiso”, " \
       "Lina Tejeiro sobre su relación con Andy Rivera Estos son los estrenos que trae Netflix en julio " \
       "“El cemento puede esperar, la prioridades contener la pandemia”: Mello Castro Fiesta y concurso de ‘El más " \
       "comelón’ en La Paz durante el toque de queda Envían a la cárcel a encargados de laboratorio de coca " \
       "en Chimichagua Investigan cerco epidemiológico de primer caso de covid-19 en Manaure Cierran barrios de " \
       "Riohacha donde se presentan mayores brotes de la covid-19 Casas de apuestas en Colombia, un negocio en " \
       "constante auge Conozca Skrill, una de las plataformas más reconocidas para comprar criptomonedas Falla " \
       "mundial en WhatsApp: no muestra última conexión Pasos para descargar WhatsApp Plus gratis"

tokenizer = nltk.RegexpTokenizer(r"\w+")
new_words = tokenizer.tokenize(text)
print(new_words)

tokenized_word = word_tokenize(text)
print(tokenized_word)

fdist = FreqDist(new_words)
print(fdist)

filtered_sent = []
for w in new_words:
    if w not in stop_words:
        filtered_sent.append(w)
print("Tokenized Sentence:", new_words)
print("Filterd Sentence:", filtered_sent)

fdist = FreqDist(filtered_sent)
print(fdist['días'])
print(fdist.hapaxes())
print(fdist.most_common(10))
Example #16
# file_source and file_dist are assumed to be file handles opened earlier
text = []

# Aggregate all tokens into a list
tokenizer = TweetTokenizer()
for line in file_source:
    line = line.lower()
    # print(tokenizer.tokenize(line))
    text.extend(tokenizer.tokenize(line))

# Create the frequency distribution
fdist = FreqDist(text)
total_tokens = fdist.N()
unique_tokens = fdist.B()

# Print distribution properties
print("The number of total tokens:", total_tokens)
print("The number of unique tokens:", unique_tokens)
print("The type/token ratio:", unique_tokens / total_tokens)
print("Number of tokens that only appear once:", len(fdist.hapaxes()))

print("\nTokens that only appear once:")
print("=======================")
for w in fdist.hapaxes():
    print(w)

print("\nThe most common tokens:")
print("=======================")
for w, n in fdist.most_common(150000):
    out = str(n) + '\t' + w
    print(out)
    file_dist.write(out + '\n')