import os
import unicodedata
from string import punctuation

import nltk
from nltk import FreqDist
from nltk.corpus import stopwords


def preprocessingText(f_path, t_path, min_len):
    """
    Reduce the content of each text to make the later analysis easier.
    Pay attention to text quality.

    Parameters:
        f_path  = root folder
        t_path  = list of paths where the texts are stored
        min_len = minimum word length to keep
    """
    def elimina_tildes(cadena):
        # strip accents by dropping combining marks after NFD normalization
        return ''.join(c for c in unicodedata.normalize('NFD', cadena)
                       if unicodedata.category(c) != 'Mn')

    # stopwords and punctuation for removal
    stop_words = set(stopwords.words('spanish'))
    punctuation_marks = set(punctuation)
    stop_words_punctuation_marks = stop_words.union(punctuation_marks)

    txt_folder = f_path + 'c_texts/'
    if not os.path.isdir(txt_folder):
        os.mkdir(txt_folder)

    for infile in t_path:
        with open(infile, 'r') as f:
            text = f.read()

        # split into tokens and lowercase them
        tokens = nltk.word_tokenize(text)
        tokens = [word.lower() for word in tokens]

        # remove stopwords and punctuation marks,
        # then drop tokens that are not purely alphabetic
        words = [word for word in tokens
                 if word not in stop_words_punctuation_marks]
        words = [word for word in words if word.isalpha()]

        # keep only words with the minimum length
        min_words = [word for word in words if len(word) >= min_len]

        # keep only the hapaxes (words that occur exactly once)
        fdist = FreqDist(min_words)
        hapaxes = fdist.hapaxes()

        # remove accents
        texto = ' '.join(hapaxes)
        texto = elimina_tildes(texto)

        outfile = str(infile.split('/')[-1])
        with open(txt_folder + outfile, 'w') as w:
            w.write(texto)
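
# A minimal usage sketch (assumptions: the raw .txt files live under
# f_path + 'texts/' and the NLTK 'punkt' and Spanish stopword data are
# downloaded; adjust the glob pattern to your own layout).
import glob

root = './corpus/'
text_files = glob.glob(root + 'texts/*.txt')
preprocessingText(root, text_files, min_len=4)
# The cleaned files end up in ./corpus/c_texts/, one per input file.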
def ProcessaArquivo(f):
    """Compute statistics for the given file."""
    print("Processing file %s..." % f)
    corpus = CriaLeitorDeCorpus(arquivo=f)
    tokens = corpus.words()
    print("Number of tokens: %d." % len(tokens))
    alfabeticas = ExtraiAlfabeticas(tokens)
    print("Number of alphabetic tokens: %d." % len(alfabeticas))
    freq = FreqDist(alfabeticas)
    print("Lexical diversity: %.2f%%" % CalculaDiversidadeLexical(freq))
    print("Number of hapaxes: %d.\n\n\n" % len(freq.hapaxes()))
from nltk import FreqDist
from nltk.tokenize import word_tokenize


def count_words(filename):
    # Reads a file, builds a frequency distribution of its lowercased tokens,
    # and returns the number of hapaxes (words that occur exactly once)
    with open(filename, 'r', encoding='utf-8') as infile:
        file = infile.read()
    fdist = FreqDist(word.lower() for word in word_tokenize(file))
    return len(fdist.hapaxes())
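
# A minimal usage sketch (assumption: 'review.txt' is any UTF-8 text file
# and the NLTK 'punkt' tokenizer data has been downloaded).
print(count_words('review.txt'))  # prints the hapax count of the file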
def replace_with_UNKING(tags_words):
    # Map rare or unseen words ending in "ing" to the pseudo-word "UNK-ING".
    # Relies on the module-level training vocabulary `words_train`.
    fdist = FreqDist(words_train)
    hapaxes = fdist.hapaxes()
    for l in range(len(tags_words)):
        if tags_words[l][1] in hapaxes or tags_words[l][1] not in words_train:
            if tags_words[l][1].endswith("ing"):
                tags_words[l] = list(tags_words[l])
                tags_words[l][1] = "UNK-ING"
                tags_words[l] = tuple(tags_words[l])
    return tags_words
def replace_with_UNKAR(tags_words):
    # Map rare or unseen lowercase words ending in "ar" to "UNK-AR".
    fdist = FreqDist(words_train)
    hapaxes = fdist.hapaxes()
    for l in range(len(tags_words)):
        if tags_words[l][1] in hapaxes or tags_words[l][1] not in words_train:
            if tags_words[l][1].endswith("ar") and tags_words[l][1][0].islower():
                tags_words[l] = list(tags_words[l])
                tags_words[l][1] = "UNK-AR"
                tags_words[l] = tuple(tags_words[l])
    return tags_words
def replace_with_UNKCAP(tags_words):
    # Map rare or unseen capitalized words (not following a sentence START marker) to "UNK-CAP".
    fdist = FreqDist(words_train)
    hapaxes = fdist.hapaxes()
    for l in range(len(tags_words)):
        if tags_words[l][1] in hapaxes or tags_words[l][1] not in words_train:
            if tags_words[l][1][0].isupper() and tags_words[l - 1][1] != "START":
                tags_words[l] = list(tags_words[l])
                tags_words[l][1] = "UNK-CAP"
                tags_words[l] = tuple(tags_words[l])
    return tags_words
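
# A minimal sketch of chaining the UNK replacements above (assumptions:
# `words_train` holds the training vocabulary, FreqDist is imported, and
# `tagged` is a list of (tag, word) pairs with a ("START", "START") boundary
# marker; the example words "blorping" / "Xanathar" are made up).
tagged = [("START", "START"), ("VBG", "blorping"), ("NN", "Xanathar")]
for transform in (replace_with_UNKING, replace_with_UNKAR, replace_with_UNKCAP):
    tagged = transform(tagged)
# Rare or unseen words are now mapped to UNK-ING / UNK-AR / UNK-CAP pseudo-words.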
def clean_reviews_by_label(files, label):
    """
    Clean the reviews of one label from service words and add bigrams.
    """
    # word_matrix - per-review lists of all words and bigrams of informative words
    # all_words   - all unique words and bigrams (the rarest will be dropped)
    data = {'word_matrix': [], 'all_words': []}
    # temporary list of all words in all reviews and their bigrams
    # (the final list will be shorter because rare words are removed)
    allwords = []
    # create tokenizer
    tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')
    n = len(files)
    for i, filepath in enumerate(files):
        if (i + 1) % 1000 == 0:
            print('{}: {}/{} docs processed'.format(label, i + 1, n))
        # read the review and tokenize it
        with open(filepath) as f:
            bag_words = tokenizer.tokenize(f.read())
        # get the part of speech for each word
        lower_words = get_part_of_speech(bag_words)
        # drop service words
        informative_words = choose_informative_words(lower_words)
        # build the list of important tokens from the words themselves and their bigrams
        tokens_bigrams_list = list(bigrams(informative_words)) + informative_words
        # store the per-review token list so document frequencies can be computed later
        data['word_matrix'].append(tokens_bigrams_list)
        # add the words to the big list of all words in all reviews
        allwords.extend(informative_words)
    # find frequencies for all words
    frequencies = FreqDist(allwords)
    # find the least frequent words (hapaxes) ...
    hapaxes = frequencies.hapaxes()
    # ... and remove them
    data['all_words'] = list(set(allwords) - set(hapaxes))
    return {label: data}
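
# A minimal call sketch (assumptions: the review files live under
# ./reviews/pos/, and get_part_of_speech / choose_informative_words are
# defined elsewhere in this module, as the function above requires).
import glob

pos_files = glob.glob('./reviews/pos/*.txt')
pos_data = clean_reviews_by_label(pos_files, 'pos')
print(len(pos_data['pos']['all_words']))  # vocabulary size after dropping hapaxes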
ascii_tokens = []
for token in corpus_tokenized:
    # keep only tokens that are pure ASCII and contain no digits
    try:
        token.decode('ascii')
        if not contains_digits(token):
            ascii_tokens.append(token)
    except UnicodeDecodeError:
        continue

ascii_tokens_lowered = [token.lower() for token in ascii_tokens]

fdist = FreqDist(ascii_tokens)
fdist_lowered = FreqDist(ascii_tokens_lowered)

hapaxes = fdist.hapaxes()
print('Number of hapaxes before trimming: ' + str(len(hapaxes)))

lowered_hapaxes = fdist_lowered.hapaxes()
lowered_hapax_dict = {}
for lowered_hapax in lowered_hapaxes:
    lowered_hapax_dict[lowered_hapax] = True

# Keep only hapaxes whose lowercased form is still a hapax, i.e. remove words
# that are hapaxes only because of capitalization.
tmp_hapaxes = []  # necessary because removing from hapaxes while looping through it caused a subtle bug
for hapax in hapaxes:
    if hapax.lower() in lowered_hapax_dict:
        tmp_hapaxes.append(hapax)
hapaxes = tmp_hapaxes
print('Number of hapaxes after trimming: ' + str(len(hapaxes)))

# Tweet a random hapax
from nltk import FreqDist
from nltk.corpus import brown
import matplotlib.pyplot as plot
import pylab
from math import log

# Get the case-insensitive words from the Brown corpus
case_inses_words = [word.lower() for word in brown.words()]
no_of_tokens = len(case_inses_words)
print("Total No of Tokens in Brown Corpus ", no_of_tokens)

# Pass them to FreqDist to get the frequency distribution
fdist = FreqDist(case_inses_words)
print(fdist)

# Compute the percentage of hapax legomena occurrences and the longest among them
hapax_legomenas = fdist.hapaxes()  # words that appear exactly once in the corpus
hapax_legomena_counts = len(hapax_legomenas)  # their count
percentage_of_hapax_legomena = (hapax_legomena_counts / no_of_tokens) * 100
print("Percentage of Hapax Legomena Occurrences", percentage_of_hapax_legomena)
max_len_happax_legomena = max([len(word) for word in hapax_legomenas])
print("Longest Hapax Legomena are",
      [word for word in hapax_legomenas if len(word) == max_len_happax_legomena])

# Compute the percentage of dis legomena occurrences and the longest among them
dis_legomenas = [key for key, value in fdist.items() if value == 2]  # words that occur exactly twice
dis_legomena_counts = len(dis_legomenas) * 2  # token count (each type occurs twice)
percentage_of_dis_legomena = (dis_legomena_counts / no_of_tokens) * 100
print("Percentage of Dis Legomena Occurrences", percentage_of_dis_legomena)
max_len_dis_legomena = max([len(word) for word in dis_legomenas])
print("Longest Dis Legomena are ",
      [word for word in dis_legomenas if len(word) == max_len_dis_legomena])

# Plot the r vs Nr graph
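
# A possible sketch of that plot (assumption: "r vs Nr" means frequency r
# against the number of word types Nr occurring exactly r times, on log-log axes).
freq_of_freqs = FreqDist(fdist.values())  # maps r -> Nr
rs = sorted(freq_of_freqs.keys())
plot.plot([log(r) for r in rs], [log(freq_of_freqs[r]) for r in rs])
plot.xlabel("log r (frequency)")
plot.ylabel("log Nr (number of types with frequency r)")
plot.show()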
def frequencyDistribution(tokenWords):
    freqWords = FreqDist(tokenWords)
    print(freqWords.most_common(10))
    print(freqWords.hapaxes())
    pt.plot()
    freqWords.plot(30, cumulative=True)
print() """ print('Frequency distributions') from nltk.probability import FreqDist print('FreqDist of Batman theme song') batman_fd = FreqDist( nltk.word_tokenize( 'na na na na na na na na na na na na na na na na Bat Man!')) print(batman_fd.most_common(5)) print() input() print('How about words that only occur once? ("singletons" or "hapaxes")') print(batman_fd.hapaxes()) print() input() print('What are the 50 most common words in Genesis?') fd3 = FreqDist(text3) print(fd3.most_common(50)) print() input() print('How frequent is "prayed" in Genesis?') # FreqDist objects are sub-types of dict print(fd3['prayed']) print()
class StyloDocument(object):

    DEFAULT_AUTHOR = "Unknown"

    def __init__(self, file_content, author=DEFAULT_AUTHOR):
        self.author = author.strip()
        self.raw_content = file_content
        self.file_content = file_content.lower()
        self.tokens = PortugueseTextualProcessing.tokenize(self.file_content)
        self.text = Text(self.tokens)
        self.fdist = FreqDist(self.text)
        self.sentences = sent_tokenize(self.file_content, language='portuguese')
        self.sentence_chars = [len(sent) for sent in self.sentences]
        self.sentence_word_length = [len(sent.split()) for sent in self.sentences]
        self.paragraphs = [p for p in self.file_content.split("\n\n") if len(p) > 0 and not p.isspace()]
        self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]
        self.punctuation = [".", ",", ";", "-", ":"]
        self.ner_entities = ['ABSTRACCAO', 'ACONTECIMENTO', 'COISA', 'LOCAL', 'ORGANIZACAO', 'OBRA',
                             'OUTRO', 'PESSOA', 'TEMPO', 'VALOR']
        self.white_spaces = len(self.file_content.split(' '))
        self.rich_tags = RichTags(PortugueseTextualProcessing.get_rich_tags(self.file_content), len(self.text))
        self.tagged_sentences = PortugueseTextualProcessing.postag(self.tokens)
        self.tagfdist = FreqDist([b for [(a, b)] in self.tagged_sentences])
        self.ner_tags = PortugueseTextualProcessing.ner_chunks(self.tokens)
        self.ner_ftags = FreqDist(self.ner_tags)
        self.spell = SpellChecker(language='pt')
        self.ROUNDING_FACTOR = 4
        self.LINE_BREAKS = ['\n', '\t', '\r']

    def get_tag_count_by_start(self, tag_start):
        count = 0
        for tag in self.tagfdist.keys():
            if tag.startswith(tag_start):
                count += self.tagfdist[tag]
        return count

    def get_class_frequency_by_start(self, tag_start):
        return self.get_tag_count_by_start(tag_start) / self.tagfdist.N()

    def get_total_not_found(self):
        """The wn lookup has not been reliable so far."""
        nf_tokens = self.get_tokens_by_tag('notfound')
        return len([i for i in nf_tokens if len(wn.synsets(i, lang='por')) == 0])

    def tag_frequency(self, tag):
        return self.tagfdist.freq(tag)

    def entity_frequency(self, tag):
        return self.ner_ftags.freq(tag)

    def get_tokens_by_tag(self, tag):
        return [i[0][0] for i in self.tagged_sentences if i[0][1] == tag]

    def get_long_sentence_freq(self):
        return (len([i for i in self.sentence_word_length
                     if i < PortugueseTextualProcessing.LONG_SENTENCE_SIZE])) / len(self.sentences)

    def get_short_sentence_freq(self):
        return (len([i for i in self.sentence_word_length
                     if i < PortugueseTextualProcessing.SHORT_SENTENCE_SIZE])) / len(self.sentences)

    def get_long_short_sentence_ratio(self):
        """RF for PAN 15."""
        return len([i for i in self.sentence_word_length
                    if i < PortugueseTextualProcessing.LONG_SENTENCE_SIZE]) / \
               (len([i for i in self.sentence_word_length
                     if i < PortugueseTextualProcessing.SHORT_SENTENCE_SIZE]))

    def get_sentence_starting_tags_ratio(self, tag):
        count = [i[0][1] for i in self.tagged_sentences].count(tag)
        return count / len(self.sentences)

    def term_per_hundred(self, term):
        """
        term      X
        -----  = -----
          N       100
        """
        return (self.fdist[term] * 100) / self.fdist.N()

    def mean_sentence_len(self):
        return np.mean(self.sentence_word_length)

    def std_sentence_len(self):
        return np.std(self.sentence_word_length)

    def mean_paragraph_len(self):
        return np.mean(self.paragraph_word_length)

    def std_paragraph_len(self):
        return np.std(self.paragraph_word_length)

    def flesh_index(self):
        idx, value = PortugueseTextualProcessing().get_ptBR_flesch_index(self.tokens, self.get_phrases())
        return idx

    def vocabulary(self):
        return [v for v in sorted(set(self.sentences)) if v not in self.punctuation]

    def mean_word_len(self):
        words = set(word_tokenize(self.file_content, language='portuguese'))
        word_chars = [len(word) for word in words]
        return sum(word_chars) / float(len(word_chars))

    def max_word_len(self):
        words = set(word_tokenize(self.file_content, language='portuguese'))
        return max([len(word) for word in words])

    def type_token_ratio(self):
        return (len(set(self.text)) / len(self.text)) * 100

    def unique_words_per_hundred(self):
        return self.type_token_ratio() / 100.0 * 100.0 / len(self.text)

    def document_len(self):
        return sum(self.sentence_chars)

    def get_phrases(self):
        return [i for i in self.file_content.split('.') if i != '']

    def mean_syllables_per_word(self):
        _, syllable_count = PortugueseTextualProcessing().get_syllable_counts(self.tokens)
        return syllable_count / len(self.tokens)

    def characters_frequency(self, character_list):
        return self.frequency([word for word in self.file_content if word in character_list])

    def digits_frequency(self):
        return self.frequency([word for word in self.file_content if word.isdigit()])

    def line_breaks_frequency(self):
        return self.frequency([word for word in self.file_content if word in self.LINE_BREAKS])

    def count_consonant_frequency(self):
        character_list = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's',
                          't', 'v', 'w', 'y', 'x', 'z']
        return self.frequency([word for word in self.file_content if word in character_list])

    def camel_case_frequency(self):
        return self.frequency([word for word in self.raw_content.split(' ')
                               if word and word[0].isupper() and (len(word) == 1 or word[1].islower())])

    def local_hapax_legommena_frequency(self):
        return (len(self.fdist.hapaxes())) / len(self.text.tokens)

    def collocations_frequency(self, size):
        """Words that often appear consecutively within the window_size."""
        return (len(self.text.collocation_list(window_size=size))) / len(self.text.tokens)

    def most_frequent_word_size(self):
        return FreqDist(len(w) for w in self.text).max()

    def mean_frequent_word_size(self):
        return FreqDist(len(w) for w in self.text).most_common(3)[1][0]

    def guiraud_R_measure(self):
        return (len(set(self.text))) / math.sqrt(len(self.text))

    def herdan_C_measure(self):
        # log V(N) / log N
        return (math.log2(len(set(self.text)))) / math.log2(len(self.text))

    def herdan_V_measure(self):
        # N ^ C
        return math.pow(len(self.text), self.herdan_C_measure())

    def K_measure(self):
        # log V(N) / log(log(N))
        return (math.log2(len(set(self.text)))) / math.log2(math.log2(len(self.text)))

    def dugast_U_measure(self):
        # log ^ 2 N / (log(N) - log V(N))
        return (math.pow(math.log2(len(self.text)), 2)) / (math.log2(len(self.text)) - math.log2(len(set(self.text))))

    def maas_A_measure(self):
        # a ^ 2 = (log N - log V(N)) / log ^ 2 N
        return math.sqrt((math.log2(len(self.text)) - math.log2(len(set(self.text)))) /
                         math.pow(math.log2(len(self.text)), 2))

    def LN_measure(self):
        # (1 - V(N) ^ 2) / (V(N) ^ 2 * log N)
        return (1 - math.pow(len(set(self.text)), 2)) / (math.pow(len(set(self.text)), 2) * math.log2(len(self.text)))

    def honores_H_measure(self):
        return (len(self.fdist.hapaxes())) / len(set(self.text))

    def spell_miss_check_frequency(self):
        return self.frequency(self.spell.unknown(self.text))

    def noun_phrases(self):
        return PortugueseTextualProcessing().get_number_of_noun_phrases(self.tokens) / len(self.text)

    def verb_phrases(self):
        return self.frequency(PortugueseTextualProcessing().get_number_of_verb_phrases(self.file_content))

    def monosyllables(self):
        return PortugueseTextualProcessing().get_monosyllable_counts(self.tokens) / len(self.text)

    def repeated_words_frequency(self):
        repeated_words = list(filter(lambda x: x[1] >= 2,
                                     FreqDist(PortugueseTextualProcessing().remove_stopwords(self.tokens)).items()))
        return self.frequency(repeated_words)

    def stop_word_freq(self):
        clean_words = PortugueseTextualProcessing().remove_stopwords(self.tokens)
        return (len(self.tokens) - len(clean_words)) / len(self.text)

    def get_logical_operator_frequency(self):
        return self.frequency([token for token in self.tokens
                               if token in PortugueseTextualProcessing.LOGICAL_OPERATORS])

    def get_tags_freq(self, tags):
        count = 0
        for tag in tags:
            count += self.get_tag_count_by_start(tag)
        return count / len(self.tokens)

    def find_quotes(self):
        """Improve this method to retrieve quotes based on patterns and special words,
        e.g.: p.43; segundo (autor, ano)
        """
        return self.characters_frequency(['“', '”'])

    def frequency(self, input_values):
        return len(input_values) / len(self.text)

    @classmethod
    def csv_header(cls):
        return (
            ['DiversidadeLexica', 'TamanhoMedioDasPalavras', 'TamanhoMedioSentencas', 'StdevSentencas',
             'TamanhoMedioParagrafos', 'StdevTamParagrafos', 'FrequenciaDeParagrafos',
             'FrequenciaPalavrasDuplicadas', 'MediaSilabasPorPalavra', 'Monossilabas',
             'Ponto', 'Virgulas', 'Exclamacoes', 'DoisPontos', 'Citacoes', 'QuebrasDeLinha', 'Digitos',
             'Adjetivos', 'Adverbios', 'Artigos', 'Substantivos', 'Preposicoes', 'Verbos', 'VerbosPtcp',
             'Conjuncoes', 'Pronomes', 'PronomesPorPreposicao', 'TermosNaoTageados', 'PalavrasDeConteudo',
             'PalavrasFuncionais', 'FrasesNominais', 'FrasesVerbais', 'GenMasc', 'GenFem', 'SemGenero',
             'Singular', 'Plural', 'PrimeiraPessoa', 'TerceiraPessoa', 'Passado', 'Presente', 'Futuro',
             'TotalEntidadesNomeadas', 'EntAbstracao', 'EntAcontecimento', 'EntCoisa', 'EntLocal',
             'EntOrganizacao', 'EntObra', 'EntOutro', 'EntPessoa', 'EntTempo', 'EntValor',
             'GuiraudR', 'HerdanC', 'HerdanV', 'MedidaK', 'DugastU', 'MaasA', 'HonoresH',
             'PalavrasErroOrtografico', 'HapaxLegomenaLocal', 'PalavrasComunsTam2', 'PalavrasComunsTam3',
             'PalavrasComunsTam4', 'StopWords', 'BRFleshIndex', 'OperadoresLogicos', 'PalavrasCapitalizadas',
             'Author']
        )

    def csv_output(self):
        # TODO: Separate features into syntactical, lexical and so on..
        # 69 features + 1 class
        return "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}," \
               "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},'{}'".format(
            # Text style features - 10
            round(self.type_token_ratio(), self.ROUNDING_FACTOR),
            round(self.mean_word_len(), self.ROUNDING_FACTOR),
            round(self.mean_sentence_len(), self.ROUNDING_FACTOR),
            round(self.std_sentence_len(), self.ROUNDING_FACTOR),
            round(self.mean_paragraph_len(), self.ROUNDING_FACTOR),
            round(self.std_paragraph_len(), self.ROUNDING_FACTOR),
            len(self.paragraphs) / len(self.text),
            round(self.repeated_words_frequency(), self.ROUNDING_FACTOR),
            self.mean_syllables_per_word(),
            self.monosyllables(),
            # Term count features - 7
            self.term_per_hundred('.'),
            self.term_per_hundred(','),
            self.term_per_hundred('!'),
            self.term_per_hundred(':'),
            self.find_quotes(),
            self.line_breaks_frequency(),
            self.digits_frequency(),
            # POSTAG features - 24
            self.tag_frequency('ADJ'),
            self.tag_frequency('ADV'),
            self.tag_frequency('ART'),
            self.tag_frequency('N'),
            self.tag_frequency('PREP'),
            self.tag_frequency('PCP'),  # participle verbs
            self.get_class_frequency_by_start('V'),
            self.get_class_frequency_by_start('K'),  # conjunctions
            self.get_class_frequency_by_start('PRO'),
            self.get_class_frequency_by_start('PRO') / self.tag_frequency('PREP'),  # used in French texts
            self.tag_frequency('notfound'),
            self.get_tags_freq(PortugueseTextualProcessing.CONTENT_TAGS),
            self.get_tags_freq(PortugueseTextualProcessing.FUNCTIONAL_TAGS),
            round(self.noun_phrases(), self.ROUNDING_FACTOR),
            round(self.verb_phrases(), self.ROUNDING_FACTOR),
            self.rich_tags.get_male(),
            self.rich_tags.get_female(),
            self.rich_tags.get_unspecified_gender(),
            self.rich_tags.get_singular(),
            self.rich_tags.get_plural(),
            self.rich_tags.get_first_person(),
            self.rich_tags.get_third_person(),
            self.rich_tags.get_past_tense(),
            self.rich_tags.get_present_tense(),
            self.rich_tags.get_future_tense(),
            # NER features - 11
            round(len(self.ner_tags) / len(self.tokens), self.ROUNDING_FACTOR),
            self.entity_frequency('ABSTRACCAO'),
            self.entity_frequency('ACONTECIMENTO'),
            self.entity_frequency('COISA'),
            self.entity_frequency('LOCAL'),
            self.entity_frequency('ORGANIZACAO'),
            self.entity_frequency('OBRA'),
            self.entity_frequency('OUTRO'),
            self.entity_frequency('PESSOA'),
            self.entity_frequency('TEMPO'),
            self.entity_frequency('VALOR'),
            # Vocabulary diversity features - 7
            round(self.guiraud_R_measure(), self.ROUNDING_FACTOR),
            round(self.herdan_C_measure(), self.ROUNDING_FACTOR),
            round(self.herdan_V_measure(), self.ROUNDING_FACTOR),
            round(self.K_measure(), self.ROUNDING_FACTOR),
            round(self.dugast_U_measure(), self.ROUNDING_FACTOR),
            round(self.maas_A_measure(), self.ROUNDING_FACTOR),
            round(self.honores_H_measure(), self.ROUNDING_FACTOR),
            # Misc features - 9
            self.spell_miss_check_frequency(),
            round(self.local_hapax_legommena_frequency(), self.ROUNDING_FACTOR),
            self.collocations_frequency(2),
            self.collocations_frequency(3),
            self.collocations_frequency(4),
            round(self.stop_word_freq(), self.ROUNDING_FACTOR),
            self.flesh_index(),
            self.get_logical_operator_frequency(),
            self.camel_case_frequency(),
            self.author,
        )

    def legacy_features(self):
        """Features removed from the output, kept here for future reference."""
        # self.count_characters_frequency(['a']),
        # self.count_characters_frequency(['e']),
        # self.count_characters_frequency(['i']),
        # self.count_characters_frequency(['o']),
        # self.count_characters_frequency(['u']),
        # self.count_consonant_frequency(),
        # self.mean_frequent_word_size(),
        # self.max_word_len(),
        # self.document_len(),
        # round(self.LN_measure(), 8)
        pass
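
# A minimal usage sketch (assumptions: 'artigo.txt' is a Portuguese text file and
# the project-local helpers referenced above, such as PortugueseTextualProcessing
# and RichTags, are importable in this module).
with open('artigo.txt', encoding='utf-8') as f:
    doc = StyloDocument(f.read(), author='Autor Desconhecido')
print(','.join(StyloDocument.csv_header()))
print(doc.csv_output())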
stemmed_list_porter = [porter.stem(t) for t in token_union]
stemmed_list_lancaster = [lancaster.stem(t) for t in token_union]
stemmed_list_snowball = [snowball.stem(t) for t in token_union]
df = pandas.DataFrame(
    data={
        'original_token': original_token_list,
        'porter': stemmed_list_porter,
        'lancaster': stemmed_list_lancaster,
        'snowball': stemmed_list_snowball
    })

# Task 4
freq_dist = FreqDist(casual_tokenize(file_str))
most_common = freq_dist.most_common(10)
hapaxes = freq_dist.hapaxes()
freq_dist.plot()
# First of all, we see that most words are hapaxes because the corpus is small.
# For the same reason, increasingly visible "jumps" appear as the curve approaches y=1.
# With a gigantic corpus, the plot would look approximately linear.
# The plot shows that words are either very frequent or hapaxes, which again indicates a small corpus.

# Task 5
pos = pos_tag(casual_tokenize(file_str))
pos_dict = {tag: [] for (word, tag) in pos}
for (word, tag) in pos:
    pos_dict[tag].append(word)
pos_tag_fd = FreqDist(tag for (word, tag) in pos).most_common()
' '.join(['Monty', 'Python'])  # join a list into a str
'Monty Python'.split()  # split a str into a list

import matplotlib.pyplot as plt
import pandas as pd
from nltk.probability import FreqDist

fdist_moby = FreqDist(moby_dick)  # frequency distribution
fdist_bible = FreqDist(bible)
fdist_chat = FreqDist(chat)
print(fdist_moby.most_common(10))
fdist_moby.plot(50, cumulative=True)
plt.show()
print(fdist_bible.most_common(10))
print(fdist_chat.most_common(10))
print(len(fdist_moby.hapaxes()))  # words that occur only once

long_words = [word for word in moby_dick if len(word) >= 15]  # long words
long_words = sorted(set([w for w in chat if len(w) > 6 and fdist_chat[w] > 7]))
# long words with more conditions: word length and number of occurrences via FreqDist

from nltk import *
print(list(bigrams(['more', 'is', 'said', 'than', 'done'])))
print('; '.join(chat.collocation_list()))  # common bigrams
fmoby = FreqDist([len(w) for w in moby_dick])  # freq dist of word lengths!
print(fmoby.items())  # keys-values: length, number of words of that length
# page 44 for more functions!
# page 45 for word comparison operators
"González volvieron a estallar las redes con un beso “No se logró ni funcionó porque él no quiso”, " \ "Lina Tejeiro sobre su relación con Andy Rivera Estos son los estrenos que trae Netflix en julio " \ "“El cemento puede esperar, la prioridades contener la pandemia”: Mello Castro Fiesta y concurso de ‘El más " \ "comelón’ en La Paz durante el toque de queda Envían a la cárcel a encargados de laboratorio de coca " \ "en Chimichagua Investigan cerco epidemiológico de primer caso de covid-19 en Manaure Cierran barrios de " \ "Riohacha donde se presentan mayores brotes de la covid-19 Casas de apuestas en Colombia, un negocio en " \ "constante auge Conozca Skrill, una de las plataformas más reconocidas para comprar criptomonedas Falla " \ "mundial en WhatsApp: no muestra última conexión Pasos para descargar WhatsApp Plus gratis" tokenizer = nltk.RegexpTokenizer(r"\w+") new_words = tokenizer.tokenize(text) print(new_words) tokenized_word = word_tokenize(text) print(tokenized_word) fdist = FreqDist(new_words) print(fdist) filtered_sent = [] for w in new_words: if w not in stop_words: filtered_sent.append(w) print("Tokenized Sentence:", new_words) print("Filterd Sentence:", filtered_sent) fdist = FreqDist(filtered_sent) print(fdist['días']) print(fdist.hapaxes()) print(fdist.most_common(10))
text = []  # aggregate all tokens into a list
for line in file_source:
    line = line.lower()
    # print(TweetTokenizer().tokenize(line))
    text = text + TweetTokenizer().tokenize(line)

# Create the frequency distribution
fdist = FreqDist(text)
total_tokens = fdist.N()
unique_tokens = fdist.B()

# Print distribution properties
print("The number of total tokens:", total_tokens)
print("The number of unique tokens:", unique_tokens)
print("The type/token ratio:", (unique_tokens + 0.0) / total_tokens)
print("Number of tokens that only appear once:", len(fdist.hapaxes()))

print("\nTokens that only appear once:")
print("=======================")
for w in fdist.hapaxes():
    print(w)

print("\nThe most common tokens:")
print("=======================")
for w, n in fdist.most_common(150000):
    out = str(n) + '\t' + w
    print(out)
    file_dist.write(out + '\n')