def find_best_words(positiveWords, negativWords, dimention_num):
    """Select the `dimention_num` features most associated with either polarity.

    The candidate feature set is each polarity's unigrams plus its 5000
    best bigram collocations (by chi-square); each candidate is scored by
    the sum of its chi-square association with the positive and negative
    sub-corpora.
    """
    chi_sq = BigramAssocMeasures.chi_sq
    # Unigrams plus top-5000 bigram collocations for each polarity.
    pos_feats = positiveWords + BCF.from_words(positiveWords).nbest(chi_sq, 5000)
    neg_feats = negativWords + BCF.from_words(negativWords).nbest(chi_sq, 5000)
    word_fd = FreqDist(pos_feats + neg_feats)
    pos_fd = FreqDist(pos_feats)
    neg_fd = FreqDist(neg_feats)
    pos_total = pos_fd.N()
    neg_total = neg_fd.N()
    grand_total = pos_total + neg_total
    scores = {}
    for feature, overall_freq in word_fd.items():
        association = chi_sq(pos_fd[feature], (overall_freq, pos_total), grand_total)
        association += chi_sq(neg_fd[feature], (overall_freq, neg_total), grand_total)
        scores[feature] = association
    # Highest combined association first; keep the requested dimensionality.
    return sorted(scores, key=scores.get, reverse=True)[:dimention_num]
def __FreqFromCorpus(self):
    r"""Extract unigram and bigram frequencies from the corpus and append the
    collocation log-likelihood of every bigram to self.__col_logl.

    Python 2 code (print statements). Progress is printed to stdout for
    every bigram processed.
    """
    print "Calcolo bigrams..."
    bi = FreqDist(bigrams(self.words))
    print "Calcolo FreqDist..."
    wfr = FreqDist(self.words)
    print "Coda di elaborazione..."
    print
    tot = len(bi.keys())
    i = 0
    for eles in bi.keys():
        # a, b: unigram counts of the two bigram members; ab: bigram count;
        # N: total number of word tokens in the corpus.
        a = wfr[eles[0]]
        b = wfr[eles[1]]
        ab = bi[eles]
        N = wfr.N()
        try:
            # NOTE(review): relies on a private nltk API
            # (PunktTrainer._col_log_likelihood) and constructs a fresh
            # trainer per bigram — works but is wasteful; confirm before
            # upgrading nltk.
            self.__col_logl.append(
                nltk.tokenize.punkt.PunktTrainer()._col_log_likelihood(
                    a, b, ab, N))
            print "elemento %d / %d \t -> \tloglikelihood di %s %s \t\t -> %f" % (
                i, tot, eles[0], eles[1], self.__col_logl[-1])
        except UnicodeEncodeError:
            # swallow encoding errors raised while printing non-ASCII tokens
            pass
        i += 1
def output(Token):
    """Print the 20 most frequent tokens with their counts and return them.

    Args:
        Token: iterable of hashable tokens to count.

    Returns:
        list: the 20 most frequent tokens, most common first.

    Fixes: removed a dead bare `freq` expression; replaced
    `most_common(freq.N())[0:20]` (N() is the total token count, not the
    number of bins — it only worked because N >= B) with `most_common(20)`;
    the collected list is now returned instead of being discarded.
    """
    freq = FreqDist(Token)
    Scenario_1_20 = []
    for pos, frequency in freq.most_common(20):
        Scenario_1_20.append(pos)
        print(pos, '---', frequency)
    return Scenario_1_20
def treatSentences(self):
    """Tokenize and stop-filter every raw sentence, then store each sentence
    with its words sorted by descending corpus frequency.

    Side effects:
        - sets self.fdist_dict: word -> corpus frequency (most frequent first)
        - appends one `sentence` object per raw sentence to
          self.tokenized_sentences
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # PERF FIX: stopwords.words('english') re-reads the corpus file on every
    # call and list membership is O(n); build a set once and reuse it.
    stopset = set(stopwords.words('english'))
    # PERF FIX: join instead of quadratic string += in a loop.
    allText = ''.join(s.sent.lower() + '.' for s in self.rawSentences)
    tokenizedText = tokenizer.tokenize(allText)
    # Drop stopwords and the "@card@" placeholder token.
    filtered_tokenized_words = [
        word for word in tokenizedText
        if word not in stopset and word != "@card@"
    ]
    fdist_words = FreqDist(filtered_tokenized_words)
    # most_common() with no argument returns every (word, count) pair in
    # descending-frequency order (same result as the old most_common(N())).
    self.fdist_dict = dict(fdist_words.most_common())
    # Per-sentence filtering keeps only the stopword test (the original did
    # not exclude "@card@" here — preserved).
    filtered_tokenized_sentences = [
        sentence(
            [word for word in tokenizer.tokenize(s.sent.lower())
             if word not in stopset],
            s.number, s.doc)
        for s in self.rawSentences
    ]
    for filtered_sentence in filtered_tokenized_sentences:
        self.tokenized_sentences.append(
            sentence(
                sorted(filtered_sentence.list,
                       key=lambda x: self.fdist_dict.get(x),
                       reverse=True),
                filtered_sentence.number,
                filtered_sentence.doc))
def textToLFP(sentences, step=1000, last=2000):
    '''Return the proportion of all tokens covered by successive frequency
    bands of width `step`, plus one final open-ended band from rank `last`.

    we are not lowercasing, tokenizing, removing stopwords, numerals etc.
    this is because we are looking into algorithmic bias and as such into
    the effect of the algorithm on the text it is offered. The text is
    already tokenized. Might add Lowercasing too.'''
    # The sentences are pre-tokenized: merge them into one token list.
    tokens = " ".join(sentences).split()
    fdist = FreqDist(tokens)
    ranked = fdist.most_common()
    # Closed bands [0,step), [step,2*step), ... up to `last`, then the tail.
    bands = [ranked[start:start + step] for start in range(0, last, step)]
    bands.append(ranked[last:])
    total = fdist.N()
    percs = []
    for band in bands:
        band_mass = sum(count for _word, count in band)
        percs.append(band_mass / total)
    # plot_freqdist_freq(fdist, 20)
    return percs
def wordfreq(text, write):
    """Build a unigram count/probability table from a file of tweets.

    The first 90% of lines train the distribution; the remaining 10% are
    held out — held-out tokens already seen reinforce their counts, unseen
    ones are folded into the 'UNK' pseudo-token.

    Args:
        text: open, seekable file handle with one tweet per line.
        write: path of the output file; one "word count prob" line each.

    Returns:
        dict: word -> [count, probability] (includes an 'UNK' entry).

    Fixes: the output handle was opened at the top and never closed; it is
    now managed by `with`. The tokenizer is constructed once instead of
    once per line.
    """
    fdist = FreqDist()
    freq = {'UNK': [0, 0.0]}
    # Count lines so we can carve off the 10% held-out slice.
    linecnt = 0
    for line in text.readlines():
        linecnt += 1
    text.seek(0)
    heldout = linecnt * .1
    tokenizer = TweetTokenizer()
    # Train on the non-held-out portion.
    for _ in range(0, linecnt - int(heldout)):
        line = text.readline()
        for word in tokenizer.tokenize(line):
            fdist[word] += 1
    # Held-out portion: known words add to their counts, unknown -> UNK.
    for line in text.readlines():
        for word in tokenizer.tokenize(line):
            if fdist[word] != 0:
                fdist[word] += 1
            else:
                fdist['UNK'] += 1
    # Relative frequencies, written as "word count prob" lines.
    total = fdist.N()
    with open(write, 'w') as fw:
        for word, count in fdist.most_common():
            prob = float(count) / total
            freq[word] = [count, prob]
            fw.write(word + ' ' + str(count) + ' ' + str(prob) + '\n')
    return freq
def bigram_graph(tokens, numwords, words_title):
    """Using tokens, output word frequency and bigram plots

    tokens: Ordered list of word tokens
    numwords: Int, number of entries for x-axis
    words_title: Title of graph
    """
    # Unigram distribution, rescaled in place to relative frequencies.
    unigram_dist = FreqDist(tokens)
    unigram_total = float(unigram_dist.N())
    for key in unigram_dist:
        unigram_dist[key] /= unigram_total
    unigram_dist.plot(numwords, cumulative=False, title=words_title)
    # fdist.tabulate()

    # Bigram distribution over the same tokens, rescaled the same way.
    bigram_dist = nltk.FreqDist(create_bigrams(tokens))
    bigram_total = float(bigram_dist.N())
    for key in bigram_dist:
        bigram_dist[key] /= bigram_total
    bigram_dist.plot(numwords, cumulative=False, title=words_title)
def CalculateBestWordsStarRating(corpus, number_of_words):
    """Return the words most predictive of a star rating, by chi-square.

    Args:
        corpus: iterable of (words, label) documents; each entry of `words`
            is a "token:frequency" string and label is one of
            '1.0', '2.0', '4.0', '5.0'.
        number_of_words: how many top-scoring words to return.

    Returns:
        set: the `number_of_words` words with the highest summed chi-square
        association across the four rating classes.

    Fixes: per-occurrence `for i in range(int(frequency))` increment loops
    replaced by a single `+= int(frequency)` (same counts, O(1) per token);
    `word_fd.most_common(word_fd.N())` replaced by `word_fd.items()` —
    N() is the total token count, not the number of bins, and iteration
    order is irrelevant when filling a dict.
    """
    labels = ('1.0', '2.0', '4.0', '5.0')
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for document in corpus:
        words, label = document[0], document[1]
        for word in words:
            # Each entry encodes "token:count" for this document.
            token, frequency = word.split(":")
            token = token.lower()
            count = int(frequency)
            word_fd[token] += count
            label_word_fd[label][token] += count
    # Token totals per rating class and overall.
    label_counts = {label: label_word_fd[label].N() for label in labels}
    total_word_count = sum(label_counts.values())
    word_scores = {}
    # A word's score is the sum of its chi-square association with each
    # class; higher = more discriminative of some rating.
    for word, freq in word_fd.items():
        word_scores[word] = sum(
            BigramAssocMeasures.chi_sq(label_word_fd[label][word],
                                       (freq, label_counts[label]),
                                       total_word_count)
            for label in labels)
    best = sorted(word_scores.items(), key=operator.itemgetter(1),
                  reverse=True)[:number_of_words]
    return set(w for w, s in best)
def get_most_common_ngrams(self, n, nb_ngrams=None):
    """
    Compute and return the set of the most common ngrams in the documents.
    This set is cached inside the object.

    Args:
        n: The number of grams. Must be a positive interger.
        nb_ngrams: The number of ngrams to return, i.e quantifying the 'most'.

    Returns:
        A list of the most common ngrams.

    NOTE(review): Python 2 only — relies on dict-like .keys() being
    sliceable, .iteritems(), and tuple-unpacking lambdas.
    """
    try:
        # return cached value
        return self._most_common_ngrams[n]
    except KeyError:
        pass

    # compute all ngrams from both review fields of every training hit
    all_ngrams = []
    for document in self.training_set["hits"]["hits"]:
        if document["_source"]["external_review_report"] is not None:
            all_ngrams.extend(self.compute_ngrams(document["_source"]["external_review_report"], n))
        if document["_source"]["external_review_form"] is not None:
            all_ngrams.extend(self.compute_ngrams(document["_source"]["external_review_form"], n))

    # get the frequency or return all ngrams
    freq = FreqDist(ngram for ngram in all_ngrams)
    # store and return the nb_ngrams most common ngrams
    word_scores = {}
    if nb_ngrams:
        self._most_common_ngrams[n] = freq.keys()[:nb_ngrams]
        # NOTE(review): freq[word] == freqs here, so both chi_sq arguments
        # carry the same count, and the total is 2*N — looks like a
        # degenerate use of the measure; TODO confirm intent.
        for word, freqs in freq.iteritems():
            score = BigramAssocMeasures.chi_sq(freq[word], (freqs, freq.N()), freq.N() + freq.N())
            word_scores[word] = score
        self.best = []
        self.best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:n]
        self.bestwords = set([w for w, s in self.best])
    else:
        # NOTE(review): in this branch self.bestwords is never assigned —
        # the return below raises AttributeError unless a previous call
        # (with nb_ngrams set) already populated it; TODO confirm.
        self._most_common_ngrams[n] = freq.keys()
    return self.bestwords  #self._most_common_ngrams[n]
def bigramAnalysis(self):
    """Score tweet words by chi-square association with the Happy/Sad CSV
    corpora, keep the 500 best, and evaluate both unigram feature sets.

    NOTE(review): Python 2 only (.iteritems(), tuple-unpacking lambda).
    """
    label_word_fd = ConditionalFreqDist()
    word_fd = FreqDist()
    datafiles = [
        {'emo': "Sad", 'name': "/negative.csv"},
        {'emo': "Happy", 'name': "/positive.csv"}
        # , {'emo': 'Happy', 'name': "/trust.csv"}, {'emo': 'Sad', 'name': "/anger.csv"}
    ]
    for value in datafiles:
        emo = value['emo']
        name = value['name']
        read = self.readFile(name)
        normalized_sentences = [s.lower() for s in read['tweets']]
        for statement in normalized_sentences:
            for word in statement.split():
                # NOTE(review): `wor` is computed but never used — the
                # stopset test and the counts below use `word` as-is
                # (already lowercased via normalized_sentences);
                # TODO confirm this was the intent.
                wor = word.lower()
                if word not in stopset:
                    word_fd[word] += 1
                    label_word_fd[emo][word] += 1
                    # word_fd.inc(word.lower())
    word_scores = {}
    pos_word_count = label_word_fd['Happy'].N()
    neg_word_count = label_word_fd['Sad'].N()
    total_word_count = word_fd.N()
    # chi-square association of every word with each emotion class.
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['Happy'][word], (freq, pos_word_count),
            total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['Sad'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    # Keep the 500 highest scoring words as the feature vocabulary.
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s,
                  reverse=True)[:500]
    self.bestwords = set([w for w, s in best])
    print("\n\nevaluating best word features")
    self.unigramAnalysis(self.best_word_feats)
    print("\n\nBigram + bigram chi_sq word ")
    self.unigramAnalysis(self.best_bigram_word_feats)
def calculo_frecuencias(bag_of_words):
    """Print frequency statistics for a bag of words and plot the 50 most
    frequent items.

    Args:
        bag_of_words: list of strings.
    """
    fd = FreqDist(bag_of_words)
    # Total samples, distinct samples, and the single most frequent one.
    print("Nº. objetos: %d" % fd.N())
    print("Nº. objetos únicos: %d" % fd.B())
    print("El objeto más frecuente es: %s" % str(fd.max()))
    fd.plot(50)
def normalized_sorted_frequency_distribution(words):
    """Return an OrderedDict mapping each word to its relative frequency,
    ordered from most to least frequent."""
    fd = FreqDist(words)
    total = float(fd.N())
    ranked = sorted(fd, key=fd.get, reverse=True)
    return OrderedDict((word, fd[word] / total) for word in ranked)
def generate_unigram_model(corpus, vocab):
    """Estimate a unigram probability model restricted to `vocab`, fold all
    out-of-vocabulary mass into '<unk>', and dump it as JSON under
    dir + 'mask/unigram_model.json'."""
    fd = FreqDist(corpus)
    total = fd.N()
    # Keep only in-vocabulary counts.
    model = {token: fd[token] for token in fd.keys() if token in vocab}
    # Remaining mass belongs to the unknown-word pseudo-token.
    model['<unk>'] = total - sum(model.values())
    # Normalise counts to probabilities.
    for token in model.keys():
        model[token] = model[token] / total
    with open(dir + 'mask/unigram_model.json', 'w', encoding='utf8') as f:
        json.dump(model, f)
def binary_stump(feature_name, feature_value, labeled_featuresets):
    """Build a one-level decision stump that tests whether `feature_name`
    equals `feature_value`, labelling each branch with its majority label."""
    # Overall majority label — the stump's fallback decision.
    label = FreqDist(lbl for (_fs, lbl) in labeled_featuresets).max()

    # Count labels on each side of the feature test.
    match_fdist = FreqDist()
    nomatch_fdist = FreqDist()
    for featureset, lbl in labeled_featuresets:
        if featureset.get(feature_name) == feature_value:
            match_fdist[lbl] += 1
        else:
            nomatch_fdist[lbl] += 1

    decisions = {}
    default = label
    # But hopefully we have observations!
    if match_fdist.N() > 0:
        decisions = {feature_value: DecisionTreeClassifier(match_fdist.max())}
    if nomatch_fdist.N() > 0:
        default = DecisionTreeClassifier(nomatch_fdist.max())

    return DecisionTreeClassifier(label, feature_name, decisions, default)
def CalculateBestWords(corpus):
    """Return the 5000 words most associated with either sentiment class.

    Args:
        corpus: iterable of (words, label) documents; each entry of `words`
            is a "token:frequency" string and label is 'positive' or
            'negative'.

    Returns:
        set: the 5000 words with the highest summed chi-square association.

    Fixes: per-occurrence `for i in range(int(frequency))` increment loops
    replaced by `+= int(frequency)` (same counts, O(1) per token);
    `word_fd.most_common(word_fd.N())` replaced by `word_fd.items()` —
    N() is the total token count, not the number of bins.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for document in corpus:
        words, label = document[0], document[1]
        for word in words:
            # Each entry encodes "token:count" for this document.
            token, frequency = word.split(":")
            token = token.lower()
            count = int(frequency)
            word_fd[token] += count
            label_word_fd[label][token] += count
    # Token totals per class and overall.
    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    # chi-square association of each word with each class; a word's score
    # is the sum (higher = more discriminative).
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['negative'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=operator.itemgetter(1),
                  reverse=True)[:5000]
    return set(w for w, s in best)
def get_topic_freq(text, topic):
    """Count how often the (stemmed) `topic` word appears in `text` and
    return both the raw count and its density as a pandas Series."""
    # Tokenize, drop filtered tokens, stem what remains.
    stems = [pstem.stem(w) for w in word_tokenize(text) if w not in to_filter]
    fd = FreqDist(stems)
    topic_freq = fd[pstem.stem(topic)]
    total = fd.N()
    # An empty article yields density 0 rather than a division error.
    topic_density = topic_freq / total if total else 0
    return pd.Series({
        "topic_freq": topic_freq,
        "topic_density": topic_density
    })
def __init__(self, file):
    """Read the document, stem and stop-filter its words and sentences, and
    store each sentence with its words sorted by corpus frequency.

    Args:
        file: open text file handle for the document to analyse.

    Fixes: `stopwords.words('english')` was called once per word (each call
    re-reads the corpus and list membership is O(n)); it is now hoisted
    into a set built once.
    """
    self.tokenized_sentences = []
    # Opening file and replacing carriage return by space
    brexit_text = file.read().replace('\n', ' ')
    tokenizer = RegexpTokenizer(r'\w+')
    # Stemmer applied to every surviving word below.
    ps = PorterStemmer()
    tokenized_words = tokenizer.tokenize(brexit_text.lower())
    self.sentences = nltk.sent_tokenize(brexit_text.lower())
    # Build the stopword set once (see Fixes above).
    stopset = set(stopwords.words('english'))
    # Stem every non-stopword token of the whole document.
    filtered_tokenized_words = [
        ps.stem(word) for word in tokenized_words if word not in stopset
    ]
    # Same filtering/stemming, per sentence.
    filtered_tokenized_sentences = [
        [ps.stem(word) for word in tokenizer.tokenize(b_sentence)
         if word not in stopset]
        for b_sentence in self.sentences
    ]
    # Frequency dictionary over the stemmed, stop-filtered words, kept in
    # descending-frequency order.
    fdist_words = FreqDist(filtered_tokenized_words)
    self.fdist_dict = dict(fdist_words.most_common())
    for k in self.fdist_dict.keys():
        print(k + "," + str(self.fdist_dict[k]))
    # Store each sentence with its words sorted by corpus frequency.
    for i, filtered_sentence in enumerate(filtered_tokenized_sentences):
        self.tokenized_sentences.append(
            sentence(
                sorted(filtered_sentence,
                       key=lambda x: self.fdist_dict.get(x),
                       reverse=True),
                i, file))
def getCumulativePercentage(tags, topic, plot):
    """Compute (and optionally plot) the cumulative share of all tags covered
    by the N_MOST_FREQUENT most frequent tags."""
    dist = FreqDist(tags)
    top = dist.most_common(N_MOST_FREQUENT)
    freqwords = [word for word, _count in top]
    frequencies = [count for _word, count in top]
    total = dist.N()
    x = list(range(N_MOST_FREQUENT))
    # Per-tag share of the whole distribution, then its running sum.
    percentages = [count / float(total) for count in frequencies]
    cs = np.cumsum(percentages)
    if plot:
        plt.rc('xtick', labelsize=LABEL_SIZE)
        plt.xticks(x, freqwords)
        locs, labels = plt.xticks()
        plt.setp(labels, rotation=90)
        plt.gcf().subplots_adjust(bottom=0.4)
        plt.plot(x, percentages)
        plt.title('Accumulative percentage of tags covered by the most ' +
                  str(N_MOST_FREQUENT) + " frequent tags in " + topic)
        plt.plot(x, cs, 'r--')
        plt.show()
def best_word_feats(tweets, labels):
    """Return up to 50000 words ranked by chi-square association with the
    negative (label 0 -> '0') and positive (any other label -> '4') classes."""
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    tokenizer = TweetTokenizer()
    tokenized = [tokenizer.tokenize(tweet) for tweet in tweets]
    for tokens, label in zip(tokenized, labels):
        bucket = '0' if label == 0 else '4'
        for token in tokens:
            lowered = token.lower()
            word_fd[lowered] += 1
            label_word_fd[bucket][lowered] += 1

    total_word_count = word_fd.N()
    pos_word_count = label_word_fd['4'].N()
    neg_word_count = label_word_fd['0'].N()

    # A word's score is its summed chi-square association with both classes.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['4'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['0'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    ranked = sorted(word_scores.items(), key=itemgetter(1), reverse=True)
    return [word for word, _score in ranked][:50000]
class AddAlphaBigramModel():
    """Add-alpha (Lidstone) smoothed unigram/bigram language model trained
    on the NLTK Gutenberg corpus.

    Sentences are wrapped in "<s>" ... "</s>" markers and tokens are kept
    only if they match the module-level `wordRE`.
    """

    def __init__(self, alpha=0.1):
        self.vocabulary = set()
        self.V = 0  # vocabulary size, set by train()
        self.bigrams = ConditionalFreqDist([])
        self.unigrams = FreqDist([])
        # BUG FIX: was hard-coded `self.alpha = 0.1`, silently ignoring the
        # constructor argument.
        self.alpha = alpha

    def train(self):
        """Count unigrams and bigrams over every Gutenberg sentence."""
        self.vocabulary = set()
        this_bigrams = []
        self.unigrams = FreqDist([])
        for fileid in gutenberg.fileids():
            for sent in gutenberg.sents(fileid):
                # Keep word-like tokens only, lowercased, with sentence
                # boundary markers on both ends.
                words = ["<s>", ] + [x.lower() for x in sent
                                     if wordRE.search(x)] + ["</s>", ]
                this_bigrams += bigrams(words)
                self.vocabulary.update(words)
                self.unigrams.update(words)
        self.bigrams = ConditionalFreqDist(this_bigrams)
        self.V = len(self.vocabulary)

    def bigram_prob(self, w1, w2):
        """Return log P(w2 | w1) under add-alpha smoothing."""
        numerator = self.bigrams[w1][w2] + self.alpha
        denominator = self.bigrams[w1].N() + (self.alpha * self.V)
        return math.log(numerator / denominator)

    def unigram_prob(self, w):
        """Return log P(w) under add-alpha smoothing."""
        numerator = self.unigrams[w] + self.alpha
        denominator = self.unigrams.N() + (self.alpha * self.V)
        return math.log(numerator / denominator)

    def __contains__(self, w):
        """Membership test against the trained vocabulary."""
        return w in self.vocabulary
def getFreqDist(self):
    """Tokenize self.text, clean it, print corpus statistics, and write a
    Word/Frequency CSV to self.csvfile.

    Fixes: the CSV file was opened in binary mode ('wb'), which makes
    csv.DictWriter raise TypeError under Python 3; it is now opened in
    text mode with newline='' (the csv-module requirement, also avoiding
    blank rows on Windows).
    """
    fieldnames = ['Word', 'Frequency']
    with open(self.csvfile, 'w', newline='') as csvf:
        writer = csv.DictWriter(csvf, fieldnames=fieldnames)
        writer.writeheader()
        text = self.text
        # set stopwords
        stopwords = set(nltk.corpus.stopwords.words('english'))
        words = word_tokenize(text)
        # remove words if length of word is not over 1 (i.e. punctuation)
        words = [word for word in words if len(word) > 1]
        # remove numbers
        words = [word for word in words if not word.isnumeric()]
        # make all words lowercase
        words = [word.lower() for word in words]
        # remove stopwords
        words = [word for word in words if word not in stopwords]
        fdist = FreqDist(words)
        # number of all words
        print('Total number of samples: %i' % fdist.N())
        # number of all distinct words
        print('Total number of bins: %i' % fdist.B())
        # write all bins and count into CSV file
        for word, frequency in fdist.most_common(fdist.B()):
            writer.writerow({'Word': word, 'Frequency': frequency})
class TnT(TaggerI):
    """
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()

    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    To speed up and get more precision, we can use log addition
    to instead multiplication, specifically:

      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
             log(P(t_T+1|t_T))

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    """

    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        """
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type unk: TaggerI
        :param Trained: Indication that the POS tagger is trained or not
        :type Trained: bool
        :param N: Beam search degree (see above)
        :type N: int
        :param C: Capitalization flag
        :type C: bool

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        """
        # Unigram, bigram and trigram tag distributions, word->tag counts,
        # and end-of-sentence tag counts; all filled by train().
        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()
        self._wd = ConditionalFreqDist()
        self._eos = ConditionalFreqDist()
        # Interpolation weights, computed by _compute_lambda().
        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0
        self._N = N
        self._C = C
        self._T = Trained
        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        """
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified,
        it is trained on the same data.

        :param data: List of lists of (word, tag) tuples
        :type data: tuple(str)
        """
        # Ensure that local C flag is initialized before use
        C = False

        if self._unk is not None and self._T == False:
            self._unk.train(data)

        for sent in data:
            # Two BOS markers so the trigram context is defined from word 1.
            history = [("BOS", False), ("BOS", False)]
            for w, t in sent:

                # if capitalization is requested,
                # and the word begins with a capital
                # set local flag C to True
                if self._C and w[0].isupper():
                    C = True

                self._wd[w][t] += 1
                self._uni[(t, C)] += 1
                self._bi[history[1]][(t, C)] += 1
                self._tri[tuple(history)][(t, C)] += 1

                history.append((t, C))
                history.pop(0)

                # set local flag C to false for the next word
                C = False

            # NOTE(review): `t` is the last tag of the sentence here; an
            # empty sentence would raise NameError — inputs are assumed
            # non-empty.
            self._eos[t]["EOS"] += 1

        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

    def _compute_lambda(self):
        """
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)
        """

        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            for tag in self._tri[history].keys():

                # if there has only been 1 occurrence of this tag in the data
                # then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((self._tri[history][tag] - 1),
                                    (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1),
                                    (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1),
                                    (self._uni.N() - 1))

                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += self._tri[history][tag] / 2.0
                    tl3 += self._tri[history][tag] / 2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += self._tri[history][tag] / 2.0
                    tl2 += self._tri[history][tag] / 2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        self._l1 = tl1 / (tl1 + tl2 + tl3)
        self._l2 = tl2 / (tl1 + tl2 + tl3)
        self._l3 = tl3 / (tl1 + tl2 + tl3)

    def _safe_div(self, v1, v2):
        """
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        """
        if v2 == 0:
            return -1
        else:
            return v1 / v2

    def tagdata(self, data):
        """
        Tags each sentence in a list of sentences

        :param data:list of list of words
        :type data: [[string,],]
        :return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        """
        res = []
        for sent in data:
            res1 = self.tag(sent)
            res.append(res1)
        return res

    def tag(self, data):
        """
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls recursive function '_tagword'
        to produce a list of tags

        Associates the sequence of returned tags
        with the correct words in the input sequence

        returns a list of (word, tag) tuples
        """

        current_state = [(["BOS", "BOS"], 0.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        res = []
        for i in range(len(sent)):
            # unpack and discard the C flags
            (t, C) = tags[i + 2]
            res.append((sent[i], t))

        return res

    def _tagword(self, sent, current_states):
        """
        :param sent : List of words remaining in the sentence
        :type sent : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the log probability
                                associated with each tag combination
        :type current_states : [([tag, ], logprob), ]

        Tags the first word in the sentence and
        recursively tags the reminder of sentence

        Uses formula specified above to calculate the probability
        of a particular tag
        """

        # if this word marks the end of the sentence,
        # return the most probable tag
        if sent == []:
            (h, logp) = current_states[0]
            return h

        # otherwise there are more words to be tagged
        word = sent[0]
        sent = sent[1:]
        new_states = []

        # if the Capitalisation is requested,
        # initialise the flag for this word
        C = False
        if self._C and word[0].isupper():
            C = True

        # if word is known
        # compute the set of possible tags
        # and their associated log probabilities
        if word in self._wd:
            self.known += 1

            for (history, curr_sent_logprob) in current_states:
                logprobs = []

                for t in self._wd[word].keys():
                    tC = (t, C)
                    # Linear interpolation of the three tag models plus the
                    # lexical model P(w|t), all in log space.
                    p_uni = self._uni.freq(tC)
                    p_bi = self._bi[history[-1]].freq(tC)
                    p_tri = self._tri[tuple(history[-2:])].freq(tC)
                    p_wd = self._wd[word][t] / self._uni[tC]
                    p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
                    p2 = log(p, 2) + log(p_wd, 2)

                    # compute the result of appending each tag to this history
                    new_states.append((history + [tC],
                                       curr_sent_logprob + p2))

        # otherwise a new word, set of possible tags is unknown
        else:
            self.unknown += 1

            # since a set of possible tags,
            # and the probability of each specific tag
            # can not be returned from most classifiers:
            # specify that any unknown words are tagged with certainty
            p = 1

            # if no unknown word tagger has been specified
            # then use the tag 'Unk'
            if self._unk is None:
                tag = ("Unk", C)

            # otherwise apply the unknown word tagger
            else:
                [(_w, t)] = list(self._unk.tag([word]))
                tag = (t, C)

            for (history, logprob) in current_states:
                history.append(tag)

            new_states = current_states

        # now have computed a set of possible new_states

        # sort states by log prob
        # set is now ordered greatest to least log probability
        new_states.sort(reverse=True, key=itemgetter(1))

        # del everything after N (threshold)
        # this is the beam search cut
        if len(new_states) > self._N:
            new_states = new_states[:self._N]

        # compute the tags for the rest of the sentence
        # return the best list of tags for the sentence
        return self._tagword(sent, new_states)
def plot_words(wordList):
    """Print token/type counts for `wordList` and plot its 10 most common
    words."""
    dist = FreqDist(wordList)
    # print(dist.most_common())
    print("单词总数: ", dist.N())
    print("不同单词数: ", dist.B())
    dist.plot(10)
# NLTK-book style exploratory cells; text1..text5, lexical_diversity and
# percentage are assumed defined by earlier cells (nltk.book) — TODO confirm.
print(lexical_diversity(text3))
print(lexical_diversity(text5))
print(percentage(4, 5))
print(percentage(text4.count('a'), len(text4)))
# %%
# Word frequency distribution over Moby Dick.
fdist1 = FreqDist(text1)
fdist1
vocabulary1 = fdist1.keys()
print(vocabulary1)
print(fdist1['whale'])
# %%
fdist1.plot(50, cumulative=True)
# %%
# First five (word, count) pairs.
list(fdist1.items())[0:5]
# %%
# Relative frequency of a single word.
fdist1.freq('monstrous')
# %%
# Total number of samples
fdist1.N()
# %%
fdist1
# %%
# Legacy NLTK 1.x Token API demo (Python 2): tokenize a corpus file and
# inspect word and word-length frequency distributions.
# NOTE(review): the first loop uses `freq_dist` before any visible
# assignment — presumably created earlier in the original script.
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
print corpus
WhitespaceTokenizer().tokenize(corpus)
print corpus
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])
# How many times did "the" occur?
freq_dist.count('the')
# What was the frequency of the word "the"?
freq_dist.freq('the')
# How many word tokens were counted?
freq_dist.N()
# What word types were encountered?
freq_dist.samples()
# What was the most common word?
freq_dist.max()
# What is the distribution of word lengths in a corpus?
freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
    freq_dist.inc(len(token['TEXT']))
# Plot the results.
wordlens = freq_dist.samples()
# Drop tagger artifacts and pandas-metadata tokens before counting.
tokens = [word for word in tokens if ('*' not in word) and \
          ("''" != word) and ("``" != word) and \
          (word != 'description') and (word != 'dtype') \
          and (word != 'object') and (word != "'s")]
print("\nDocument contains a total of", len(tokens), " terms.")
token_num = FreqDist(tokens)
for pos, frequency in token_num.most_common(20):
    print('{:<15s}:{:>4d}'.format(pos, frequency))

# POS Tagging
tagged_tokens = nltk.pos_tag(tokens)
pos_list = [word[1] for word in tagged_tokens if word[1] != ":" and \
            word[1] != "."]
pos_dist = FreqDist(pos_list)
pos_dist.plot(title="Parts of Speech")
# NOTE(review): most_common(pos_dist.N()) passes the total sample count
# where a bin count is expected — it works only because N >= B.
for pos, frequency in pos_dist.most_common(pos_dist.N()):
    print('{:<15s}:{:>4d}'.format(pos, frequency))

# Removing stop words
stop = stopwords.words('english') + list(string.punctuation)
stop_tokens = [word for word in tagged_tokens if word[0] not in stop]
# Removing single character words and simple punctuation
stop_tokens = [word for word in stop_tokens if len(word) > 1]
# Removing numbers and possive "'s"
stop_tokens = [word for word in stop_tokens \
               if (not word[0].replace('.', '', 1).isnumeric()) and \
               word[0] != "'s"]
token_dist = FreqDist(stop_tokens)
print("\nCorpus contains", len(token_dist.items()), \
      " unique terms after removing stop words.\n")
# NOTE(review): the body of this loop is truncated in this excerpt.
for word, frequency in token_dist.most_common(20):
from nltk.classify import scikitlearn
from collections import defaultdict
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs

# Scratch demo (Python 2 print statements): defaultdict(set) auto-creates
# an empty set per new key; FreqDist.N() is the total sample count (5 here,
# since "d" appears twice).
a = defaultdict(set)
b = ["a", "b", "c", "d", "d"]
f = FreqDist(b)
a["horton"].add(8)
a["horton"].add(6)
print a["horton"]
print f.N()
for (word, count) in fdist.iteritems(): if word not in freq_dist_background: freq_dist_background[word] = count else: freq_dist_background[word] += count freq_dist_background_sum += count except KeyError: pass print >> sys.stderr, "\r%d / %d" % (i, len(data)), i += 1 for team in data: try: fdist = freq_dists[(team['name'], team['year'])] for w in fdist.iterkeys(): fdist[w] = (fdist[w] / float(fdist.N())) / ( freq_dist_background[w] / float(freq_dist_background_sum)) words = fdist.keys() words.sort(lambda x, y: cmp(fdist[x], fdist[y])) team['topwords'] = { word: fdist[word] for word in words[0:options.numberwords] } except KeyError: pass print >> sys.stderr, "\n", if options.outfile == '-': outfile = sys.stdout else:
fdist.values fdist.values() fdist.values().sum() sum(fdist.values()) fdist['delicious'] / sum(fdist.values()) fdist['disgusting'] / sum(fdist.values()) fdist['disgusting'] fdist['vegetarian'] fdist['old-timey'] fdist['healthy'] fdist['expensive'] print text print(text) fdist.freq('delicious') fdist.freq('delicnotehu') fdist.N() fdist ? fdist? fdist.freq('Delicious') fdist fdist.freq('rainy') Business.where_raw('') Business.where_raw('latitude <= 40.75') Business.where_raw('latitude <= 40.75').count() Business.where_raw('latitude <= 40.75 and latitude > 40.749') Business.where_raw('latitude <= 40.75 and latitude > 40.749').count Business.where_raw('latitude <= 40.75 and latitude > 40.749').count() lat = 40.71 lon = -74.01 Business.where_raw('latitude', '>=', lat).where('latitude', '<', lat + 0.001) Business.where_raw('latitude', '>=', lat).where('latitude', '<', lat + 0.001).count()
class StyloDocument(object):
    """Stylometric feature extractor for a single text document.

    Reads the file, tokenizes it with NLTK, and exposes per-document
    phraseology, punctuation and lexical-usage statistics, plus CSV-row and
    human-readable report output.  Python 2 codebase (`print` statements,
    str.decode on the raw file contents); word_tokenize, sent_tokenize, Text,
    FreqDist, np and DEFAULT_AUTHOR are assumed imported/defined elsewhere
    in the file.
    """

    def __init__(self, file_name, author=DEFAULT_AUTHOR):
        # Raw document text; decoding errors are silently ignored.
        # NOTE(review): the file handle is never closed explicitly.
        self.doc = open(file_name, "r").read().decode(encoding='utf-8', errors='ignore')
        self.author = author
        self.file_name = file_name
        # Word-level tokens and the derived NLTK Text / frequency distribution.
        self.tokens = word_tokenize(self.doc)
        self.text = Text(self.tokens)
        self.fdist = FreqDist(self.text)
        # Sentence-level stats: character length and word length per sentence.
        self.sentences = sent_tokenize(self.doc)
        self.sentence_chars = [len(sent) for sent in self.sentences]
        self.sentence_word_length = [len(sent.split()) for sent in self.sentences]
        # Paragraphs are blank-line-separated, non-whitespace chunks.
        self.paragraphs = [p for p in self.doc.split("\n\n") if len(p) > 0 and not p.isspace()]
        self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]

    @classmethod
    def csv_header(cls):
        """Return the CSV column names, matching csv_output()'s field order."""
        return (
            'Author,Title,LexicalDiversity,MeanWordLen,MeanSentenceLen,StdevSentenceLen,MeanParagraphLen,DocumentLen,'
            'Commas,Semicolons,Quotes,Exclamations,Colons,Dashes,Mdashes,'
            'Ands,Buts,Howevers,Ifs,Thats,Mores,Musts,Mights,This,Verys'
        )

    def term_per_thousand(self, term):
        """Occurrences of `term` per 1000 tokens.

         term    X
        ----- = ------
          N     1000

        NOTE(review): with two int operands this is integer (floor) division
        under Python 2 unless `from __future__ import division` is in effect
        at the top of the file -- confirm whether a float rate was intended.
        """
        return (self.fdist[term] * 1000) / self.fdist.N()

    def mean_sentence_len(self):
        """Mean sentence length, in words."""
        return np.mean(self.sentence_word_length)

    def std_sentence_len(self):
        """Standard deviation of sentence length, in words."""
        return np.std(self.sentence_word_length)

    def mean_paragraph_len(self):
        """Mean paragraph length, in words."""
        return np.mean(self.paragraph_word_length)

    def std_paragraph_len(self):
        """Standard deviation of paragraph length, in words.

        NOTE(review): computed but not emitted by csv_output()/csv_header().
        """
        return np.std(self.paragraph_word_length)

    def mean_word_len(self):
        """Mean character length over *unique* words (types, not tokens)."""
        words = set(word_tokenize(self.doc))
        word_chars = [len(word) for word in words]
        return sum(word_chars) / float(len(word_chars))

    def type_token_ratio(self):
        """Lexical diversity: unique tokens / total tokens, as a percentage.

        NOTE(review): under Python 2 the inner division floors to 0 for any
        document with repeated tokens unless `from __future__ import division`
        is active -- confirm.
        """
        return (len(set(self.text)) / len(self.text)) * 100

    def unique_words_per_thousand(self):
        """Type/token ratio rescaled to a per-1000-token figure."""
        # total = 0
        # num_iters = 100
        # for i in range(num_iters):
        #     start = random.randint(0,len(self.text)-1000)
        #     sub_text = self.text[random.randint(0,len(self.text)-1000):]
        #     total += (len(set(sub_text)) / float(len(sub_text)))*100
        # return total/float(num_iters)
        return self.type_token_ratio() / 100.0 * 1000.0 / len(self.text)

    def document_len(self):
        """Total document length in characters, summed over sentences."""
        return sum(self.sentence_chars)

    def csv_output(self):
        """Return one CSV row of all features, in csv_header() order."""
        return '"%s","%s",%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g,%g' % (
            self.author,
            self.file_name,
            self.type_token_ratio(),
            self.mean_word_len(),
            self.mean_sentence_len(),
            self.std_sentence_len(),
            self.mean_paragraph_len(),
            self.document_len(),
            self.term_per_thousand(','),
            self.term_per_thousand(';'),
            self.term_per_thousand('"'),
            self.term_per_thousand('!'),
            self.term_per_thousand(':'),
            self.term_per_thousand('-'),
            self.term_per_thousand('--'),
            self.term_per_thousand('and'),
            self.term_per_thousand('but'),
            self.term_per_thousand('however'),
            self.term_per_thousand('if'),
            self.term_per_thousand('that'),
            self.term_per_thousand('more'),
            self.term_per_thousand('must'),
            self.term_per_thousand('might'),
            self.term_per_thousand('this'),
            self.term_per_thousand('very'),
        )

    def text_output(self):
        """Print a human-readable report of all features to stdout."""
        print "##############################################"
        print ""
        print "Name: ", self.file_name
        print ""
        print ">>> Phraseology Analysis <<<"
        print ""
        print "Lexical diversity :", self.type_token_ratio()
        print "Mean Word Length :", self.mean_word_len()
        print "Mean Sentence Length :", self.mean_sentence_len()
        print "STDEV Sentence Length :", self.std_sentence_len()
        print "Mean paragraph Length :", self.mean_paragraph_len()
        print "Document Length :", self.document_len()
        print ""
        print ">>> Punctuation Analysis (per 1000 tokens) <<<"
        print ""
        print 'Commas :', self.term_per_thousand(',')
        print 'Semicolons :', self.term_per_thousand(';')
        print 'Quotations :', self.term_per_thousand('\"')
        print 'Exclamations :', self.term_per_thousand('!')
        print 'Colons :', self.term_per_thousand(':')
        print 'Hyphens :', self.term_per_thousand('-')  # m-dash or n-dash?
        print 'Double Hyphens :', self.term_per_thousand('--')  # m-dash or n-dash?
        print ""
        print ">>> Lexical Usage Analysis (per 1000 tokens) <<<"
        print ""
        print 'and :', self.term_per_thousand('and')
        print 'but :', self.term_per_thousand('but')
        print 'however :', self.term_per_thousand('however')
        print 'if :', self.term_per_thousand('if')
        print 'that :', self.term_per_thousand('that')
        print 'more :', self.term_per_thousand('more')
        print 'must :', self.term_per_thousand('must')
        print 'might :', self.term_per_thousand('might')
        print 'this :', self.term_per_thousand('this')
        print 'very :', self.term_per_thousand('very')
        print ''