def _get_features(f_tweets):
    _feature_vector = []
    pos_features_dist = []
    neg_features_dist = []
    neutral_features_dist = []
    for token, label in f_tweets:
        if label == 'positive':
            pos_features_dist.extend(token)
        elif label == 'negative':
            neg_features_dist.extend(token)
        else:
            neutral_features_dist.extend(token)
    pos_features_dist = probability.FreqDist(pos_features_dist)
    for key, value in pos_features_dist.items():
        _feature_vector.append(({key: value}, 'positive'))
    neg_features_dist = probability.FreqDist(neg_features_dist)
    for key, value in neg_features_dist.items():
        _feature_vector.append(({key: value}, 'negative'))
    neutral_features_dist = probability.FreqDist(neutral_features_dist)
    for key, value in neutral_features_dist.items():
        _feature_vector.append(({key: value}, 'neutral'))
    # print("_feature_vector", _feature_vector)  # e.g. [({'car': 1}, 'positive')]
    return _feature_vector
def find_naive_v1(self, min_size):
    frequencies = prob.FreqDist()
    for index in range(len(self.klass_values) - 1):
        frequencies[self.klass_values[index]] += 1
        if frequencies[frequencies.max()] >= min_size:
            self.append(index)
            frequencies = prob.FreqDist()
def adjust_for_min_freq(self, min_size):
    prev = -1
    self.sort()
    to_remove, frequencies = [], prob.FreqDist()
    for breakpoint in self.data:
        frequencies[self.klass_values[breakpoint]] += breakpoint - prev
        if frequencies[frequencies.max()] < min_size:
            to_remove.append(breakpoint)
        else:
            frequencies = prob.FreqDist()
        prev = breakpoint
    for item in to_remove:
        self.remove(item)
def savelocalfd(self):
    self.localfd = dict()
    for doc in self.docs:
        localfd = probability.FreqDist()
        for tok in doc.tokens():
            localfd[tok] += 1
        self.localfd[doc.fid] = localfd
def transform(self, query):
    '''
    Transform the query into its words and their frequencies.

    Attributes:
        query (str): the query to transform. Required.

    Returns:
        dict
    '''
    # step 1: drop special characters
    query = re.sub('[^A-Za-z.]+', ' ', query)
    # step 2: tokenize
    query = word_tokenize(query)
    # step 3: drop tokens shorter than 2 characters
    query = [i for i in query if len(i) > 1]
    # step 4: count frequencies
    query = probability.FreqDist(query)
    # step 5: convert to a plain dict for searching
    query = dict(query)
    return query
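# A self-contained sketch of the same pipeline outside the class (the example
# query is illustrative; NLTK's punkt tokenizer data is assumed to be installed):
import re
from nltk import probability
from nltk.tokenize import word_tokenize

q = re.sub('[^A-Za-z.]+', ' ', "The cars, the car!")
tokens = [t for t in word_tokenize(q) if len(t) > 1]
print(dict(probability.FreqDist(tokens)))  # {'The': 1, 'cars': 1, 'the': 1, 'car': 1}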
def doTheThing(fileContents):
    # TOKENIZATION
    tokenizedWords = tokenize.word_tokenize(fileContents)
    # STOPWORDS
    filteredWords = []
    stop_words = set(corpus.stopwords.words('english'))
    for w in tokenizedWords:
        if w not in stop_words:
            filteredWords.append(w)
    # FREQUENCY DISTRIBUTION
    freqDist = probability.FreqDist(tokenizedWords)
    # STEMMING
    ps = stem.PorterStemmer()
    stemmedWords = []
    for w in filteredWords:
        stemmedWords.append(ps.stem(w))
    # LEMMATIZATION
    wnl = stem.WordNetLemmatizer()
    lemmatizedWords = []
    for w in filteredWords:
        lemmatizedWords.append(wnl.lemmatize(w, "v"))
    return [tokenizedWords, filteredWords, freqDist, stemmedWords, lemmatizedWords]
def buildFreqMap(self, text):
    freq_dict = probability.FreqDist()
    tokens = self.tokenize(text)
    stop_words_removed = self.remove_stop_words(tokens)
    for word in self.stem_text(stop_words_removed):
        freq_dict[word.lower()] += 1
    return freq_dict
def train(cmd_args, corpus_files, model):
    """ Trains statistical model. """
    for lang in corpus_files:
        text = udhr2.raw(lang)
        # print("lang:", lang, "; length:", len(text))

        # Replace multiple whitespaces (including ' ', '\n', '\t') with just one ' '
        text = re.sub(r'\s+', ' ', text)

        # Skip empty files, like nku.txt
        if len(text) < 1000:
            # print("skipping pathological file", lang)
            model.deleted_langs.append(lang)
            continue

        model.ngrams[lang] = {}
        model.smoothed[lang] = []

        if cmd_args.cross_valid:
            # Remove the first 100 characters to go to the test set
            model.tests[lang] = text[:cmd_args.test_len]
            text = text[cmd_args.test_len:]

        # Build ngrams for each language in training
        model.ngrams[lang] = char_freqs(text, cmd_args.n_order)
        model.smoothed[lang] = probability.LaplaceProbDist(
            probability.FreqDist(model.ngrams[lang]))
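# Sketch of the per-language smoothing step on its own; char_freqs() above is
# assumed to return an {ngram: count} mapping, which FreqDist accepts directly.
# Here the counts are rebuilt from character bigrams of a toy string:
from nltk import probability
from nltk.util import ngrams

def char_ngram_counts(text, n_order):
    return probability.FreqDist(''.join(gram) for gram in ngrams(text, n_order))

smoothed = probability.LaplaceProbDist(char_ngram_counts('hello world', 2))
print(smoothed.prob('lo'))  # non-zero even for rare bigrams, thanks to smoothing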
def distribution(self, tokens, laplace=True):
    fd = probability.FreqDist()
    for word in tokens:
        fd[word] += 1
    if laplace:
        return probability.LaplaceProbDist(fd)
    else:
        return probability.MLEProbDist(fd)
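# Quick illustration of what the laplace flag changes (toy tokens, not project data):
from nltk import probability

fd = probability.FreqDist(['a', 'a', 'b'])
print(probability.MLEProbDist(fd).prob('c'))      # 0.0 for unseen tokens
print(probability.LaplaceProbDist(fd).prob('c'))  # > 0 thanks to add-one smoothing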
def count_stems(corpus):
    fd = probability.FreqDist()
    for word in corpus.words():
        w = word.lower()
        if w in stopset:
            continue
        fd[stemmer.stem(w)] += 1
    return fd
def count_hypernyms(corpus):
    fd = probability.FreqDist()
    for word in corpus.words():
        w = word.lower()
        if w in stopset:
            continue
        for syn in wordnet.synsets(w):
            if syn.pos() != 'n':
                continue
            for path in syn.hypernym_paths():
                for hyp in path:
                    fd[hyp.name()] += 1
    return fd
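# WordNet access sketch showing what the loop above walks over (the word 'dog'
# is just an example; the nltk wordnet data is assumed to be installed):
from nltk.corpus import wordnet

for syn in wordnet.synsets('dog'):
    if syn.pos() != 'n':
        continue
    for path in syn.hypernym_paths():
        print([hyp.name() for hyp in path])  # e.g. ['entity.n.01', ..., 'dog.n.01']
    break  # one noun synset is enough for illustration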
def savelocaldist(self, laplace=True, savetokens=False):
    self.localdist = dict()
    for doc in self.docs:
        if savetokens:
            doc.terms = []
        localfd = probability.FreqDist()
        for tok in doc.tokens():
            if savetokens:
                doc.terms.append(tok)
            localfd[tok] += 1
        if localfd.N() > 0:
            if laplace:
                self.localdist[doc.fid] = probability.LaplaceProbDist(localfd)
            else:
                self.localdist[doc.fid] = probability.MLEProbDist(localfd)
def __init__(self, unk=None, Trained=False, N=1000, C=False):
    '''
    Construct a TnT statistical tagger. Tagger must be trained
    before being used to tag input.

    @param unk: instance of a POS tagger, conforms to TaggerI
    @type unk: (TaggerI)
    @param Trained: Indication that the POS tagger is trained or not
    @type Trained: boolean
    @param N: Beam search degree (see above)
    @type N: (int)
    @param C: Capitalization flag
    @type C: boolean

    Initializer, creates frequency distributions to be used
    for tagging.

    _lx values represent the portion of the tri/bi/uni taggers
    to be used to calculate the probability.

    N value is the number of possible solutions to maintain
    while tagging. A good value for this is 1000.

    C is a boolean value which specifies to use or not use the
    Capitalization of the word as additional information for tagging.
    NOTE: using capitalization may not increase the accuracy
    of the tagger.
    '''
    self._uni = probability.FreqDist()
    self._bi = probability.ConditionalFreqDist()
    self._tri = probability.ConditionalFreqDist()
    self._wd = probability.ConditionalFreqDist()
    self._eos = probability.ConditionalFreqDist()
    self._l1 = 0.0
    self._l2 = 0.0
    self._l3 = 0.0
    self._N = N
    self._C = C
    self._T = Trained
    self._unk = unk

    # statistical tools (ignore or delete me)
    self.unknown = 0
    self.known = 0
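# Usage sketch with NLTK's bundled TnT implementation, which the initializer
# above mirrors (the treebank corpus data is assumed to be available):
from nltk.corpus import treebank
from nltk.tag import tnt

tagger = tnt.TnT(N=1000, C=False)
tagger.train(treebank.tagged_sents()[:2000])
print(tagger.tag(['the', 'cat', 'sat', 'on', 'the', 'mat']))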
def doTheThing(fileContents, mode):
    result = []
    # TOKENIZATION
    if mode >= 0:
        tokenizedWords = tokenize.word_tokenize(fileContents)
        print('Tokenization...')
        result.append(tokenizedWords)
    # STOPWORDS
    if mode >= 1:
        print('Stopwords...')
        filteredWords = []
        stop_words = set(get_stop_words('polish'))
        for w in tokenizedWords:
            if w not in stop_words:
                filteredWords.append(w)
        result.append(filteredWords)
    # FREQUENCY DISTRIBUTION
    if mode >= 2:
        print('FrequencyDistribution...')
        freqDist = probability.FreqDist(filteredWords)
        result.append(freqDist)
    # STEMMING
    if mode >= 3:
        print('Stemming...')
        ps = stem.PorterStemmer()
        stemmedWords = []
        for w in filteredWords:
            stemmedWords.append(ps.stem(w))
        result.append(stemmedWords)
    # LEMMATIZATION
    if mode >= 4:
        print('Lemmatization...')
        wnl = stem.WordNetLemmatizer()
        lemmatizedWords = []
        for w in filteredWords:
            lemmatizedWords.append(wnl.lemmatize(w, "v"))
        result.append(lemmatizedWords)
    return result
def globaldist(self, laplace=True):
    '''
    Return a global probability distribution for a set of documents.
    May run into memory problems if the set of documents is too large.
    Uses Laplace smoothing by default.
    Stores the result in self.gdist, which holds the global distribution;
    clear this attribute after use to free the memory.
    '''
    fd = probability.FreqDist()
    for doc in self.docs:
        tokens = None
        if doc.terms is None:
            tokens = doc.tokens()
        else:
            tokens = doc.terms
        for tok in tokens:
            fd[tok] += 1
    if laplace:
        self.gdist = probability.LaplaceProbDist(fd)
    else:
        self.gdist = probability.MLEProbDist(fd)
    return self.gdist
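# The same idea on plain token lists, without the document objects the class
# above assumes (the token lists here are illustrative):
from nltk import probability

docs = [['spam', 'ham', 'spam'], ['eggs', 'spam']]
fd = probability.FreqDist(tok for doc in docs for tok in doc)
gdist = probability.LaplaceProbDist(fd)
print(gdist.prob('spam'), gdist.prob('unseen'))  # smoothed, so the second is > 0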
import nltk.probability as p
import nltk.tokenize as tk
from nltk.corpus import stopwords

file_path = '../../DataSets/Test/DE_EN_(tatoeba)_test.txt'
text = ''
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        text += line.split('\t')[0].lower().replace('.', '').replace('?', '') + ' '

blob = tk.casual_tokenize(text, strip_handles=True)
better_blob = []
stop_words = set(stopwords.words('english'))
for word in blob:
    if not (len(word) <= 3 or word == 'mary' or word == "tom's" or word == "mary's"):
        better_blob.append(word)
filtered_blob = list(filter(lambda w: w not in stop_words, better_blob))

heu = p.FreqDist(filtered_blob).most_common(100)
for i in range(len(heu)):
    print(heu[i])
def class_distribution(base_path):
    training = format.C45_FORMAT.get_training_instances(base_path)
    freq_dist = probability.FreqDist()
    for each in training:
        freq_dist[each.klass_value] += 1
    return freq_dist
def word_counts(words):
    # Map each word to its frequency; FreqDist does the counting.
    return dict(probability.FreqDist(words))
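# Quick check of word_counts on illustrative input (assumes the module-level
# import that the function above relies on):
from nltk import probability

print(word_counts(['to', 'be', 'or', 'not', 'to', 'be']))
# -> {'to': 2, 'be': 2, 'or': 1, 'not': 1}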
import nltk
import nltk.probability as pro

br = nltk.corpus.brown
freq = pro.FreqDist([w for cat in br.categories() for w in br.words(categories=cat)])
word3 = [w for w in freq.keys() if freq[w] > 3]
print('Number of words occurring more than 3 times:', len(word3))
# -*- coding: utf-8 -*-
# tmeEvijv.py
import nltk, re, operator
from nltk.book import *
from nltk import probability
from nltk.corpus import udhr

######################## Exercise 1 #################################
text5Freq = probability.FreqDist(text5)
sortedList = sorted(text5Freq.items(), key=operator.itemgetter(1), reverse=True)
mots4lettres = [w[0] for w in sortedList if len(w[0]) == 4]
# print(mots4lettres)

######################## Exercise 2 #################################
wordsHat = []
wordsZ = []
wordsPT = []
for i in set(text6):
    reHat = re.search('(?i).*hat$', i)
    if reHat is not None:
        wordsHat.append(reHat.group())
    reZ = re.search('(?i).*z.*', i)
    if reZ is not None:
        wordsZ.append(reZ.group())
comment.replace(" ", "")
comment = (' ').join(comment.split())
texts.append(comment)
texts = texts[1:]
texts = texts + train_y_sentence

texts_sentences = []
pattern = re.compile(r'([a-z \,]+\.)')
for x in texts:
    results = pattern.findall(x)
    for j in results:
        texts_sentences.append(j)

starting = pb.FreqDist()
transitional = pb.ConditionalFreqDist()
emissional = pb.ConditionalFreqDist()

pi = pb.FreqDist()
for row in test_y_sentence:
    pi[row[0]] += 1

for row in texts_sentences:
    lasts = None
    for ch in list(row):
        if lasts is not None:
            transitional[lasts][ch] += 1
        lasts = ch

for row in train_data:
def majority_klass_vote(instances):
    fd = prob.FreqDist()
    for each in instances:
        fd[each.klass_value] += 1
    return fd.max()
def empty_freq_dists(self):
    return dict([(value, prob.FreqDist()) for value in self.values])
def fd(self, tokens):
    fd = probability.FreqDist()
    for term in tokens:
        fd[term] += 1
    return fd
def entropy_of_key_counts(dictionary):
    freq_dist = prob.FreqDist()
    klasses = dictionary.keys()
    for klass in klasses:
        freq_dist[klass] += dictionary[klass]
    return entropy_of_freq_dist(freq_dist)
def entropy(values):
    freq_dist = prob.FreqDist()
    for value in values:
        freq_dist[value] += 1
    return entropy_of_freq_dist(freq_dist)
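# entropy_of_freq_dist() is called by the two helpers above but not shown here;
# a plausible sketch, assuming it returns the Shannon entropy (in bits) of the
# maximum-likelihood distribution over the FreqDist's samples:
import math

def entropy_of_freq_dist_sketch(freq_dist):
    total = freq_dist.N()
    return -sum((count / total) * math.log2(count / total)
                for count in freq_dist.values() if count)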
def class_freq_dist(self):
    class_freq_dist = prob.FreqDist()
    for instance in self.data:
        class_freq_dist[instance.klass_value] += 1
    return class_freq_dist