def get_label_probdist(labelled_features):
    """Build an ELE-smoothed probability distribution over labels.

    Each sample in *labelled_features* contributes one count for every
    label whose per-sample count is positive.

    Args:
        labelled_features: mapping of item -> per-label count mapping.

    Returns:
        ELEProbDist over the observed labels.

    NOTE(review): depends on a module-level ``labels`` mapping — confirm
    it is in scope where this is called.
    """
    label_fd = FreqDist()
    for item, counts in labelled_features.items():
        for label in labels.values():
            if counts[label] > 0:
                # FreqDist.inc() was removed in NLTK 3; use item assignment.
                label_fd[label] += 1
    label_probdist = ELEProbDist(label_fd)
    return label_probdist
def train(self, X_training_data):
    """Train a supervised HMM tagger on *X_training_data*.

    Uses an Expected-Likelihood-Estimate (ELE) smoothed estimator for
    the model's distributions and stores the trainer and resulting
    tagger on the instance.

    Args:
        X_training_data: labelled sequences in the format expected by
            ``HiddenMarkovModelTrainer.train_supervised``.
    """
    from nltk.probability import ELEProbDist

    # train_supervised calls estimator(fdist, bins); ELEProbDist here
    # deliberately ignores the bins argument.
    estimator = lambda fdist, bins: ELEProbDist(fdist)
    self.__model = hmm.HiddenMarkovModelTrainer()
    self.__tagger = self.__model.train_supervised(
        X_training_data, estimator=estimator)
def CreatNaiveBayes(self, data):
    """Build a Naive Bayes classifier over name 3-lets vs. ethnicity.

    ``data`` is an iterable of ``(name, total, ethList)`` tuples where
    ``ethList[i]`` is the count for ethnicity ``self._ethicity[i]``
    (exactly 5 ethnicities are assumed).  The trained classifier is
    stored in ``self.classifier``.
    """
    # Label prior: total weight of each ethnicity across all names.
    label_freqdist = FreqDist()
    for (name, total, ethList) in data:
        for i in range(5):
            label_freqdist[self._ethicity[i]] += ethList[i]
    label_probdist = ELEProbDist(label_freqdist)
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    #for (name, total, ethList) in data:
    # x-lets
    # Feature-present counts: for every 3-let of every name, add the
    # name's weight for each ethnicity under the True value.
    for (name, total, ethList) in data:
        x_lets = self.get3_let(name)
        for i in range(5):
            for x_let in x_lets:
                feature_freqdist[(self._ethicity[i], x_let)][True] += ethList[i]
                feature_values[x_let].add(True)
    # Feature-absent counts: for each (label, x_let), sum the label's
    # weight over names that do NOT contain the 3-let, recorded as None.
    for ((label, x_let), freqdist) in feature_freqdist.items():
        # Map the label back to its index in self._ethicity.
        num = 0
        for i in range(5):
            if label == self._ethicity[i]:
                num = i
                break
        tot = 0
        for (name, total, ethList) in data:
            if x_let not in name:
                tot += ethList[num]
        feature_values[x_let].add(None)
        if tot > 0:
            # NOTE(review): mutates freqdists while iterating items();
            # safe only because no new dict keys are created here.
            feature_freqdist[(label, x_let)][None] += tot;
    # ELE-smooth each per-(label, feature) frequency distribution.
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist
    self.classifier = nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
def get_feature_probdist(labelled_features):
    """Build ELE-smoothed per-(label, token) feature distributions.

    For each token/label pair, True carries the number of samples with
    that label containing the token and None covers the remainder.

    Args:
        labelled_features: mapping of token -> per-label count mapping.

    Returns:
        dict mapping (label, token) -> ELEProbDist.

    NOTE(review): depends on module-level ``labels`` and ``samplecount``
    — confirm both are in scope where this is called.
    """
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    num_samples = samplecount
    for token, counts in labelled_features.items():
        for label in labels.values():
            # FreqDist.inc() was removed in NLTK 3; use += instead.
            feature_freqdist[label, token][True] += counts[label]
            feature_freqdist[label, token][None] += num_samples - counts[label]
            feature_values[token].add(None)
            feature_values[token].add(True)
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist
    return feature_probdist
def _generateFeatureProbabilityDistribution(self):
    """Build ELE-smoothed per-(label, token) feature distributions.

    For each token/label pair, True carries the label's count for the
    token and None covers the remaining samples.  The result is stored
    in ``self._featureProbabilityDistribution``.
    """
    frequencyDistributions = defaultdict(FreqDist)
    values = defaultdict(set)
    # NOTE(review): true division yields a float in Python 3; confirm
    # integer semantics (``// 2``) were not intended here.
    numberSamples = len(self._trainingset) / 2
    for token, counts in self._featuresset.items():
        for label in self._labels:
            # FreqDist.inc() was removed in NLTK 3; use += instead.
            frequencyDistributions[label, token][True] += counts[label]
            frequencyDistributions[label, token][None] += (
                numberSamples - counts[label])
            values[token].add(None)
            values[token].add(True)
    probabilityDistribution = {}
    for ((label, name), freqDist) in frequencyDistributions.items():
        eleProbDist = ELEProbDist(freqDist, bins=len(values[name]))
        probabilityDistribution[label, name] = eleProbDist
    self._featureProbabilityDistribution = probabilityDistribution
# Unigram frequency report.
freq_dist_uni = nltk.FreqDist(tokenized_text)
print("Most common 10 unigram: ", freq_dist_uni.most_common(10), "\n",
      "least common 3 words: ", freq_dist_uni.most_common()[-3:], "\n")

# MLE unigram probabilities, collected in samples() order.
# NOTE(review): printing assumes samples() order matches most_common()
# order — confirm; a dict keyed by sample would be safer.
prob_distArray = []
prob_dist_uni = MLEProbDist(freq_dist_uni)
for s in prob_dist_uni.samples():
    prob_distArray.append(prob_dist_uni.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i])
    i += 1

# ELE-smoothed probabilities.  BUG FIX: the original appended these to
# prob_distArray and then printed indices 0-9 again, which re-printed
# the MLE values; a fresh list makes the ELE probabilities show.
elep = ELEProbDist(freq_dist_uni)
ele_probArray = []
for s in elep.samples():
    ele_probArray.append(elep.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, ele_probArray[i], "\n")
    i += 1

uniqueWords = len(set(tokenized_text))
print("Unique Words: ", uniqueWords, "\n")

# Bigram frequency report.
bigram_count = bigrams(tokenized_text)
counts = nltk.Counter(bigram_count)
print("Bigram Count: ", counts, "\n",
      "Most Common 10 bigram: ", counts.most_common(10), "\n",
      "Least Common 3 words: ", counts.most_common()[-3:], "\n")
def save_unigram(words):
    """Fit an ELE unigram model over *words*, pickle it, and return it."""
    freqs = FreqDist(words)
    model = ELEProbDist(freqs)
    out_path = save_base_dir + 'unigram.pkl'
    with open(out_path, 'wb') as handle:
        pickle.dump(model, handle)
    return model
def _load_ele_pdist(data: dict, version: int) -> ELEProbDist:
    """Reconstruct an ELEProbDist from its serialized dict form.

    ``version`` is accepted for interface compatibility and is not used.
    """
    freqdist = data['freqdist']
    bins = data['bins']
    return ELEProbDist(freqdist, bins=bins)
def _dump_ele_pdist(pdist: ELEProbDist) -> dict:
    """Serialize an ELEProbDist to a plain dict (inverse of _load_ele_pdist)."""
    serialized = {}
    serialized['freqdist'] = pdist.freqdist()
    # _bins is a private attribute; there is no public accessor for it.
    serialized['bins'] = pdist._bins
    return serialized