Example 1
from nltk import FreqDist
from nltk.probability import ELEProbDist


def get_label_probdist(labelled_features):
    # `labels` is assumed to be a mapping defined elsewhere in the module.
    label_fd = FreqDist()
    for item, counts in labelled_features.items():
        for label in labels.values():
            # Count each label once per item with positive evidence for it.
            if counts[label] > 0:
                label_fd[label] += 1  # FreqDist.inc() was removed in NLTK 3

    label_probdist = ELEProbDist(label_fd)
    return label_probdist
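A minimal sketch (not part of the original examples) of what ELEProbDist computes: expected likelihood estimation adds 0.5 to every bin's count before normalizing, so low-count outcomes keep nonzero mass.

from nltk import FreqDist
from nltk.probability import ELEProbDist

fd = FreqDist({'pos': 3, 'neg': 1})  # N = 4 observations, 2 bins
pd = ELEProbDist(fd)                 # prob = (c + 0.5) / (N + 0.5 * bins)
print(pd.prob('pos'))                # (3 + 0.5) / (4 + 1.0) = 0.7
print(pd.prob('neg'))                # (1 + 0.5) / (4 + 1.0) = 0.3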
Example 2
    def train(self, X_training_data):
        from nltk.probability import ELEProbDist
        from nltk.tag import hmm

        # Smooth the HMM's counts with expected likelihood estimation
        # instead of the trainer's default estimator; the `bins` argument
        # supplied by NLTK is deliberately ignored.
        estimator = lambda fdist, bins: ELEProbDist(fdist)
        self.__model = hmm.HiddenMarkovModelTrainer()
        self.__tagger = self.__model.train_supervised(X_training_data,
                                                      estimator=estimator)

        # predicted_y = self.__model.predict(X_training_data['data_tfidf'])
        # print(np.mean(predicted_y == X_training_data['labels']))
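A hedged, self-contained sketch of the same training call outside the class; the toy corpus is invented for illustration, but train_supervised does expect a list of (word, tag) sequences.

from nltk.probability import ELEProbDist
from nltk.tag import hmm

train_data = [[('the', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')],
              [('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]]
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data,
                                  estimator=lambda fd, bins: ELEProbDist(fd))
print(tagger.tag(['the', 'dog', 'sleeps']))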
Example 3
	def CreatNaiveBayes(self, data):
		# Requires: import nltk; from nltk import FreqDist;
		# from nltk.probability import ELEProbDist;
		# from collections import defaultdict

		# Label distribution: how much evidence each of the five
		# ethnicities receives across the whole data set.
		label_freqdist = FreqDist()
		for (name, total, ethList) in data:
			for i in range(5):
				label_freqdist[self._ethicity[i]] += ethList[i]

		label_probdist = ELEProbDist(label_freqdist)

		# Feature distributions: one FreqDist per (label, x-let) pair,
		# where the x-lets are 3-letter substrings of each name.
		feature_freqdist = defaultdict(FreqDist)
		feature_values = defaultdict(set)

		for (name, total, ethList) in data:
			x_lets = self.get3_let(name)
			for i in range(5):
				for x_let in x_lets:
					feature_freqdist[(self._ethicity[i], x_let)][True] += ethList[i]
					feature_values[x_let].add(True)

		# For every (label, x-let) pair, also count the evidence from
		# names that do NOT contain the x-let, recorded under None, so
		# the classifier can score the "feature absent" case.
		for ((label, x_let), freqdist) in feature_freqdist.items():
			num = 0
			for i in range(5):
				if label == self._ethicity[i]:
					num = i
					break
			tot = 0
			for (name, total, ethList) in data:
				if x_let not in name:
					tot += ethList[num]
					feature_values[x_let].add(None)
			if tot > 0:
				feature_freqdist[(label, x_let)][None] += tot

		feature_probdist = {}
		for ((label, fname), freqdist) in feature_freqdist.items():
			probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
			feature_probdist[label, fname] = probdist

		self.classifier = nltk.NaiveBayesClassifier(label_probdist, feature_probdist)
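A hypothetical usage sketch (the instance name and training data are invented, not taken from the original source): once CreatNaiveBayes has run, the stored NLTK classifier can score a name's 3-letter features directly.

model = NameClassifier()              # hypothetical class holding CreatNaiveBayes
model.CreatNaiveBayes(training_data)  # training_data: (name, total, ethList) triples
features = {x_let: True for x_let in model.get3_let('rossi')}
print(model.classifier.classify(features))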
Example 4
from collections import defaultdict

from nltk import FreqDist
from nltk.probability import ELEProbDist


def get_feature_probdist(labelled_features):
    # `samplecount` and `labels` are assumed to be module-level globals.
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    num_samples = samplecount
    for token, counts in labelled_features.items():
        for label in labels.values():
            # FreqDist.inc() was removed in NLTK 3; use Counter-style updates.
            feature_freqdist[label, token][True] += counts[label]
            feature_freqdist[label, token][None] += num_samples - counts[label]
            feature_values[token].add(None)
            feature_values[token].add(True)

    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist
    return feature_probdist
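A hedged sketch of how this helper pairs with Example 1 to assemble a full classifier; `labelled_features` is assumed to be the same token-to-per-label-counts mapping used above, and 'great' is a hypothetical token.

from nltk import NaiveBayesClassifier

label_probdist = get_label_probdist(labelled_features)
feature_probdist = get_feature_probdist(labelled_features)
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
print(classifier.classify({'great': True}))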
Example 5
    def _generateFeatureProbabilityDistribution(self):
        # Requires: from collections import defaultdict;
        # from nltk import FreqDist; from nltk.probability import ELEProbDist
        frequencyDistributions = defaultdict(FreqDist)
        values = defaultdict(set)
        # Integer division: half of the training set is treated as the
        # per-label sample count (was `/`, i.e. floor division, in Python 2).
        numberSamples = len(self._trainingset) // 2

        for token, counts in self._featuresset.items():
            for label in self._labels:
                # FreqDist.inc() was removed in NLTK 3; use Counter-style
                # updates instead.
                frequencyDistributions[label, token][True] += counts[label]
                frequencyDistributions[label, token][None] += (
                    numberSamples - counts[label])
                values[token].add(None)
                values[token].add(True)

        probabilityDistribution = {}
        for ((label, name), freqDist) in frequencyDistributions.items():
            eleProbDist = ELEProbDist(freqDist, bins=len(values[name]))
            probabilityDistribution[label, name] = eleProbDist

        self._featureProbabilityDistribution = probabilityDistribution
Example 6
import nltk
from collections import Counter
from nltk import bigrams
from nltk.probability import ELEProbDist, MLEProbDist

# `tokenized_text` is assumed to be a token list built earlier in the script.
freq_dist_uni = nltk.FreqDist(tokenized_text)
print("Most common 10 unigrams: ", freq_dist_uni.most_common(10), "\n",
      "least common 3 words: ",
      freq_dist_uni.most_common()[-3:], "\n")

# Compare maximum-likelihood and expected-likelihood estimates for the ten
# most frequent unigrams. Probabilities are queried per word: the original
# parallel-list approach paired most_common() entries with probabilities
# taken in samples() order, which are not aligned.
prob_dist_uni = MLEProbDist(freq_dist_uni)
for word, count in freq_dist_uni.most_common(10):
    print((word, count), prob_dist_uni.prob(word))

elep = ELEProbDist(freq_dist_uni)
for word, count in freq_dist_uni.most_common(10):
    print((word, count), elep.prob(word), "\n")

uniqueWords = len(set(tokenized_text))
print("Unique Words: ", uniqueWords, "\n")

bigram_count = bigrams(tokenized_text)
counts = Counter(bigram_count)  # collections.Counter (original: nltk.Counter)
print("Bigram Count: ", counts, "\n", "Most Common 10 bigrams: ",
      counts.most_common(10), "\n", "Least Common 3 words: ",
      counts.most_common()[-3:], "\n")
Example 7
import pickle
from nltk import FreqDist
from nltk.probability import ELEProbDist

def save_unigram(words):
    # `save_base_dir` is assumed to be a path prefix defined elsewhere.
    unigram = ELEProbDist(FreqDist(words))
    with open(save_base_dir + 'unigram.pkl', 'wb') as f:
        pickle.dump(unigram, f)
    return unigram
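A hypothetical counterpart (not in the original example) for reading the pickle back; the name and path mirror save_unigram above.

def load_unigram():
    # Assumes the same `save_base_dir` used by save_unigram().
    with open(save_base_dir + 'unigram.pkl', 'rb') as f:
        return pickle.load(f)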
Example 8
from nltk.probability import ELEProbDist

def _load_ele_pdist(data: dict, version: int) -> ELEProbDist:
    # `version` is accepted for interface compatibility but unused here.
    return ELEProbDist(data['freqdist'], bins=data['bins'])
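A hypothetical writer to match (the dict layout is inferred from the loader above, not taken from the original source).

from nltk import FreqDist

def _save_ele_pdist(freqdist: FreqDist, bins: int) -> dict:
    # Serialize the raw counts and bin count rather than the probability
    # distribution object itself, matching what _load_ele_pdist expects.
    return {'freqdist': freqdist, 'bins': bins}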