コード例 #1
0
    def train_sklearn_classifier(self):
        """Train a decision-tree classifier on a shuffled train/test split.

        If ``self.feature_words`` is empty, ``self._read_csv()`` and
        ``self._generate_word_features()`` are called first.
        ``self.documents`` is then passed to ``shuffle`` (presumably
        ``random.shuffle``, i.e. reordered in place -- confirm against the
        file's imports) and every ``(tokens, label)`` pair is converted into
        a ``(features, label)`` pair.

        The first ``self.test_ratio`` fraction of the feature sets is held
        out as the test set; the remainder is the training set. The trained
        model is stored on ``self.classifier`` and the accuracy against both
        splits is printed.
        """
        if not self.feature_words:
            self._read_csv()
            self._generate_word_features()

        shuffle(self.documents)
        feature_sets = [(self.__document_features(tok), lab)
                        for (tok, lab) in self.documents]

        cutoff = int(len(feature_sets) * self.test_ratio)
        train_set, test_set = feature_sets[cutoff:], feature_sets[:cutoff]
        print("Totals({0}) Training({1}) Test({2})".format(
            len(feature_sets), len(train_set), len(test_set)))

        # Fit on the training split only. Fitting on ``test_set`` (as the
        # previous version did) makes the reported test accuracy meaningless
        # because the model has already seen every test example.
        self.classifier = DecisionTreeClassifier.train(train_set)

        print('Achieved {0:.2f}% accuracy against training set'.format(
            nltk.classify.accuracy(self.classifier, train_set) * 100))
        print('Achieved {0:.2f}% accuracy against test set'.format(
            nltk.classify.accuracy(self.classifier, test_set) * 100))
コード例 #2
0
def f(x):
    """Train a ``DecisionTreeClassifier`` on *x* and return the result.

    The call forwards ``binary=True`` and ``verbose=True`` to
    ``DecisionTreeClassifier.train``.
    """
    train_options = {"binary": True, "verbose": True}
    return DecisionTreeClassifier.train(x, **train_options)
コード例 #3
0
        vocabulary.add(word)


def format_dataset(fileids, featureSet):
    """Build a list of ``(features, label)`` examples, one per file id.

    Args:
        fileids: Iterable of file ids accepted by ``movie_reviews.words``.
        featureSet: Iterable of words; each becomes a key in every
            feature dict (so the words must be hashable).

    Returns:
        A list of ``(features, label)`` tuples, where ``features`` maps each
        word in ``featureSet`` to ``True``/``False`` depending on whether the
        word occurs in that file, and ``label`` is the first three characters
        of the file id (presumably the ``pos``/``neg`` category prefix --
        confirm against the corpus layout).
    """
    dataset = []
    for fileid in fileids:
        # Read the document once and use a set for membership tests; calling
        # movie_reviews.words(fileid) per feature word re-scanned the whole
        # document for every word in featureSet.
        words_in_file = set(movie_reviews.words(fileid))
        features = {word: word in words_in_file for word in featureSet}
        dataset.append((features, fileid[:3]))
    return dataset


from nltk.classify import accuracy
from nltk.classify.decisiontree import DecisionTreeClassifier
from nltk.classify.naivebayes import NaiveBayesClassifier

# Turn the train/test file ids into labelled feature dicts over the
# shared vocabulary.
train_set = format_dataset(train_fileids, vocabulary)
test_set = format_dataset(test_fileids, vocabulary)

# Fit both classifiers on the same training examples.
tree = DecisionTreeClassifier.train(train_set)
bayes = NaiveBayesClassifier.train(train_set)

# Score each classifier against the held-out test examples.
tree_accuracy = accuracy(tree, test_set)
print("Decision Tree accuracy: ", tree_accuracy)
bayes_accuracy = accuracy(bayes, test_set)
print("Naive Bayes accuracy: ", bayes_accuracy)
コード例 #4
0
	if (len(elem[0]) > 3):
		negative.append(elem)

# Merge the positive and negative (word, number) pairs, dropping duplicates.
combined = list(set(positive + negative))

# Build one (features, label) example per document in total_text.
feature_set = []
for i, elem in enumerate(total_text):
    # elem is the words in a document
    features = {
        'contains({})'.format(word): (word in elem)
        for word, _ in combined
    }
    # Documents whose entry in stars_array is 3 or more are labelled positive.
    pos_or_neg = "positive" if (stars_array[i] >= 3) else "negative"
    feature_set.append((features, pos_or_neg))

# Single-argument print(...) produces the same output on Python 2 and 3.
print("Finished Featurization. It took " + str(time.time() - start_time) + " to run.")

# First three quarters of the examples train the model; the rest test it.
cutoff = len(feature_set) * 3 // 4
training = feature_set[:cutoff]
testing = feature_set[cutoff:]

classifier = DecisionTreeClassifier.train(training, binary=True, verbose=True)

# Count the test examples whose predicted label matches the stored label.
count = sum(
    1 for features, label in testing
    if classifier.classify(features) == label
)

# Fraction of test examples classified correctly.
print(float(count) / len(testing))