# Example 1
    def train_sklearn_classifier(self):
        """Train an NLTK decision-tree classifier on this instance's documents.

        Lazily loads the corpus and feature vocabulary on first use, shuffles
        the documents, splits them into train/test partitions by
        ``self.test_ratio``, fits ``self.classifier``, and prints accuracy
        against both partitions.
        """
        # Lazily load the corpus and build the word-feature vocabulary
        # the first time this method is called.
        if not self.feature_words:
            self._read_csv()
            self._generate_word_features()

        # Shuffle in place so the train/test split below is random.
        shuffle(self.documents)
        feature_sets = [(self.__document_features(tok), lab)
                        for (tok, lab) in self.documents]

        # The first `test_ratio` fraction is held out for evaluation.
        cutoff = int(len(feature_sets) * self.test_ratio)
        train_set, test_set = feature_sets[cutoff:], feature_sets[:cutoff]
        print("Totals({0}) Training({1}) Test({2})".format(
            len(feature_sets), len(train_set), len(test_set)))

        # BUG FIX: the model was previously trained on `test_set`, which
        # made the "test set" accuracy below meaningless. Train on the
        # training partition instead.
        self.classifier = DecisionTreeClassifier.train(train_set)

        print('Achieved {0:.2f}% accuracy against training set'.format(
            nltk.classify.accuracy(self.classifier, train_set) * 100))
        print('Achieved {0:.2f}% accuracy against test set'.format(
            nltk.classify.accuracy(self.classifier, test_set) * 100))
# Example 2
        vocabulary.add(word)


def format_dataset(fileids, featureSet):
    """Build a list of (features, label) pairs for the given corpus files.

    Each features dict maps every vocabulary word to a bool recording
    whether it occurs in that review; the label is the first three
    characters of the file id (the "pos"/"neg" directory prefix).
    """
    examples = []
    for doc_id in fileids:
        doc_words = movie_reviews.words(doc_id)
        bag = {term: term in doc_words for term in featureSet}
        examples.append((bag, doc_id[:3]))
    return examples


# Materialize the train/test feature sets from the shared vocabulary.
train_set = format_dataset(train_fileids, vocabulary)
test_set = format_dataset(test_fileids, vocabulary)

# Fit two NLTK classifiers on the same training data.
from nltk.classify.decisiontree import DecisionTreeClassifier
from nltk.classify.naivebayes import NaiveBayesClassifier

tree = DecisionTreeClassifier.train(train_set)
bayes = NaiveBayesClassifier.train(train_set)

# Report held-out accuracy for both models.
from nltk.classify import accuracy
print("Decision Tree accuracy: ", accuracy(tree, test_set))
print("Naive Bayes accuracy: ", accuracy(bayes, test_set))
def f(x):
    """Train and return a binary, verbose NLTK decision-tree classifier on *x*."""
    model = DecisionTreeClassifier.train(x, binary=True, verbose=True)
    return model
# Example 4
                    ('bag_of_words',
                     Pipeline([
                         ('selector', FeatureGetter(train_entries)),
                         ('vect',
                          CountVectorizer(ngram_range=(1, 1),
                                          max_features=500)),
                         ('tfidf', TfidfTransformer(use_idf=True)),
                     ])),
                    # Weights for the features
                ],
                transformer_weights={
                    'feature_1': 1.5,
                    'feature_2': 0.5,
                    'feature_3': 0.5,
                    'feature_4': 1.5,
                },
            )),
        # Decision tree used as classifier
        ('clf', DecisionTreeClassifier(max_features=500)),
    ])

pipeline_pickle_path = 'data/pipeline.pickle'
print("About to run the pipeline...")

# Convert the raw splits into entry/label lists.
train_data = sp.convertDataToList(sp.train)
# BUG FIX: the "test" split was previously built from sp.train as well,
# so evaluation ran against the training data.
# NOTE(review): assumes `sp.test` holds the held-out split — confirm
# against the helper module.
test_data = sp.convertDataToList(sp.test)
train_entries, train_langs = sp.returnEntriesWithSpoken(train_data)
test_entries, test_langs = sp.returnEntriesWithSpoken(test_data)

pipeline.fit(train_entries, train_langs)

# BUG FIX: the pipeline was pickled *before* fitting, persisting an
# unfitted model; dump it after fit, with the file handle closed.
with open(pipeline_pickle_path, 'wb') as fh:
    pickle.dump(pipeline, fh)

# BUG FIX: predictions were made on train_entries but scored against
# test_langs, and classification_report's arguments were swapped —
# sklearn expects (y_true, y_pred).
y_pred = pipeline.predict(test_entries)
print(classification_report(test_langs, y_pred))
	if (len(elem[0]) > 3):
		negative.append(elem)

combined = list(set(positive + negative))
feature_set = []
for i, elem in enumerate(total_text):
	# elem is the words in a document
	features = {}
	for word, number in combined:
		features['contains({})'.format(word)] = (word in elem)
	pos_or_neg = "positive" if (stars_array[i] >= 3) else "negative"
	feature_set.append((features, pos_or_neg))

print "Finished Featurization. It took " + str(time.time() - start_time) + " to run."

cutoff = int(len(feature_set) * 3/4)
training = feature_set[:cutoff]
testing = feature_set[cutoff:]

classifier = DecisionTreeClassifier.train(training, binary=True, verbose=True)

count = 0
for elem in testing:
	val = classifier.classify(elem[0])
	if val == elem[1]:
		count += 1

print float(count)/len(testing)