def train_sklearn_classifier(self):
    """Train an NLTK decision-tree classifier on self.documents.

    Lazily builds the vocabulary (via _read_csv / _generate_word_features)
    if it is missing, shuffles the documents, splits them into train/test
    by self.test_ratio, fits the classifier on the TRAINING split, and
    prints accuracy against both splits.

    Side effects: shuffles self.documents in place; sets self.classifier.
    """
    # Build the word-feature vocabulary on first use only.
    if not self.feature_words:
        self._read_csv()
        self._generate_word_features()
    shuffle(self.documents)
    feature_sets = [(self.__document_features(tok), lab)
                    for (tok, lab) in self.documents]
    # First `cutoff` shuffled examples are held out for testing.
    cutoff = int(len(feature_sets) * self.test_ratio)
    train_set, test_set = feature_sets[cutoff:], feature_sets[:cutoff]
    print("Totals({0}) Training({1}) Test({2})".format(
        len(feature_sets), len(train_set), len(test_set)))
    # BUG FIX: the classifier was trained on `test_set`, so the "test"
    # accuracy below measured performance on its own training data.
    self.classifier = DecisionTreeClassifier.train(train_set)
    print('Achieved {0:.2f}% accuracy against training set'.format(
        nltk.classify.accuracy(self.classifier, train_set) * 100))
    print('Achieved {0:.2f}% accuracy against test set'.format(
        nltk.classify.accuracy(self.classifier, test_set) * 100))
vocabulary.add(word)  # NOTE(review): tail of a loop that begins above this chunk


def format_dataset(fileids, featureSet):
    """Build a list of (features, label) pairs for the given corpus fileids.

    For each fileid, features is a dict mapping every vocabulary word to
    whether it occurs in that document; the label is the first three
    characters of the fileid (movie_reviews fileids start 'pos/...' or
    'neg/...').
    """
    dataset = list()
    for fileid in fileids:
        # PERF FIX: materialize the document's words once as a set.
        # The original tested `word in movie_reviews.words(fileid)` for
        # every vocabulary word — an O(len(doc)) list scan (and a corpus
        # re-read) per word. Set membership is O(1).
        doc_words = set(movie_reviews.words(fileid))
        features = dict()
        for word in featureSet:
            features[word] = word in doc_words
        pos_or_neg = fileid[:3]
        example = (features, pos_or_neg)
        dataset.append(example)
    return dataset


# Get the datasets ready
train_set = format_dataset(train_fileids, vocabulary)
test_set = format_dataset(test_fileids, vocabulary)

# Create some classifiers
from nltk.classify.decisiontree import DecisionTreeClassifier
tree = DecisionTreeClassifier.train(train_set)
from nltk.classify.naivebayes import NaiveBayesClassifier
bayes = NaiveBayesClassifier.train(train_set)

# Test the classifiers
from nltk.classify import accuracy
print("Decision Tree accuracy: ", accuracy(tree, test_set))
print("Naive Bayes accuracy: ", accuracy(bayes, test_set))
def f(x):
    """Fit and return an NLTK decision tree over the labelled feature set *x*.

    Trains with binary splits and verbose progress output enabled.
    """
    classifier = DecisionTreeClassifier.train(x, binary=True, verbose=True)
    return classifier
('bag_of_words', Pipeline([ ('selector', FeatureGetter(train_entries)), ('vect', CountVectorizer(ngram_range=(1, 1), max_features=500)), ('tfidf', TfidfTransformer(use_idf=True)), ])), # Weights for the features ], transformer_weights={ 'feature_1': 1.5, 'feature_2': 0.5, 'feature_3': 0.5, 'feature_4': 1.5, }, )), # Decision tree used as classifier ('clf', DecisionTreeClassifier(max_features=500)), ]) pipeline_pickle_path = 'data/pipeline.pickle' pickle.dump(pipeline, open(pipeline_pickle_path, 'wb')) print("About to run the pipeline...") train_data = sp.convertDataToList(sp.train) test_data = sp.convertDataToList(sp.train) train_entries, train_langs = sp.returnEntriesWithSpoken(train_data) test_entries, test_langs = sp.returnEntriesWithSpoken(test_data) pipeline.fit(train_entries, train_langs) y = pipeline.predict(train_entries) print(classification_report(y, test_langs))
if (len(elem[0]) > 3): negative.append(elem) combined = list(set(positive + negative)) feature_set = [] for i, elem in enumerate(total_text): # elem is the words in a document features = {} for word, number in combined: features['contains({})'.format(word)] = (word in elem) pos_or_neg = "positive" if (stars_array[i] >= 3) else "negative" feature_set.append((features, pos_or_neg)) print "Finished Featurization. It took " + str(time.time() - start_time) + " to run." cutoff = int(len(feature_set) * 3/4) training = feature_set[:cutoff] testing = feature_set[cutoff:] classifier = DecisionTreeClassifier.train(training, binary=True, verbose=True) count = 0 for elem in testing: val = classifier.classify(elem[0]) if val == elem[1]: count += 1 print float(count)/len(testing)