def treina_classificadores():
    posdados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_1.data', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            posdados.append(val[0])
    negdados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_0.data', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            negdados.append(val[0])
    neudados = []
    with open('./dadostreino/train_EPTC_POA_v3nbal_2.data', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            neudados.append(val[0])
    negfeats = [(bag_of_words(f), 'neg') for f in divide(negdados)]
    posfeats = [(bag_of_words(f), 'pos') for f in divide(posdados)]
    neufeats = [(bag_of_words(f), 'neu') for f in divide(neudados)]
    treino = negfeats + posfeats + neufeats
    # Maximum Entropy
    classificadorME = MaxentClassifier.train(treino, 'GIS', trace=0,
                                             encoding=None, labels=None,
                                             gaussian_prior_sigma=0, max_iter=1)
    # SVM
    classificadorSVM = SklearnClassifier(LinearSVC(), sparse=False)
    classificadorSVM.train(treino)
    # Naive Bayes
    classificadorNB = NaiveBayesClassifier.train(treino)
    return [classificadorME, classificadorSVM, classificadorNB]

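# A minimal usage sketch (an assumption, not part of the original source): it
# presumes the .data files above exist and that bag_of_words maps a token list
# to a feature dict, as the code implies.
if __name__ == '__main__':
    classificadorME, classificadorSVM, classificadorNB = treina_classificadores()
    exemplo = bag_of_words(['transito', 'lento', 'na', 'avenida'])
    print(classificadorNB.classify(exemplo))
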
def test_svc_returns_correct_result(self):
    train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
                  ({"a": 5, "b": 2, "c": 1}, "ham"),
                  ({"a": 0, "b": 3, "c": 4}, "spam"),
                  ({"a": 5, "b": 1, "c": 1}, "ham"),
                  ({"a": 1, "b": 4, "c": 3}, "spam")]
    classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
    test_data = [{"a": 3, "b": 2, "c": 1},
                 {"a": 0, "b": 3, "c": 7}]
    ccm = classif.classify_many(test_data)
    self.assertEqual(ccm, ['ham', 'spam'])

def test_bernoullinb_returns_correct_result(self):
    train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
                  ({"a": 5, "b": 2, "c": 1}, "ham"),
                  ({"a": 0, "b": 3, "c": 4}, "spam"),
                  ({"a": 5, "b": 1, "c": 1}, "ham"),
                  ({"a": 1, "b": 4, "c": 3}, "spam")]
    classif = SklearnClassifier(BernoulliNB()).train(train_data)
    test_data = [{"a": 3, "b": 2, "c": 1},
                 {"a": 0, "b": 3, "c": 7}]
    ccm = classif.classify_many(test_data)
    self.assertEqual(ccm, ['ham', 'spam'])

def LG_gender(train_set, test_set):
    print('== SkLearn MaxEnt ==')
    from nltk.classify import SklearnClassifier
    from sklearn.linear_model import LogisticRegression
    sklearn_classifier = SklearnClassifier(
        LogisticRegression(C=10e5)).train(train_set)
    print(sklearn_classifier.prob_classify(gender_features('mark'))._prob_dict)
    print(nltk.classify.accuracy(sklearn_classifier, test_set))

class MachineLearningNLP:
    def __init__(self, classifier_type='NaiveBayes', feats=word_feats):
        # "Thumbs up? Sentiment Classification using Machine Learning Techniques"
        classifier_list = ['NaiveBayes', 'MaximumEntropy', 'SVM']
        if classifier_type in classifier_list:
            self.classifier_type = classifier_type
        else:
            # Fail fast instead of continuing with an undefined classifier_type.
            raise ValueError("Classifier Type is not implemented: " + classifier_type)
        if self.classifier_type == 'MaximumEntropy':
            self.classifier = MaxentClassifier
        elif self.classifier_type == 'SVM':
            self.classifier = SklearnClassifier(LinearSVC(), sparse=False)
        elif self.classifier_type == 'NaiveBayes':
            self.classifier = NaiveBayesClassifier
        self.feats = feats

    def convert_txt(self, file_neg, file_pos):
        negfeats = list(map(self.feats, word_preprocess(file_neg)))
        posfeats = list(map(self.feats, word_preprocess(file_pos)))
        negfeats = list(zip(negfeats, ['neg'] * len(negfeats)))
        posfeats = list(zip(posfeats, ['pos'] * len(posfeats)))
        # negfeats = [(self.feats(f), 'neg') for f in word_preprocess(file_neg)]
        # posfeats = [(self.feats(f), 'pos') for f in word_preprocess(file_pos)]
        return (negfeats, posfeats)

    def train(self, train_data, **kwargs):
        self.classifier = self.classifier.train(train_data, **kwargs)

    def predict(self, test_data):
        return [self.classifier.classify(feats) for feats, label in test_data]

    def annotate(self, text):
        assert isinstance(text, str)
        text_encoded = self.feats(text.split())
        return self.classifier.classify(text_encoded)

    def performance(self, test_data):
        prediction = self.predict(test_data)
        pos_loc = set(i for i in range(len(prediction)) if prediction[i] == 'pos')
        neg_loc = set(range(len(prediction))) - pos_loc
        pos_ref = set(i for i in range(len(prediction)) if test_data[i][1] == 'pos')
        neg_ref = set(range(len(prediction))) - pos_ref
        print('===============================\n')
        print('Model Summary:\n')
        print(self.classifier_type + ' with features ' + self.feats.__name__ + '\n')
        print('Overall Accuracy: %.3f\n' % nltk.classify.util.accuracy(self.classifier, test_data))
        print('Positive Precision: %.3f\n' % nltk.precision(pos_ref, pos_loc))
        print('Positive Recall: %.3f\n' % nltk.recall(pos_ref, pos_loc))
        print('Negative Precision: %.3f\n' % nltk.precision(neg_ref, neg_loc))
        print('Negative Recall: %.3f\n' % nltk.recall(neg_ref, neg_loc))

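# Hypothetical end-to-end usage of MachineLearningNLP, assuming word_feats and
# word_preprocess are defined as elsewhere in this codebase and that
# 'neg.txt'/'pos.txt' are illustrative file names:
model = MachineLearningNLP(classifier_type='SVM', feats=word_feats)
negfeats, posfeats = model.convert_txt('neg.txt', 'pos.txt')
train_data = negfeats[:900] + posfeats[:900]
test_data = negfeats[900:] + posfeats[900:]
model.train(train_data)
model.performance(test_data)
print(model.annotate("what a wonderful movie"))
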
class ModelGenerator(object):
    def __init__(self):
        self.pre_pro = TweetPreprocessor()
        self.classifier = SklearnClassifier(MultinomialNB(alpha=1.375))
        neg_twts = [(self.process_tweet(twt), "negative")
                    for twt in twitter_samples.strings('negative_tweets.json')]
        pos_twts = [(self.process_tweet(twt), "positive")
                    for twt in twitter_samples.strings('positive_tweets.json')]
        all_twts = neg_twts + pos_twts
        acc_scores, confusion_matrix = self.cross_validate(self.classifier, all_twts, 10)
        self.classifier.train(all_twts)
        print("Initialised classifier with an accuracy of {:.2f}%, +/- {:.2f}%"
              .format(mean(acc_scores) * 100, stdev(acc_scores) * 2 * 100))
        print("Confusion matrix: \n{}".format(confusion_matrix))

    def process_tweet(self, tweet):
        words = self.pre_pro.tokenise_tweet(tweet)
        words_wo_htgs = [self.pre_pro.strip_hash(word) for word in words]
        useful_words = [w for w in words_wo_htgs if self.pre_pro.is_useful_word(w)]
        stemmed_words = [self.pre_pro.stem(word) for word in useful_words]
        return self.pre_pro.create_word_features(stemmed_words)

    def persist(self):
        with open("model.p", "wb") as model_file:
            pickle.dump(self.classifier, model_file)

    @staticmethod
    def cross_validate(algo, data, num_folds):
        acc_scores = []
        predicted_results = []
        actual_results = []
        for i in range(0, num_folds):
            train_data = copy(data)
            # Stratify the data by picking out every nth element, with increasing offset.
            test_data = train_data[i::num_folds]
            # Remove the test data from the training dataset.
            del train_data[i::num_folds]
            trained_algo = algo.train(train_data)
            accuracy = nltk.classify.util.accuracy(trained_algo, test_data)
            acc_scores.append(accuracy)
            for td in test_data:
                predicted_results.append(trained_algo.classify(td[0]))
                actual_results.append(td[1])
        confusion_mat = nltk.ConfusionMatrix(actual_results, predicted_results)
        return acc_scores, confusion_mat

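# Usage sketch (assumes the NLTK twitter_samples corpus has been downloaded
# and a TweetPreprocessor implementation is importable):
gen = ModelGenerator()   # trains and cross-validates on construction
gen.persist()            # writes model.p next to the script
with open("model.p", "rb") as f:
    model = pickle.load(f)
print(model.classify(gen.process_tweet("Loving this sunny weather!")))
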
def searchNuSVC_classifier(title, train_departments):
    """
    Nu-Support Vector Classification.
    :param title:
    :param train_departments:
    :return:
    """
    classifier = SklearnClassifier(NuSVC())
    classifier.train(train_departments)
    test_sent_features = word_feats(title)
    return classifier.classify(test_sent_features)

def read(filename):
    fp = open(filename, "r")
    f = fp.readlines()
    vocab = [s.encode('utf-8').split() for s in f]
    voc_vec = word2vec.Word2Vec(vocab, min_count=1, size=4)
    fp.close()
    # Opening the data file
    fp = open("test_data.txt", "r")
    f = fp.read()
    tokens = nltk.word_tokenize(f)
    D = OrderedDict()
    sentences = []
    for word in tokens[0:200]:
        D[word.split("|")[0]] = word.split("|")[1]
        sentences.append(word.split("|")[0])
    train_data = []
    for key in D:
        l = voc_vec[key]
        x = {'a': l[0], 'b': l[1], 'c': l[2], 'd': l[3]}
        train_data.append((x, D[key]))
    classif = SklearnClassifier(BernoulliNB()).train(train_data)
    test_data = []
    D2 = OrderedDict()
    for word in tokens[200:300]:
        D2[word.split("|")[0]] = word.split("|")[1]
    expected_list = []
    for key in D2:
        l = voc_vec[key]
        x = {'a': l[0], 'b': l[1], 'c': l[2], 'd': l[3]}
        test_data.append(x)
        expected_list.append(D2[key])
    predicted = classif.classify_many(test_data)
    print len(predicted)
    print len(expected_list)
    print accuracy_score(expected_list, predicted, normalize=False)

def predict_nltk(in_text='', n=2):
    '''
    Text language classification.
    Uses scikit-learn classifiers from within NLTK to classify new text
    based on the training set.
    '''
    trainingset = []
    for label in text:
        features = text_features(text[label])
        trainingset.append((features, label))
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    in_features = text_features(in_text, n=n)
    lang = classifier.classify(in_features)
    print 'Language:', lang

class LinearSVC2Model(SKLearnModel):
    """This model classifies tweets into any one of twenty classes using SVM classification."""

    def __init__(self, balanced=False, C=1.0, dual=True, tol=1e-4,
                 max_iter=1000, loss="squared_hinge") -> None:
        # Set up the tweet tokenizer; note this is the same as in our baseline.
        # For a full description check out the model_naive_bayes_baselines source file.
        self.tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True,
                                        strip_handles=True).tokenize
        # Set class_weight to None unless 'balanced' has been set to true in the config.
        class_weight = None  # type: Optional[str]
        if balanced:
            class_weight = "balanced"
        # Create the pipeline for the classifier.
        # The TfidfTransformer is the same as in our baseline; see the
        # model_naive_bayes_baselines source file for a full description.
        # LinearSVC sets up a linear Support Vector Machine classifier. It differs
        # from SVC with a linear kernel in that it uses liblinear as a backend
        # instead of libsvm, which makes it run a lot faster.
        pipeline = Pipeline([('tfidf', TfidfTransformer()),
                             ('linearsvc', LinearSVC(class_weight=class_weight, C=C,
                                                     dual=dual, tol=tol,
                                                     max_iter=max_iter, loss=loss))])
        self.classif = SklearnClassifier(pipeline)

    @staticmethod
    def get_extra_configs():
        # Add configs for 'balanced' and the LinearSVC hyperparameters.
        configs = [{"name": "balanced", "default": False},
                   {"name": "C", "default": 1.0},
                   {"name": "dual", "default": True},
                   {"name": "tol", "default": 1e-4},
                   {"name": "max_iter", "default": 1000},
                   {"name": "loss", "default": "squared_hinge"}]
        return super(LinearSVC2Model, LinearSVC2Model).get_extra_configs() + configs

    def train(self, tweets: List[Tweet]) -> None:
        def tweet_to_tuple(x):
            return (FreqDist(self.tokenizer(x.text)), x.emoji)

        # Generate tuples of all the tweets to form the corpus.
        corpus = map(tweet_to_tuple, tweets)
        # Train this model!
        self.classif.train(corpus)

    def predict(self, text):
        return self.classif.classify(FreqDist(self.tokenizer(text)))

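# Hypothetical usage, assuming a Tweet type with .text and .emoji attributes
# as the type hints above imply:
model = LinearSVC2Model(balanced=True)
model.train([Tweet(text="so happy today", emoji="joy"),
             Tweet(text="stuck in traffic again", emoji="anger")])
print(model.predict("happy happy happy"))
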
def __init__(self, load_clf=False, load_tr_data=False):
    self.features = self.__load_support_vector_features()
    self.training_data = []
    self.n_samples = 0
    self.all_tweets = self.__load_tweets_from_file()  # list, not dict
    # Classifier loading
    if load_clf:
        self.load_clf()
    else:
        self.clf = SklearnClassifier(SVC(), sparse=False)
    # Training data loading
    if load_tr_data:
        self.__load_training_data()

def run_program(is_testing, mode):
    """########## CHECKING WHAT THE PROGRAM IS GOING TO EXECUTE ##########"""
    print(" ")
    print(print_vals(is_testing, mode))
    if is_testing:
        file_path = 'Data/datasets/test.csv'
    else:
        file_path = 'Data/datasets/training.csv'
    load_csv(file_path, mode)
    features = feature_choices()
    number_of_labels = int(len(labels))
    weighted_data = select_features(features)
    print("Training Classifier: ")
    classifier = SklearnClassifier(
        LinearSVC(loss='squared_hinge', max_iter=999999)).train(weighted_data)
    # make_predictions()
    return None

def trainClassifier(trainData):
    pipeline = Pipeline([('svc', LinearSVC(C=0.01, class_weight=None, dual=True,
                                           fit_intercept=True, intercept_scaling=1,
                                           loss='squared_hinge', max_iter=1000,
                                           multi_class='ovr', penalty='l2',
                                           random_state=0, tol=0.0001, verbose=0))])
    return SklearnClassifier(pipeline).train(trainData)

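# Example call: SklearnClassifier expects (feature-dict, label) pairs, so a
# minimal invocation (with made-up features) could look like this:
trained = trainClassifier([({"fast": 2, "slow": 0}, "pos"),
                           ({"fast": 0, "slow": 3}, "neg")])
print(trained.classify({"fast": 1, "slow": 0}))
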
def __init__(self, kernel: str = "") -> None:
    # Set up the tweet tokenizer; note this is the same as in our baseline.
    # For a full description check out the model_naive_bayes_baselines source file.
    self.tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True,
                                    strip_handles=True).tokenize
    # Create the pipeline for the classifier.
    # The TfidfTransformer is the same as in our baseline; see the
    # model_naive_bayes_baselines source file for a full description.
    # The SVC sets up a Support Vector Machine classifier with the configured
    # kernel, in this case either a linear or a radial basis function kernel.
    # The details for the above items are discussed in the model's readme.
    pipeline = Pipeline([('tfidf', TfidfTransformer()),
                         ('{}svc'.format(kernel), SVC(kernel=kernel))])
    self.classif = SklearnClassifier(pipeline)

def m_train():
    train = []
    with codecs.open('data/train_chunked_double.data', mode='r', encoding='UTF-8') as file:
        for line in file.readlines():
            line = line.strip('\n')
            line = line.strip('\r')
            pair = line.split(',')
            e = pair[0]
            z = pair[1]
            for j in range(len(z)):
                x = gen_x(e, z, j)
                y = z[j]
                train.append((x, y))
    try:
        clas = SklearnClassifier(
            LogisticRegression(solver='lbfgs', n_jobs=-1, max_iter=200)).train(train)
        save_model(clas)
        return clas
    except Exception as e:
        print('Error: %r' % e)
        return None

def train_and_save_model(data_set_name="NB_Model_Tatoeba_", n=2):
    trainingset = []
    for i, label in enumerate(targets):
        features = text_features(data[i], n)
        trainingset.append((features, label))
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    save(data_set_name + str(n) + "n", classifier)
    return classifier

def train_using_SklearnClassifier(self, training_data, test_data):
    # Gives bad results. Don't use.
    classifier = SklearnClassifier(BernoulliNB()).train(training_data)
    classifier2 = SklearnClassifier(SVC(), sparse=False).train(training_data)
    print(classifier)
    classifier_name = type(classifier).__name__
    training_set_accuracy = nltk.classify.accuracy(classifier, training_data)
    training_set_accuracy2 = nltk.classify.accuracy(classifier2, training_data)
    test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
    test_set_accuracy2 = nltk.classify.accuracy(classifier2, test_data)
    print(">>>>>>>>")
    print(training_set_accuracy, test_set_accuracy)
    print(training_set_accuracy2, test_set_accuracy2)
    return classifier, classifier_name, test_set_accuracy, training_set_accuracy

def _train(self):
    pickle_filename = "{0}.pickle".format(self.__class__.__name__)
    if os.path.isfile(pickle_filename):
        with open(pickle_filename, "rb") as classifier_f:
            self._classifier = pickle.load(classifier_f)
    else:
        train_set = [(self._extract_features(cascade), cascade['label'])
                     for cascade in self._dataset]
        gbc_clf = GradientBoostingClassifier(n_estimators=1000)
        self._classifier = SklearnClassifier(gbc_clf, sparse=False).train(train_set)
        with open(pickle_filename, "wb") as save_classifier:
            pickle.dump(self._classifier, save_classifier)

def _train(self):
    pickle_filename = "{0}.pickle".format(self.__class__.__name__)
    if os.path.isfile(pickle_filename):
        with open(pickle_filename, "rb") as classifier_f:
            self._classifier = pickle.load(classifier_f)
    else:
        train_set = [(self._extract_features(cascade), cascade['label'])
                     for cascade in self._dataset]
        pipeline = Pipeline([('tfidf', TfidfTransformer()),
                             ('chi2', SelectKBest(chi2, k=1000)),
                             ('svc', SVC(kernel='linear', probability=True))])
        self._classifier = SklearnClassifier(pipeline, sparse=False).train(train_set)
        with open(pickle_filename, "wb") as save_classifier:
            pickle.dump(self._classifier, save_classifier)

def main():
    """Main."""
    from sklearn.svm import SVC
    from nltk.classify import SklearnClassifier

    classifier = SklearnClassifier(SVC(kernel="rbf"), sparse=False)
    _train(classifier)
    _test(classifier)

def leaveKOutValidation(k=1):
    accuracy = 0.0
    print("Performing leave-" + str(k) + "-out cross-validation")
    gamesClusters = [feats[int(i * k):int((i + 1) * k)]
                     for i in range(int(len(feats) / k))]
    for games in gamesClusters:
        training = [x for x in feats if x not in games]
        pipeline = Pipeline([('tfidf', TfidfTransformer()),
                             # ('chi2', SelectKBest(chi2, k=250)),
                             ('nb', MultinomialNB())])
        classifier = SklearnClassifier(pipeline).train(training)
        for game in games:
            classification = classifier.classify(game[0])
            accuracy += int((game[1] > 0) == (classification > 0)) / float(len(feats))
    print("With leave-" + str(k) + "-out cross-validation, the algorithm is "
          + str(round(accuracy * 100, 4)) + "% accurate")

def LG_gender(train_set):
    print('== SkLearn MaxEnt ==')
    from nltk.classify import SklearnClassifier
    from sklearn.linear_model import LogisticRegression
    sklearn_classifier = SklearnClassifier(
        LogisticRegression(C=10e5)).train(train_set)
    return sklearn_classifier

def evaluate_classifier(featx, collocationFunc):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(featx(movie_reviews.words(fileids=[f]), collocationFunc), 'neg')
                for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f]), collocationFunc), 'pos')
                for f in posids]
    lenNegFeats = len(negfeats)
    lenPosFeats = len(posfeats)
    negcutoff = int(lenNegFeats * 3 / 4)
    poscutoff = int(lenPosFeats * 3 / 4)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:lenNegFeats] + posfeats[poscutoff:lenPosFeats]
    #classifier = MaxentClassifier.train(trainfeats)
    classifier = SklearnClassifier(BernoulliNB()).train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    evaluationMetrics = {}
    print(classifier)
    evaluationMetrics['accuracy'] = nltk.classify.util.accuracy(classifier, testfeats)
    evaluationMetrics['posPrec'] = nltk.precision(refsets['pos'], testsets['pos'])
    evaluationMetrics['posRecall'] = nltk.recall(refsets['pos'], testsets['pos'])
    evaluationMetrics['posF_Score'] = nltk.f_measure(refsets['pos'], testsets['pos'])
    evaluationMetrics['negPrec'] = nltk.precision(refsets['neg'], testsets['neg'])
    evaluationMetrics['negRecall'] = nltk.recall(refsets['neg'], testsets['neg'])
    evaluationMetrics['negF_Score'] = nltk.f_measure(refsets['neg'], testsets['neg'])
    return evaluationMetrics

def bag_of_words_model(df, column_name, target='label', k=1000):
    """Train a tf-idf + chi2 + MultinomialNB pipeline on a bag-of-words column."""
    pos_array = df[(df[target] == 1)][column_name].values
    neg_array = df[(df[target] == 0)][column_name].values
    pipeline = Pipeline([('tfidf', TfidfTransformer()),
                         ('chi2', SelectKBest(chi2, k=k)),
                         ('nb', MultinomialNB())])
    clf = SklearnClassifier(pipeline)
    pos = [FreqDist(word_list) for word_list in pos_array]
    neg = [FreqDist(word_list) for word_list in neg_array]
    add_label = lambda lst, lab: [(x, lab) for x in lst]
    trained_clf = clf.train(add_label(pos, 1) + add_label(neg, 0))
    return trained_clf

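# Usage sketch, assuming a pandas DataFrame whose text column holds token
# lists and whose 'label' column is 0/1 (k is lowered to fit the tiny example
# vocabulary):
import pandas as pd
df = pd.DataFrame({'tokens': [['good', 'great'], ['awful', 'bad']],
                   'label': [1, 0]})
clf = bag_of_words_model(df, 'tokens', k=2)
print(clf.classify(FreqDist(['great', 'good'])))
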
def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']
    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0,
                                                encoding=None, labels=None,
                                                gaussian_prior_sigma=0, max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)
        accuracy = nltk.classify.util.accuracy(classifier, testfeats)

def train(self, observations, k=5):
    '''An ensemble K-fold classifier.'''
    self.forest = []
    splitdata = np.array_split(observations, k)
    combos = list(reversed(list(itertools.combinations(splitdata, k - 1))))
    accuracy_sum = 0
    for i in range(k):
        train = list(itertools.chain(*combos[i]))
        test = splitdata[i]
        if k == 1:
            train = observations
            test = observations
        c = SklearnClassifier(RandomForestClassifier())
        #c = SklearnClassifier(cls)
        c.train(train)
        accuracy_sum += nltk.classify.accuracy(c, test)
        self.forest.append(c)
    print('Accuracy on train data (using K-fold) =', accuracy_sum / k)

def train_scikit_model(best_features, feature_set, split_name, classifier_name):
    # Train on the training data of word_features.
    # Find which classifier model to use.
    if classifier_name == "nb":
        cls = nltk.classify.NaiveBayesClassifier.train(best_features)
    elif classifier_name == "nb_sk":
        cls = SklearnClassifier(BernoulliNB()).train(best_features)
    elif classifier_name == "dt":
        cls = nltk.classify.DecisionTreeClassifier.train(best_features)
    elif classifier_name == "dt_sk":
        cls = SklearnClassifier(tree.DecisionTreeClassifier()).train(best_features)
    elif classifier_name == "svm_sk" or classifier_name == "svm":
        cls = SklearnClassifier(svm.SVC()).train(best_features)
    else:
        assert False, "unknown classifier name:{}; known names: nb, dt, svm, nb_sk, dt_sk, svm_sk".format(classifier_name)
    return cls

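# Example call, assuming best_features is a list of (feature-dict, label)
# pairs; feature_set and split_name are unused by the function body:
clf = train_scikit_model([({"win": 1}, "pos"), ({"lose": 1}, "neg")],
                         feature_set=None, split_name="dev",
                         classifier_name="nb_sk")
print(clf.classify({"win": 1}))
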
def searchSGDClassifier_classifier(title, train_departments):
    """
    SGD classifier.
    :param title:
    :param train_departments:
    :return:
    """
    timeTraining = time.time()
    classifier = SklearnClassifier(SGDClassifier(loss='log'))
    classifier.train(train_departments)
    timeTraining = time.time() - timeTraining

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))
    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraining,
    ]

def searchLinearSVC(title, train_departments):
    """
    Linear SVC.
    :param title:
    :param train_departments:
    :return:
    """
    timeTraining = time.time()
    # LinearSVC exposes no predict_proba, so an SVC with a linear kernel is
    # used instead to allow prob_classify below:
    # classifier = SklearnClassifier(LinearSVC(probability=True))
    classifier = SklearnClassifier(SVC(kernel='linear', probability=True))
    classifier.train(train_departments)
    timeTraining = time.time() - timeTraining

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))
    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraining,
    ]

def ml_sentiment(self, text):
    '''Machine learning for sentiment detection.'''
    trainingset = []
    for tweet in self.data:
        trainingset.append(self.sentiment_featrues(tweet))
    #classifier = nltk.NaiveBayesClassifier.train(trainingset)
    #classifier = nltk.DecisionTreeClassifier.train(trainingset)
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    tokenz = self.ml_tag(text, print_tags=False)
    tweet = {'tokens': tokenz, 'sentiment': ''}
    tokenz_features = self.sentiment_featrues(tweet)
    sentiment = classifier.classify(tokenz_features[0])
    tweet['sentiment'] = sentiment
    print '\nTweet:', text
    self.show_tweet(tweet)
    return sentiment

class SentimentMNB(SentimentClassifier):
    # Sub-class constructor
    def __init__(self, chiK=3368):
        # Call the super class constructor, which initializes the classifier.
        self.chiK = chiK
        super(SentimentMNB, self).__init__()

    # Initialize the classifier pipeline.
    def initPipeline(self):
        # A Pipeline chains transformers with a final estimator and behaves
        # like a compound classifier: Pipeline(steps=[...]).
        # Old MNB pipeline with TFIDF:
        # self.pipeline = Pipeline([('tfidf', TfidfTransformer()),
        #                           ('chi2', SelectKBest(chi2, k=1000)),
        #                           ('nb', MultinomialNB())])
        self.pipeline = Pipeline([('chi2', SelectKBest(chi2, k=self.chiK)),
                                  ('nb', MultinomialNB())])

    # Override: train the multinomial NB classifier.
    def trainClassifier(self):
        self.initPipeline()
        # Create the multinomial NB classifier.
        self.classifier = SklearnClassifier(self.pipeline)
        # Train the classifier.
        self.classifier.train(self.trainingSet)

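# Usage sketch, assuming the SentimentClassifier base class lets trainingSet
# be populated with (feature-dict, label) pairs before training (chiK is
# lowered to fit the tiny example vocabulary):
mnb = SentimentMNB(chiK=2)
mnb.trainingSet = [({'great': 2, 'awful': 0}, 'pos'),
                   ({'great': 0, 'awful': 3}, 'neg')]
mnb.trainClassifier()
print(mnb.classifier.classify({'great': 1, 'awful': 0}))
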
# Feature layout: 0-Suffix, 1-Previous number, 2-Next number, 3-Previous
# wordform, 4-Next wordform, 5-Postposition, 6-Present wordform, 7-POS
from preprocess_train import features, number
from preprocess_test import features_test, number_test
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

train_data = [[x for x in range(2)] for y in range(357)]
test_data = [[x for x in range(1)] for y in range(11)]
for i in range(0, 357):
    train_data[i][0] = {'Suffix': features[i][0],
                        'Previous morph': features[i][1],
                        'Next morph': features[i][2],
                        'Previous wordform': features[i][3],
                        'Next wordform': features[i][4],
                        'postposition': features[i][5],
                        'wordform': features[i][6],
                        'pos': features[i][7]}
    train_data[i][1] = number[i]
for i in range(0, 11):
    test_data[i] = {'Suffix': features_test[i][0],
                    'Previous morph': features_test[i][1],
                    'Next morph': features_test[i][2],
                    'Previous wordform': features_test[i][3],
                    'Next wordform': features_test[i][4],
                    'postposition': features_test[i][5],
                    'wordform': features_test[i][6],
                    'pos': features_test[i][7]}

classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
result = classif.classify_many(test_data)
classif1 = SklearnClassifier(BernoulliNB()).train(train_data)
result1 = classif1.classify_many(test_data)
print result1

# Determine training, test and dev set sizes
size = int(round((len(rawData) * 0.15), 0))
random.shuffle(rawData)
testData = rawData[:size]
trainData = rawData[size:]
random.shuffle(trainData)

# Generate a term-frequency distribution for each doc
trainTF = [(FreqDist(tokenize(text)), tag) for text, tag in trainData]
testTF = [(FreqDist(tokenize(text)), tag) for text, tag in testData]

# Create classifier
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('nb', MultinomialNB())])
classif = SklearnClassifier(pipeline)

# Train classifier
classif.train(trainTF)

# Evaluate
testTags = [tag for tf, tag in testTF]
testResults = classif.batch_classify([tf for tf, tag in testTF])
right = 0
for i, tg in enumerate(testTags):
    if testResults[i] == tg:
        right += 1
print 'Results: ------------------------------------'
print testResults

def evaluate_bow():
    lines = codecs.open(BC3_LABELLED_FILE, "r").readlines()
    data = []
    gold = []
    for i, line in enumerate(lines):
        tokens = line.strip().split()
        if len(tokens) > 2:
            label = tokens.pop(0)
            tag = tokens.pop(0)
            if tag == "none":
                continue
            if i + 1 < len(lines) and len(lines[i + 1].strip().split()) > 2:
                next_label = lines[i + 1].strip().split()[0]
            else:
                next_label = "T"
            gold.append(tag)
            data.append((FreqDist(tokens), tag, next_label))
    limit = int(float(len(data)) * 0.8)
    # training set: bags-of-words and tag tuples
    train = [(bow, tag) for bow, tag, next_label in data[:limit]]
    # training the classifier
    classifier = SklearnClassifier(MultinomialNB()).train(train)
    results = {"segmented": [], "unsegmented": []}
    all_choices = []  # all choices made
    choices = []      # choices for the current segment
    nb = 1            # number of lines in the segment
    for i, (bow, tag, next_label) in enumerate(data[limit:]):
        # bow classification
        choice = classifier.classify(bow)
        choices.append(choice)
        all_choices.append(choice)
        # line by line classification for unsegmented results
        results["unsegmented"].append(choice)
        # more complex classification for segmented results
        if next_label == "T":
            most_common = Counter(choices).most_common()
            if len(most_common) > 1:
                tf = FreqDist(all_choices)
                vote = most_common[0][0]
                best = 1
                for candidate, occ in most_common:
                    if tf[candidate] > best:
                        vote = candidate
                        best = tf[candidate]
            else:
                vote, occ = most_common[0]
            results["segmented"] += [vote for choice in choices]
            choices = []
            nb = 1
        else:
            nb += 1  # incrementing the current number of lines in the segment
    for i, label in enumerate(gold[limit:]):
        bow, tag, next_label = data[i + limit]
        print("# {0}\t{1}\t{2}".format(label, results["unsegmented"][i],
                                       results["segmented"][i]))
        if next_label == "T":
            print("# ------------------")
    # segmented metrics
    sp = metrics.precision_score(gold[limit:], results["segmented"])
    sr = metrics.recall_score(gold[limit:], results["segmented"])
    sf = (2.0 * (sr * sp)) / (sr + sp)
    # unsegmented metrics
    up = metrics.precision_score(gold[limit:], results["unsegmented"])
    ur = metrics.recall_score(gold[limit:], results["unsegmented"])
    uf = (2.0 * (ur * up)) / (ur + up)
    print("#")
    print("# Pre.:\t\tRec:\t\tF1:")
    print("# segmented: {0}%\t\t{1}%\t\t{2}%".format(dec(sp * 100), dec(sr * 100), dec(sf * 100)))
    print("# non-segmented: {0}%\t\t{1}%\t\t{2}%".format(dec(up * 100), dec(ur * 100), dec(uf * 100)))

# academic institution
import csv
import numpy as np
from nltk.probability import FreqDist
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipeline = [('tfidf', TfidfTransformer()),
            ('chi2', SelectKBest(chi2, k=20)),
            ('nb', MultinomialNB())]
classif = SklearnClassifier(Pipeline(pipeline))

# just break on gaps -- note that this doesn't filter out punctuation
tokenizer = RegexpTokenizer(r'[\w\d]+')

training_set = []
with open('train_jlm.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        if row[0] != 'OrganisationId':  # skip the header row
            words = tokenizer.tokenize(row[1])
            if row[4] == 'Academic':
                training_set.append((words, 'academic'))
            else:
                training_set.append((words, 'private'))

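# Likely next step (an assumption based on the FreqDist import above):
# SklearnClassifier needs feature dicts rather than raw word lists, so each
# token list would be converted before training.
classif.train([(FreqDist(words), label) for words, label in training_set])
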
""" Linear (Bernoulli) SVC Implementation of Support Vector Machine classifier using libsvm: the kernel can be non-linear but its SMO algorithm does not scale to large number of samples as LinearSVC does. """ from nltk.classify import SklearnClassifier from sklearn.naive_bayes import BernoulliNB from sklearn.svm import SVC print " " print "=============================" print "Bernoulli SVC Classifier:" classifierBi = SklearnClassifier(BernoulliNB()).train(train_set) classifierBi.classify_many(test) for pdist in classifierBi.prob_classify_many(test): print pdist.prob("human"), pdist.prob("auto") for i in range(len(classifierBi.classify_many(test))): print classifierBi.classify_many(test)[i] classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set) classifierSVC.classify_many(test) # svc = nltk.classify.accuracy(classifierSVC, test_set) # print 'accuracy is %.2f' %round(svc*100,4), '%' def SVC(): classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
#region SVMClassifier
WriteLog("\nEntering SVM", ClassificationLogFile)
trainD = list()
testD = list()
gTruth = list()

# Format the data
for dictPair in training_set:
    trainD.append(dictPair)
for dictPair in testing_set:
    testD.append(dictPair[0])
    gTruth.append(dictPair[1])

WriteLog("Starting SVM Training", ClassificationLogFile)
SVMClassifier = SklearnClassifier(SVC(), sparse=False).train(trainD)
SVMPredictions = SVMClassifier.classify_many(testD)
WriteLog("SVM Test Set Accuracy:", ClassificationLogFile)
WriteLog(str(accuracy_score(gTruth, SVMPredictions, normalize=True, sample_weight=None)),
         ClassificationLogFile)

# SVM classification
WriteLog("SVM Classification", ClassificationLogFile)
DoClassify(SVMClassifier, SVMtopicResultsTxt, topicTweetsLDATxt)

# SVM predictions
WriteLog("SVM Predictions:", ClassificationLogFile)
WriteLog(SVMPredictions, ClassificationLogFile)
#endregion

#region NaiveBayes

import numpy as np
from nltk.probability import FreqDist
from nltk.classify import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

#pipeline = Pipeline([('tfidf', TfidfTransformer()),
#                     ('chi2', SelectKBest(chi2, k=1000)),
#                     ('nb', MultinomialNB())])
#classif = SklearnClassifier(pipeline)
classif = SklearnClassifier(MultinomialNB())
add_label = lambda lst, lab: [(x, lab) for x in lst]

import justTry

all_w, per = justTry.getWords(0)
print len(per[0]), len(per[1]), len(per[2]), len(per[3]), len(per[4])

# 90/10 train/test split per class
train1 = (9 * len(per[0])) / 10
train2 = (9 * len(per[1])) / 10
train3 = (9 * len(per[2])) / 10
train4 = (9 * len(per[3])) / 10
train5 = (9 * len(per[4])) / 10

ones = [FreqDist(x) for x in per[0]]
twos = [FreqDist(x) for x in per[1]]
threes = [FreqDist(x) for x in per[2]]

from awesome_print import ap
from nltk import NaiveBayesClassifier
from nltk.util import ngrams
from nltk.metrics import scores
from nltk.classify import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef

pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('nb', MultinomialNB())])
classif = SklearnClassifier(pipeline)


def extract_featurelabel(student):
    features = {}
    bigrams = ngrams(student['Student Comment'], 2)
    for word in student['Student Comment']:
        features['contains (%s)' % word] = word
    for bigram in bigrams:
        features['contains (%s)' % (' '.join(bigram))] = ' '.join(bigram)
    return (features, student['Physician Comment'])
    #return (features, find_student_grade(student['Name']))


def find_student_grade(name):
    # Assumes that a dictionary with each student's name and grade has been created.
    # Only look for the last name because the first name was not recorded for all students.

def NBtfidf():
    classifierTF = SklearnClassifier(pipeline).train(train_set)
    return classifierTF.classify_many(test)

def LinSVC():
    classifierLinSVC = SklearnClassifier(LinearSVC(), sparse=False).train(train_set)
    return classifierLinSVC.classify_many(test)

tweets = []
stop_words = set(stopwords.words('english'))
for (words, sentiment) in train:
    words_filtered = [e.lower() for e in words.split() if e not in stop_words]
    tweets.append((words_filtered, sentiment))
# print tweets
# word_features = get_word_features(get_words_in_tweets(tweets))
# training_set = nltk.classify.apply_features(extract_features, tweets)
training_set = traindict(tweets)
print training_set
# classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier = SklearnClassifier(SVC(), sparse=False).train(training_set)
tweetd = 'I have cows :('
print classifier.classify(dict(Counter(clean(tweetd.lower()))))
# tweetd = 'Obama is boring :('
# print classifier.classify(extract_features(tweetd.lower().split()))

def evaluate_classifier(featx):
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    negcutoff = len(negfeats) * 3 / 4
    poscutoff = len(posfeats) * 3 / 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

    print 'Reading Tweets\n'
    tweets_data_path = '20161019_202620.txt'
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            continue

    tweets = pd.DataFrame()
    tweets['text'] = [tweet.get('text', '') for tweet in tweets_data]
    tdata = tweets['text']
    # Label every unlabelled tweet 'neg' so the accuracy numbers have a reference.
    negfeats = [(featx(f), 'neg') for f in word_split(tdata)]
    testfeats = negfeats
    print np.shape(testfeats)
    #testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']
    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0,
                                                encoding=None, labels=None,
                                                gaussian_prior_sigma=0, max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        #pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
        #pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
        #pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
        #neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
        #neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
        #neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
        print ''
        print '---------------------------------------'
        print 'SINGLE FOLD RESULT ' + '(' + classifierName + ')'
        print '---------------------------------------'
        print 'accuracy:', accuracy

import numpy
import scipy
import nltk
from nltk.classify import maxent

nltk.classify.MaxentClassifier.ALGORITHMS
# ['GIS', 'IIS', 'CG', 'BFGS', 'Powell', 'LBFGSB', 'Nelder-Mead', 'MEGAM', 'TADM']
# MEGAM and TADM are not recommended for text classification
mec = nltk.classify.MaxentClassifier.train(train_features, 'GIS', trace=0, max_iter=1000)

from sklearn import cross_validation
cv = cross_validation.KFold(len(train_features), n_folds=10, indices=True,
                            shuffle=False, random_state=None)
for traincv, evalcv in cv:
    classifier = nltk.NaiveBayesClassifier.train(
        train_features[traincv[0]:traincv[len(traincv) - 1]])
    print 'accuracy: %.3f' % nltk.classify.util.accuracy(
        classifier, train_features[evalcv[0]:evalcv[len(evalcv) - 1]])

import sklearn
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=2000)),
                     ('nb', MultinomialNB())])
pipecl = SklearnClassifier(pipeline)
pipecl.train(train_features)

def find_feature(document):
    words = set(document)
    feature = {}
    for w in words_feature:
        feature[w] = (w in words)
    return feature


features = [(find_feature(rev), category) for (rev, category) in documents]
testing_set = features[1900:]
training_set = features[:1900]

if not os.path.isfile(naivebayes):
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    save_classifier = open(naivebayes, "wb")
    pickle.dump(classifier, save_classifier)
    save_classifier.close()
else:
    classifier_f = open(naivebayes, "rb")
    classifier = pickle.load(classifier_f)
    classifier_f.close()

print("Original Naive Bayes Classifier accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set) * 100))

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("Multinomial Naive Bayes Classifier accuracy percent:",
      (nltk.classify.accuracy(MNB_classifier, testing_set) * 100))

pairs = [(classifier.classify(example), actual) for (example, actual) in test_set]
do_evaluation(pairs)
do_evaluation(pairs, pos_cls='neg')

#%% Other classifier: SVM ###################################################
# http://www.nltk.org/howto/classify.html
# Run example
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

t0 = time.time()
classif = SklearnClassifier(SVC(), sparse=False).train(train_set)
print(round(time.time() - t0, 2))

sizeTrain = [800]       # the first 100, the first 300, etc.
testDoc = [800, 1000]   # 800 to 999

classif.classify_many(test_set[0][0])

#%% SVM Class ################################################################
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC


class SVM:

#neucutoff = len(neufeats)*4/5
length = 4
cutoff0 = len(feats0) * length / 5
cutoff1 = len(feats1) * length / 5
cutoff2 = len(feats2) * length / 5
cutoff3 = len(feats3) * length / 5
cutoff4 = len(feats4) * length / 5

trainfeats = feats0[:cutoff0] + feats1[:cutoff1] + feats2[:cutoff2] + feats3[:cutoff3] + feats4[:cutoff4]
testfeats = feats0[cutoff0:] + feats1[cutoff1:] + feats2[cutoff2:] + feats3[cutoff3:] + feats4[cutoff4:]
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

#classifier = NaiveBayesClassifier.train(trainfeats)
#classifier = nltk.classify.DecisionTreeClassifier.train(trainfeats)
classifier = SklearnClassifier(BernoulliNB()).train(trainfeats)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
#classifier.show_most_informative_features()

results = classifier.batch_classify([fs for (fs, l) in testfeats])
count = 0
'''
with open(loc_submission, "wb") as outfile:
    outfile.write("PhraseID,Sentiment\n")
    for val in results:
        outfile.write("%s,%s\n" % (df_test['PhraseId'][count], val))
        count += 1
'''

def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    inposFeatures = []
    innegFeatures = []
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    # Break up the sentences into lists of individual words (as selected by the
    # input mechanism) and append 'pos' or 'neg' after each list.
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)
    """
    with open(RT_INPUT_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            inposWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            inposWords = [feature_select(inposWords), 'pos']
            inposFeatures.append(inposWords)
    """
    with open(RT_INPUT_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            innegWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            innegWords = [feature_select(innegWords), 'neg']
            innegFeatures.append(innegWords)

    # Select 3/4 of the features for training and 1/4 for testing:
    #posCutoff = int(math.floor(len(posFeatures)*3/4))
    #negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures + negFeatures
    testFeatures = innegFeatures  # + inposFeatures

    # Train a Bernoulli Naive Bayes classifier.
    classifier = SklearnClassifier(BernoulliNB()).train(trainFeatures)
    #classifier = SklearnClassifier(SVC(probability=True), sparse=False).train(trainFeatures)

    # Initiate referenceSets and testSets.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    fileOutput = {'key': [], 'pos': [], 'neg': []}
    # Put correctly labeled sentences in referenceSets and record the predicted
    # class probabilities for each test sentence.
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.prob_classify_many([features])
        for item in predicted:
            fileOutput['key'].append(i)
            fileOutput['pos'].append(item.prob("pos"))
            fileOutput['neg'].append(item.prob("neg"))
    return fileOutput

def train(self, features_label):
    svm = SklearnClassifier(SVC(C=1000.0, gamma=0.0001))
    self._classifier = svm.train(features_label)
    return None

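# Usage sketch, assuming the surrounding (unnamed) class stores the trained
# model on self._classifier; Owner here is a hypothetical stand-in for it:
owner = Owner()
owner.train([({"len": 5}, "short"), ({"len": 50}, "long")])
print(owner._classifier.classify({"len": 7}))
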