def main():
    me = Classifier()
    feature_counter = Counter()
    feature_set = pickle.load(open('validation_set.pkl', 'rb'))
    feature_set_labels = []
    for tweet, rating in feature_set:
        print rating
        try:
            float(rating)
        except ValueError:
            continue
        if float(rating) > 0:
            label = 'positive'
        elif float(rating) < 0:
            label = 'negative'
        else:
            label = 'neutral'
        feature_set_labels.append((tweet, label))
    feature_list = chain.from_iterable(
        [word_tokenize(process_tweet(tweet)) for tweet, sentiment in feature_set_labels])
    for feat in feature_list:
        feature_counter[feat] += 1
    me.feature_list = [feat for feat, count in feature_counter.most_common(1000)]
    # train on the relabelled pairs, not the raw (tweet, rating) pairs
    ts = [(me.extract_features(tweet), label) for tweet, label in feature_set_labels]
    print 'training Maxent'
    me.classifier = MaxentClassifier.train(ts)
    return me
def main():
    # INTAKE DATA & BUILD TRAINING/TEST SETS
    reviews_corpus = parseData()
    set_corpus_frequencies(reviews_corpus)
    training_data = reviews_corpus[:26]
    test_data = reviews_corpus[26:]

    # BUILD MAXENT MODEL
    training_set = get_training_feats(training_data)
    classifier = MaxentClassifier.train(training_set)

    # CLASSIFY, EXTRACT & EVAL BY TOPIC
    scores = []
    baselines = []
    for topic in test_data:
        extracted_summary = extract_summary_for_topic(topic, classifier)
        random_summary = random.sample(topic['lines'], len(extracted_summary))
        score = RougeCalculator().score(extracted_summary, topic['gold_std'])
        baseline = RougeCalculator().score(random_summary, topic['gold_std'])
        scores.append(score)
        baselines.append(baseline)
        print "Summary for " + topic['topic'] + ':'
        print ''.join(extracted_summary)
        print "Rouge Score: " + str(score)
    print "Extracted Summary Rouge Average"
    print sum(scores) / len(scores)
    print "Baseline Summary Rouge Average"
    print sum(baselines) / len(baselines)
def trainClassifier(data, config):
    words = []
    labels = []
    for sentenceData in data:
        words += sentenceToDictList(sentenceData[0], config)
        labels += sentenceData[1]
    classifier = MaxentClassifier.train(zip(words, labels), algorithm,
                                        trace=0, max_iter=1000)
    return classifier
def trainMaxentropy(trainFeatures, trainLabels):
    # The original built an unused shorttext MaxEntClassifier and placed NLTK's
    # MaxentClassifier inside a scikit-learn pipeline, which cannot work: NLTK
    # classifiers are not scikit-learn estimators. Multinomial logistic
    # regression (sklearn.linear_model.LogisticRegression) is the equivalent
    # maximum-entropy model and fits the pipeline / cross-validation API.
    clf = make_pipeline(DictVectorizer(sparse=True), LogisticRegression(max_iter=1000))
    scores = cross_val_score(clf, trainFeatures, trainLabels, cv=5)
    clf.fit(trainFeatures, trainLabels)
    return clf, scores.mean(), scores
def trainClassifier(data, config):
    words = []
    labels = []
    wordsSet = set()
    for sentenceDataList in data:
        for sentenceData in sentenceDataList:
            wordsSet |= set(sentenceData[0])
            words += sentenceToDictList(sentenceData[0], config)
            labels += sentenceData[1]
    classifier = MaxentClassifier.train(zip(words, labels), algorithm,
                                        trace=0, max_iter=1000)
    return (classifier, wordsSet)
def train(self):
    for sentence, tags in self.datasource:
        sentence_processed = self.nlp(u' '.join(sentence))
        for token in range(len(sentence)):
            self.featuresets.append(
                (features.feature_compiler(token, sentence_processed), tags[token]))
    train_set, test_set = self.featuresets[0:-1000], self.featuresets[-1000:]
    pprint(train_set[:10])
    self.classifier = MaxentClassifier.train(train_set)
    # Saving the classifier
    self.save()
def main():
    me = Classifier()
    feature_counter = Counter()
    feature_set = pickle.load(open('undersampled_emoticon.pkl', 'rb'))
    feature_list = chain.from_iterable(
        [word_tokenize(process_tweet(tweet)) for tweet, sentiment in feature_set])
    for feat in feature_list:
        feature_counter[feat] += 1
    me.feature_list = [feat for feat, count in feature_counter.most_common(1000)]
    ts = [(me.extract_features(tweet), label) for tweet, label in feature_set]
    print 'training Maxent, algorithm CG'
    me.classifier = MaxentClassifier.train(ts)
    return me
def training(list_filename, model_name):
    # training on a large data set
    context_data = []
    for filename in list_filename:
        json_objects = read_jsonfile(filename)
        for json_object in json_objects:
            context_data.extend(get_context_sentence(json_object, 0))
    print('Done get contexts')
    m = MaxentClassifier.train(context_data, max_iter=100)
    with open(model_name, 'wb') as fmodel:
        pickle.dump(m, fmodel)
    print('Finish training maxent model')
def trainGenderClassifier(model="NB"):
    my_names = ([(name, 'male') for name in names.words('male.txt')] +
                [(name, 'female') for name in names.words('female.txt')])
    shuffle(my_names)
    train_set = [(gender_features(n), g) for (n, g) in my_names]
    if model == "NB":
        nb_classifier = NaiveBayesClassifier.train(train_set)
        joblib.dump(nb_classifier, 'nb_gender_classifier.pkl')
    elif model == "ME":
        me_classifier = MaxentClassifier.train(train_set, "megam")
        joblib.dump(me_classifier, 'me_gender_classifier.pkl')
    else:
        raise ValueError(
            "Enter Model Type: Naive Bayes (NB) or Maximum Entropy (ME)")
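# The "megam" branch above relies on an external MEGAM binary; NLTK has to be
# told where that binary lives before MaxentClassifier.train(..., "megam") can
# call it. A minimal sketch of that setup, assuming a train_set of
# (feature_dict, label) pairs like the one built above; the install path below
# is hypothetical.
import nltk
from nltk.classify import MaxentClassifier

nltk.config_megam('/usr/local/bin/megam')  # hypothetical location of the megam binary
me_classifier = MaxentClassifier.train(train_set, "megam")
# Without the external binary, a pure-Python algorithm works out of the box:
# me_classifier = MaxentClassifier.train(train_set, algorithm='IIS', max_iter=10)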
def train(self, corpus_path, model_path):
    # pickles must be opened in binary mode
    with open(corpus_path, "rb") as corpus_file:
        corpus = pickle.load(corpus_file)
    train_set = []
    for row in corpus:
        sentence = [value for (value, _) in row]
        history = []
        for i, (value, column) in enumerate(row):
            feature_set = self.db_row_features(sentence, i, history)
            train_set.append((feature_set, column))
            history.append(column)
    classifier = MaxentClassifier.train(train_set, max_iter=20)
    with open(model_path, "wb") as model_file:
        pickle.dump(classifier, model_file)
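# Because each training example above is built from the history of gold
# labels, prediction has to rebuild that history greedily from the
# classifier's own outputs. A minimal sketch of the matching decode step,
# reusing the same (hypothetical) db_row_features helper:
def tag(self, sentence, classifier):
    """Greedy left-to-right decoding that mirrors the training loop above."""
    history = []
    for i in range(len(sentence)):
        feature_set = self.db_row_features(sentence, i, history)
        # feed the prediction back in as the history the next features depend on
        history.append(classifier.classify(feature_set))
    return list(zip(sentence, history))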
def model_dev(func_name):
    from nltk.corpus import names
    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(names)
    print "Length of dataset %d" % len(names)
    random.shuffle(names)
    random.shuffle(names)
    print "How the data set looks"
    print names[0:10]
    print "Testing the output of feature extraction"
    print "For name Gary -- %s" % func_name('Gary')
    featuresets = [(func_name(n), g) for (n, g) in names]
    print "length of featureset data %d" % len(featuresets)
    print featuresets[0:10]
    train_set, test_set = featuresets[500:], featuresets[:500]
    print "Length of train data %d" % len(train_set)
    print "length of test data %d" % len(test_set)
    time.sleep(10)
    os.system('clear')

    print "\n\nNaive Bayes Classification\n\n"
    nb_classifier = NaiveBayesClassifier.train(train_set)
    check_list = ['Gary', 'Shivam', 'Grace', 'Sarah', 'Shaym', 'Richa', 'Abhisheyk']
    for name in check_list:
        print "Naive gender classification of ---%s --is-- %s---" % (name, nb_classifier.classify(func_name(name)))
    print "The accuracy of the naive classifier is"
    print classify.accuracy(nb_classifier, test_set)
    print "The most informative features are:"
    print nb_classifier.show_most_informative_features(5)
    time.sleep(10)
    os.system('clear')

    print "\n\nMaxent Classification\n\n"
    mod = MaxentClassifier.train(train_set)
    for name in check_list:
        print "Maxent gender classification of ---%s --is-- %s---" % (name, mod.classify(func_name(name)))
    print "The accuracy of maxent is"
    print classify.accuracy(mod, test_set)
    print "The most informative features are:"
    print mod.show_most_informative_features(5)
def train(features, samples_proportion, classifier_choose):
    train_size = int(len(features) * samples_proportion)
    train_set, test_set = features[:train_size], features[train_size:]
    print('Training set size = ' + str(len(train_set)) + ' emails')
    print('Test set size = ' + str(len(test_set)) + ' emails')
    # Naive Bayes is the default if classifier_choose is not 1-6
    classifier = NaiveBayesClassifier.train(train_set)
    if classifier_choose == 1:
        classifier = NaiveBayesClassifier.train(train_set)
    elif classifier_choose == 2:
        classifier = SklearnClassifier(MultinomialNB()).train(train_set)
    elif classifier_choose == 3:
        classifier = SklearnClassifier(GaussianNB()).train(train_set)
    elif classifier_choose == 4:
        classifier = SklearnClassifier(BernoulliNB()).train(train_set)
    elif classifier_choose == 5:
        classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
    elif classifier_choose == 6:
        # The algorithm can be chosen here, and a third max_iter argument sets
        # the number of iterations; more iterations may improve accuracy
        # (not certain).
        classifier = MaxentClassifier.train(train_set, MaxentClassifier.ALGORITHMS[0])
    return train_set, test_set, classifier
def trainMaxEnt(fp):
    """
    Extract all features from the training JSON file at [fp], train a MaxEnt
    classifier on these features, and return that classifier.
    """
    with open(fp, "r") as fileHandle:
        test_set = json.load(fileHandle)
    maxEntCorpus = []
    print("Extracting features from training file...")
    for title in test_set["data"]:
        for paragraph in title["paragraphs"]:
            sents = sent_tokenize(paragraph["context"])
            for question in paragraph["qas"]:
                q = question["question"]
                simFeature, candSent = genSimFeature(q, sents)
                atFeature = matchAT(extractAT(q), candSent)
                focusFeature = genFocusFeature(q, sents)
                features = {
                    simFeature: True,
                    atFeature: True,
                    focusFeature: True
                }
                if question["is_impossible"]:
                    maxEntCorpus.append((features, 0))
                else:
                    maxEntCorpus.append((features, 1))
    return MaxentClassifier.train(maxEntCorpus, max_iter=30)
def training_weight_iis(self, paragraph):
    train = []
    for index, data in enumerate(paragraph):
        sentence = sent_tokenize(data)  # 1. Split the paragraph into sentences
        for index, data in enumerate(sentence):
            # 2. Convert the sentence to lower case
            sent_lower = data.lower()
            # 3. Convert spelled-out numbers into digits
            sent_conv = self.func.terbilang_to_number(sent_lower)
            print "training words [%s]" % sent_conv
            # 4. Stemming
            tokenize = word_tokenize(sent_conv)
            div_sentence = []
            for data in tokenize:
                if "/" not in data:
                    # reduce to the root (stemmed) word
                    sent_stem = self.stemmer.stem(data)
                    data = sent_stem
                elif "/con" in data:
                    # reduce to the root word, then match against the condition gazetteer
                    sent_stem = self.stemmer.stem(self.w.search(data).group(1))
                    data = sent_stem + "/CON"
                elif "/" in data:
                    word = self.w.search(data).group(1)
                    label = self.lbl.search(data).group(1)
                    data = word + "/" + label.upper()
                div_sentence.append(data)
            train.append(" ".join(div_sentence))
    # print train
    # train on the sentences that have been reduced to their root words
    me_classifier = MaxentClassifier.train(self.binary_feature(train, "train_iis"),
                                           'iis', trace=100, max_iter=2000, min_lldelta=0.5)
    # print me_classifier.show_most_informative_features()
    return me_classifier
    keep_dup=False)
print(len(X_train))

X_test, Y_test = get_data_for_cognitive_classifiers(threshold=[0.75],
                                                    what_type=['ada', 'os', 'bcl'],
                                                    what_for='test',
                                                    keep_dup=False)
print('Loaded/Preprocessed data')

train_set = [(features(X_train[i]), Y_train[i]) for i in range(len(X_train))]
test_set = [(features(X_test[i]), Y_test[i]) for i in range(len(X_test))]

if TRAIN:
    classifier = MaxentClassifier.train(train_set, max_iter=100)
    classifier.predict_proba = classifier.prob_classify
    pickle.dump(classifier,
                open(os.path.join(os.path.dirname(__file__),
                                  'models/MaxEnt/maxent.pkl'), 'wb'))

if not TRAIN:
    classifier = pickle.load(
        open(os.path.join(os.path.dirname(__file__),
                          'models/MaxEnt/maxent_85.pkl'), 'rb'))

pred = []
actual = [x[1] for x in test_set]
if sent:
    check = True
    for char in sent:
        if char not in alphabet:
            check = False
    if check:
        sent = sent + stop_char
        sentences.append(sent)

print('%d sentences after cleanup' % (len(sentences)))
print("")

print('getting data...')
sys.stdout.flush()
train_data = get_training_data(sentences)
print('done.')

print('training model...')
sys.stdout.flush()
model = MaxentClassifier.train(train_data, labels=alphabet)
print('done.')

print('pickling...')
sys.stdout.flush()
with open('maxentmodel.pickle', 'wb') as f:
    pickle.dump(model, f)
print('done')
sys.stdout.flush()

end = time.time()
print('%.2f seconds' % (end - start))
sys.stdout.flush()
    testSet.append(each[0])

# change these ints to change the test entries: should be safe up to 500 or so
# first one is used by bayes, second is used by maxent
a = 10
b = 111

print('Training Naive Bayes')
bayesClassifier = NaiveBayesClassifier.train(trainingSet)
print('Naive Bayes training complete')
print('Naive Bayes most important features:')
bayesClassifier.show_most_informative_features(5)
print('Dialogue:')
print(prelimTestSet[a])
print('Bayes classification:')
print(bayesClassifier.classify(testSet[a]))

print('Training Maximum Entropy')
maxEntClassifier = MaxentClassifier.train(trainingSet, max_iter=30)
print('Maximum Entropy training complete')
print('Maximum Entropy most important features:')
maxEntClassifier.show_most_informative_features(5)
print('Dialogue:')
print(prelimTestSet[b])
print('MaxEnt classification:')
print(maxEntClassifier.classify(testSet[b]))
if len(args) > 0:
    path = args[0]
else:
    print "Usage: python train.py -m <path/to/model/file> path/to/training/data"
    sys.exit(2)

# Check whether the path exists
if not os.path.exists(path):
    print "The path \'%s\' does not exist. Try again!" % path
    sys.exit(2)
elif not os.path.isfile(path):
    print "The path \'%s\' is not a file. Try again!" % path
    sys.exit(2)

# Load dataset
print "Loading training data..."
dataset = np.load(path)

# Training
print "Training Maximum Entropy Model from the dataset \'%s\'" % path
maxent = MaxentClassifier.train(dataset, max_iter=10)

# Save model
print "Saving model into file %s" % model
with io.open(model, 'wb') as fmodel:
    pickle.dump(maxent, fmodel)

# Finished
print "DONE!!"
    f.write(tag)
    f.write("\n")


if __name__ == "__main__":
    # load files
    trainfilePath = "CONLL_NAME_CORPUS_FOR_STUDENTS/CONLL_train.pos-chunk-name"
    testfilePath = "CONLL_NAME_CORPUS_FOR_STUDENTS/CONLL_dev.pos-chunk"
    predictfilePath = "CONLL_NAME_CORPUS_FOR_STUDENTS/CONLL_test.pos-chunk"
    trainwords, trainTokenList = loadTrainData(trainfilePath)
    trainWord2VecFeature = generateWord2Vec(trainwords)
    testwords, testTokenList = loadTestData(testfilePath)
    testWord2VecFeature = generateWord2Vec(testwords)
    predictwords, predictTokenList = loadTestData(predictfilePath)
    predictWord2VecFeature = generateWord2Vec(predictwords)

    # train model
    trainToks = create_trainToks(trainTokenList, trainWord2VecFeature)
    model = MaxentClassifier.train(trainToks)

    # predict
    testFeatureSet = create_testFeatureSet(testTokenList, testWord2VecFeature)
    labels = predict(model, testFeatureSet)
    write_out(labels, "response.name")
    predictFeatureSet = create_testFeatureSet(predictTokenList, predictWord2VecFeature)
    labels = predict(model, predictFeatureSet)
    write_out(labels, "CONLL_test.name")
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import sys
import math
from collections import defaultdict
from nltk import MaxentClassifier

# play outlook temperature humidity windy
# NOTE: NLTK's MaxentClassifier cannot be instantiated without an encoding and
# weights, and it is not used by the from-scratch implementation below.
# maxent = MaxentClassifier()


class MaxEnt:
    def __init__(self):
        self._samples = []              # sample set; each element is a tuple [y, x1, x2, ..., xn]
        self._Y = set([])               # set of labels, i.e. the distinct values of y
        self._numXY = defaultdict(int)  # key is the (xi, yi) pair, value is count(xi, yi)
        self._N = 0                     # number of samples
        self._n = 0                     # total number of (xi, yi) feature pairs
        self._xyID = {}                 # sequential ID for each (x, y) pair; key is (xi, yi), value is the ID
        self._C = 0                     # maximum number of features in any sample, used by the IIS updates
        self._ep_ = []                  # feature expectations under the empirical distribution
        self._ep = []                   # feature expectations under the model distribution
        self._w = []                    # weights for the n features
        self._lastw = []                # weights from the previous iteration
        self._EPS = 0.01                # convergence threshold

    def load_data(self, filename):
        for line in open(filename, "r"):
            sample = line.strip().split("\t")
            if len(sample) < 2:  # need at least a label plus one feature
for index, tuples in df[["words", "pos", "tags"]].iterrows():
    word_tuple, pos_tuple, tag_tuple = tuples
    word_num = 0
    prev_tag = prev_tag1 = prev_tag2 = None
    for word_num in range(len(word_tuple)):
        feature = (extract_features(word_num, word_tuple, pos_tuple,
                                    [prev_tag2, prev_tag1, prev_tag]),
                   tag_tuple[word_num])
        features.append(feature)
        # shift the tag history before recording the current tag
        prev_tag2 = prev_tag1
        prev_tag1 = prev_tag
        prev_tag = tag_tuple[word_num]


# In[11]:

memm_classifier = MaxentClassifier.train(features, "megam")


# ## Testing Phase

# In[12]:

text = open("./goldoutput.txt").read()
lines = [y.strip() for y in text.split("\n\n")]
test_df = pd.DataFrame(lines, columns=["sentence"])
# test_df = dev_df.copy()
test_df.loc[:, "sentence_token"] = test_df["sentence"].apply(
    lambda x: tuple(y.split("\t") for y in x.split("\n")))
test_df.loc[:, "words_"] = test_df["sentence_token"].apply(
    lambda x: tuple(y[1] for y in x))
test_df.loc[:, "pos"] = test_df["words_"].apply(
    lambda x: tuple(x[1] for x in nltk.pos_tag(x)))
def build_model(training_features, preprocessed_validation_data):
    algorithm = MaxentClassifier.ALGORITHMS[0]
    MaxEntClassifier = MaxentClassifier.train(training_features, algorithm, max_iter=10)
    predictions = [MaxEntClassifier.classify(extract_tweet_features(tweet[0]))
                   for tweet in preprocessed_validation_data]
    return MaxEntClassifier, predictions
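# MaxentClassifier.ALGORITHMS[0] selects an algorithm by position; in recent
# NLTK releases that list is ['GIS', 'IIS', 'MEGAM', 'TADM'], so index 0 means
# GIS. Passing the name directly is less fragile. A minimal sketch, assuming
# the same training_features as above:
from nltk.classify import MaxentClassifier

classifier = MaxentClassifier.train(training_features, algorithm='GIS', max_iter=10)
# 'MEGAM' and 'TADM' need external binaries; 'GIS' and 'IIS' are pure Python.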
def main():
    # if preprocessed data was stored previously, just load it
    if os.path.isfile('./data/processed/preptrainingdata.pickle') \
            and os.path.isfile('./data/processed/preptestdata.pickle'):
        preptrainingdata_f = open('./data/processed/preptrainingdata.pickle', 'r')
        preptrainingdata = pickle.load(preptrainingdata_f)
        preptestdata_f = open('./data/processed/preptestdata.pickle', 'r')
        preptestdata = pickle.load(preptestdata_f)
        preptrainingdata_f.close()
        preptestdata_f.close()
    else:
        # preprocess training and test data and store them
        trainingdatapath = './data/original/origintrainingdata.csv'
        testdatapath = './data/original/origintestdata.csv'
        preprocessor = Preprocessor(trainingdatapath, testdatapath)
        [training, test] = preprocessor.read_data(2000, 2000)
        # preprocessing step
        for row in training + test:
            row[0] = preprocessor.preprocess(row[0])
        preptrainingdata = training
        preptestdata = test
        # store preprocessed training data
        save_documents = open('./data/processed/preptrainingdata.pickle', 'w')
        pickle.dump(preptrainingdata, save_documents)
        save_documents.close()
        # store preprocessed test data
        save_documents = open('./data/processed/preptestdata.pickle', 'w')
        pickle.dump(preptestdata, save_documents)
        save_documents.close()

    if os.path.isfile('./data/processed/trainingfeaset.pickle') \
            and os.path.isfile('./data/processed/testfeaset.pickle') \
            and os.path.isfile('./data/processed/word_features.pickle'):
        trainingfeaset_f = open('./data/processed/trainingfeaset.pickle', 'r')
        trainingfeaset = pickle.load(trainingfeaset_f)
        testfeaset_f = open('./data/processed/testfeaset.pickle', 'r')
        testfeaset = pickle.load(testfeaset_f)
        word_features_f = open('./data/processed/word_features.pickle', 'r')
        word_features = pickle.load(word_features_f)
        trainingfeaset_f.close()
        testfeaset_f.close()
        word_features_f.close()
    else:
        # feature extraction and feature set construction and store them
        fea_extractor = FeatureExtractor()
        all_words = []
        for row in preptrainingdata + preptestdata:
            all_words.extend(fea_extractor.getfeavector(row[0]))
        word_features = fea_extractor.getfeatures(all_words, 4000)
        del all_words  # release some memory
        trainingfeaset = [(fea_extractor.construct_feaset(row[0], word_features), row[1])
                          for row in preptrainingdata]
        testfeaset = [(fea_extractor.construct_feaset(row[0], word_features), row[1])
                      for row in preptestdata]
        # random.shuffle(trainingfeaset)
        # random.shuffle(testfeaset)
        save_documents = open('./data/processed/word_features.pickle', 'w')
        pickle.dump(word_features, save_documents)
        save_documents.close()
        save_documents = open('./data/processed/trainingfeaset.pickle', 'w')
        pickle.dump(trainingfeaset, save_documents)
        save_documents.close()
        save_documents = open('./data/processed/testfeaset.pickle', 'w')
        pickle.dump(testfeaset, save_documents)
        save_documents.close()

    # Naive Bayes
    if os.path.isfile('./data/processed/NB_classifier.pickle'):
        NB_classifier_f = open("./data/processed/NB_classifier.pickle", "r")
        NB_classifier = pickle.load(NB_classifier_f)
        NB_classifier_f.close()
    else:
        NB_classifier = nltk.NaiveBayesClassifier.train(trainingfeaset)
        save_classifier = open("./data/processed/NB_classifier.pickle", "w")
        pickle.dump(NB_classifier, save_classifier)
        save_classifier.close()
    print("Naive Bayes Classifier accuracy percent:",
          (nltk.classify.accuracy(NB_classifier, testfeaset)) * 100)
    print NB_classifier.show_most_informative_features(10)

    # Maximum Entropy
    if os.path.isfile('./data/processed/MaxEntClassifier.pickle'):
        MaxEntClassifier_f = open('./data/processed/MaxEntClassifier.pickle', 'r')
        MaxEntClassifier = pickle.load(MaxEntClassifier_f)
        MaxEntClassifier_f.close()
    else:
        MaxEntClassifier = MaxentClassifier.train(trainingfeaset, algorithm='GIS', max_iter=10)
        save_classifier = open("./data/processed/MaxEntClassifier2.pickle", "w")
        pickle.dump(MaxEntClassifier, save_classifier)
        save_classifier.close()
    print "MaxEnt Classifier accuracy percent:", nltk.classify.accuracy(MaxEntClassifier, testfeaset)
    print MaxEntClassifier.show_most_informative_features(10)

    fea_extractor = FeatureExtractor()
    trainingset = fea_extractor.construct_svm_feaset(preptrainingdata, word_features)
    problem = svm_problem(trainingset['labels'], trainingset['feature_vectors'])
    param = svm_parameter('-q')
    param.kernel_type = LINEAR
    svm_classifier = svm_train(problem, param)
    svm_save_model('./data/svm_classifier', svm_classifier)
    testset = fea_extractor.construct_svm_feaset(preptestdata, word_features)
    p_labels, p_accs, p_vals = svm_predict(testset['labels'], testset['feature_vectors'], svm_classifier)
    print p_labels
    print p_accs
def train(self):
    feature_set = list()
    for prop in self.word_prop:
        for feats in self.word_prop[prop]:
            feature_set.append((feats, prop))
    self.model = MaxentClassifier.train(feature_set, "gis", max_iter=10)
## lists
# merge lists: http://stackoverflow.com/questions/252703/python-append-vs-extend
x = [1, 2, 3]
x.append([4, 5])  # append a list as a single item in list x
x.extend([4, 5])  # adds 4 and 5 as separate elements in list x

# frequency count over a list
from collections import Counter
Counter(['apple', 'red', 'apple', 'red', 'red', 'pear'])

## to train a Max Entropy classifier using MegaM
from nltk import MaxentClassifier
nltk.config_megam('/Users/andrewcaines/Downloads/megam_0.92/megam')
classifier = MaxentClassifier.train(trainfeats, 'megam')

## strings with (u'x') for unicode
[item.decode('UTF-8') if isinstance(item, basestring) else item for item in listx]

## range of numbers
range(0, 10)

## sequence of numbers
import numpy
numpy.arange(0, 10, 2)

## average
numpy.mean([1, 2, 3])
features = {} features["firstletter"] = name[0].lower() features["lastletter"] = name[-1].lower() for letter in 'abcdefghijklmnopqrstuvwxyz': features["count(%s)" % letter] = name.lower().count(letter) features["has(%s)" % letter] = (letter in name.lower()) return features def gender_features3(word): return {'suffix1': word[-1:], 'suffix2': word[-2:]} if __name__ == '__main__': print("Lab 3 - Exercise 2") data = get_data() train_set = apply_features(gender_features3, data[500:]) test_set = apply_features(gender_features3, data[:500]) print("Training classifiers") # Train the different classifiers on the training set classifier = [(NaiveBayesClassifier.train(train_set), "NaiveBayes"), (DecisionTreeClassifier.train(train_set), "DecisionTree"), (MaxentClassifier.train(train_set, max_iter=10, trace=0), "MaxEntropy")] # Test all classifiers on the test set for classifier, name in classifier: acc = accuracy(classifier, test_set) print("{} classifier test accuracy: {}".format(name, acc))
def main():
    data = []
    with open('data-1_train.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            data.append(row)
    fields = data[0]
    data = np.array(data[1:], dtype=object)
    print(data.shape, fields)

    words = filterData(data)
    print(words.shape)

    x_train = []
    y_train = []
    x_train_aspect = []
    for i in range(len(data)):
        x_train.append(words[i][1])
        y_train.append(data[i][4])
        x_train_aspect.append(data[i][2])
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_train_aspect = np.array(x_train_aspect)

    print('here')
    print(x_train[0])
    print('here')
    print(y_train[0:10])

    features = Features(x_train, x_train_aspect)
    print('printing features')
    print(features)
    print('Length: ', len(features), type(features))
    features = set(features)
    print('Length2: ', len(features))

    # 10-Fold Cross Validation
    kf = KFold(n_splits=10)
    kf.get_n_splits(x_train)
    for train_index, test_index in kf.split(x_train):
        print(type(train_index))
        print(type(x_train))
        errors = 0
        x_train_kf, x_test_kf = x_train[train_index], x_train[test_index]
        y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]
        print(type(x_train_aspect))
        x_train_aspect_kf = x_train_aspect[train_index]
        fv = Features(x_train_kf, x_train_aspect_kf)
        x_train_maxent = train_data(fv, x_train_kf, y_train_kf)
        print('Train feature vectors created')
        x_test_maxent = test_data(fv, x_test_kf, y_test_kf)
        print('Test feature vectors created')
        mec = MaxentClassifier.train(x_train_maxent)
        print('train finish')
        for featureset, label in zip(x_test_maxent, y_test_kf):
            if mec.classify(featureset) != label:
                errors += 1
        print("Accuracy: %f" % (1 - (errors / float(len(y_test_kf)))))
import pickle

if __name__ == "__main__":
    # init the program, prepare input and output
    in_file = codecs.open(sys.argv[1], encoding='utf-8', mode='r')

    # label our train data
    lines = in_file.readlines()
    labeled_entries = flat_list(map(get_labeled, lines))

    # train a classifier
    mx_classifier = MaxentClassifier.train(labeled_entries)

    # save the classifier to disk
    mx_file = open('mx_classifier.pkl', 'wb')
    pickle.dump(mx_classifier, mx_file)
    mx_file.close()
    in_file.close()

    # show_most_informative_features() prints to stdout itself
    mx_classifier.show_most_informative_features(5)
def _train(self, txs, tys):
    # rid2shard = ST.random_shardlize(10, len(self._train_xs))
    train_set = [(self._feature_encoding(txt), tag) for txt, tag in zip(txs, tys)]
    return MaxentClassifier.train(train_set, algorithm='iis', max_iter=4)
def __init__(self):
    self.train_set, self.test_set = utils.chunked_training_dataset()
    print('Ngram chunk tagger training started')
    self.classifier = MaxentClassifier.train(self.__transformed_training_set())
    print('Ngram chunk tagger training completed')
    else:
        features['left_neighbor_len'] = "0"
        features['left_neighbor_digit'] = "False"
        features['left_neighbor_title'] = "False"
    if nxt != PARAGRAPH:
        features['right_neighbor_len'] = "%s" % len(nxt)
        features['right_neighbor_digit'] = "%s" % isdigit(nxt)
        features['right_neighbor_title'] = "%s" % title(nxt)
        features['paragraph_end'] = "False"
    else:
        features['right_neighbor_len'] = "0"
        features['right_neighbor_digit'] = "False"
        features['right_neighbor_title'] = "False"
        features['paragraph_end'] = "True"
    return features


if __name__ == "__main__":
    tree = "try_train.xml"
    data = collect_classified_data(tree)
    train_set, test_set = data, data
    me_classifier = MaxentClassifier.train(train_set)
    test_ex = test_set[0][0]
    print(test_ex)
    print(me_classifier.classify(test_ex))
print(gender_features('Gary'))
featuresets = [(gender_features(n), g) for (n, g) in names]
print(featuresets[0:10])
train_set, test_set = featuresets[500:], featuresets[:500]
print(len(train_set), len(test_set))

nb_classifier = NaiveBayesClassifier.train(train_set)
print(nb_classifier.classify(gender_features('Gary')))
print(nb_classifier.classify(gender_features('Grace')))
print(classify.accuracy(nb_classifier, test_set))
nb_classifier.show_most_informative_features(5)

me_classifier = MaxentClassifier.train(train_set)
print(me_classifier.classify(gender_features('Gary')))
print(me_classifier.classify(gender_features('Grace')))
print(classify.accuracy(me_classifier, test_set))
me_classifier.show_most_informative_features(5)


def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features
def main():
    # for feature extraction
    fea_extractor = FeatureExtractor()

    # if preprocessed data was stored previously, just load it
    # for what is meant by "preprocessed", refer to the preprocess method in preproc_fea_extraction.py
    if os.path.isfile('./data/processed/preptrainingdata.pickle') \
            and os.path.isfile('./data/processed/preptestdata.pickle'):
        print "preptrainingdata and preptestdata detected, load files..."
        preptrainingdata_f = open('./data/processed/preptrainingdata.pickle', 'r')
        preptrainingdata = pickle.load(preptrainingdata_f)
        preptestdata_f = open('./data/processed/preptestdata.pickle', 'r')
        preptestdata = pickle.load(preptestdata_f)
        preptrainingdata_f.close()
        preptestdata_f.close()
    else:
        print "no preptrainingdata and preptestdata detected, create from scratch..."
        # preprocess training and test data and store them
        trainingdatapath = './data/original/origintrainingdata.csv'
        testdatapath = './data/original/origintestdata.csv'
        preprocessor = Preprocessor(trainingdatapath, testdatapath)
        [training, test] = preprocessor.read_data(2000, 2000)
        print "reading training data and all test data done..."
        print "length of training", len(training)
        # preprocessing step
        for row in training + test:
            row[0] = preprocessor.preprocess(row[0])
        preptrainingdata = training
        preptestdata = test
        print "preprocessing done..."
        # store preprocessed training data
        save_documents = open('./data/processed/preptrainingdata.pickle', 'w')
        pickle.dump(preptrainingdata, save_documents)
        save_documents.close()
        # store preprocessed test data
        save_documents = open('./data/processed/preptestdata.pickle', 'w')
        pickle.dump(preptestdata, save_documents)
        save_documents.close()

    # if the training and test feature sets were stored previously, just load them
    # these feature sets are used by Naive Bayes and Maximum Entropy
    # word_features contains the names of features (which are words)
    # e.g. a word is a feature, the feature name is the word, the value is True or False
    if os.path.isfile('./data/processed/trainingfeaset.pickle') \
            and os.path.isfile('./data/processed/testfeaset.pickle') \
            and os.path.isfile('./data/processed/word_features.pickle'):
        print "trainingfeaset, testfeaset and word_features detected, load files..."
        trainingfeaset_f = open('./data/processed/trainingfeaset.pickle', 'r')
        trainingfeaset = pickle.load(trainingfeaset_f)
        testfeaset_f = open('./data/processed/testfeaset.pickle', 'r')
        testfeaset = pickle.load(testfeaset_f)
        word_features_f = open('./data/processed/word_features.pickle', 'r')
        word_features = pickle.load(word_features_f)
        trainingfeaset_f.close()
        testfeaset_f.close()
        word_features_f.close()
    else:
        print "no trainingfeaset, testfeaset and word_features detected, create from scratch..."
        # feature extraction and feature set construction and store them
        all_words = []
        for row in preptrainingdata + preptestdata:
            all_words.extend(fea_extractor.get_feavector(row[0]))
        print "generating all_words done..."
        print "start generating word_features..."
        # set the desired number of features in the second parameter
        word_features = fea_extractor.get_features(all_words, 5000)
        print "generating word_features done..."
        del all_words  # release some memory
        trainingfeaset = [(fea_extractor.construct_feaset(row[0], word_features), row[1])
                          for row in preptrainingdata]
        testfeaset = [(fea_extractor.construct_feaset(row[0], word_features), row[1])
                      for row in preptestdata]
        print "generating trainingfeaset and testfeaset done... great progress!"
        # random.shuffle(trainingfeaset)
        # random.shuffle(testfeaset)
        save_documents = open('./data/processed/word_features.pickle', 'w')
        pickle.dump(word_features, save_documents)
        save_documents.close()
        save_documents = open('./data/processed/trainingfeaset.pickle', 'w')
        pickle.dump(trainingfeaset, save_documents)
        save_documents.close()
        save_documents = open('./data/processed/testfeaset.pickle', 'w')
        pickle.dump(testfeaset, save_documents)
        save_documents.close()
        print "storing training and test featureset files done..."

    # Naive Bayes
    print "Naive Bayes start..."
    if os.path.isfile('./data/processed/NB_classifier.pickle'):
        NB_classifier_f = open("./data/processed/NB_classifier.pickle", "r")
        NB_classifier = pickle.load(NB_classifier_f)
        NB_classifier_f.close()
    else:
        start = time.time()
        NB_classifier = nltk.NaiveBayesClassifier.train(trainingfeaset)
        NB_trainingtime = time.time() - start
        print "Naive Bayes training time:", NB_trainingtime
        save_classifier = open("./data/processed/NB_classifier.pickle", "w")
        pickle.dump(NB_classifier, save_classifier)
        save_classifier.close()
    print "Naive Bayes Classifier accuracy percent:", (nltk.classify.accuracy(NB_classifier, testfeaset)) * 100
    print NB_classifier.show_most_informative_features(10)

    # Maximum Entropy
    print "Maximum Entropy start..."
    if os.path.isfile('./data/processed/MaxEntClassifier.pickle'):
        MaxEntClassifier_f = open('./data/processed/MaxEntClassifier.pickle', 'r')
        MaxEntClassifier = pickle.load(MaxEntClassifier_f)
        MaxEntClassifier_f.close()
    else:
        start = time.time()
        MaxEntClassifier = MaxentClassifier.train(trainingfeaset, algorithm='GIS', max_iter=10)
        MaxEnt_trainingtime = time.time() - start
        print "Maximum Entropy training time:", MaxEnt_trainingtime
        save_classifier = open("./data/processed/MaxEntClassifier.pickle", "w")
        pickle.dump(MaxEntClassifier, save_classifier)
        save_classifier.close()
    print "MaxEnt Classifier accuracy percent:", nltk.classify.accuracy(MaxEntClassifier, testfeaset)
    print MaxEntClassifier.show_most_informative_features(10)

    # SVM
    print "SVM start..."
    testset = fea_extractor.construct_svm_feaset(preptestdata, word_features)
    if os.path.isfile('./data/processed/svm_classifier.model'):
        svm_classifier = svm_load_model('./data/processed/svm_classifier.model')
    else:
        trainingset = fea_extractor.construct_svm_feaset(preptrainingdata, word_features)
        problem = svm_problem(trainingset['labels'], trainingset['feature_vectors'])
        param = svm_parameter('-q')
        param.kernel_type = LINEAR
        start = time.time()
        svm_classifier = svm_train(problem, param)
        svm_trainingtime = time.time() - start
        print "SVM training time:", svm_trainingtime
        svm_save_model('./data/processed/svm_classifier.model', svm_classifier)
    p_labels, p_accs, p_vals = svm_predict(testset['labels'], testset['feature_vectors'], svm_classifier)
    print p_labels
    print p_accs
def train_classifier(data):
    # train on the data passed in (the original referenced an undefined train_set)
    me_classifier = MaxentClassifier.train(data)
    return me_classifier
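# All of the snippets above hand MaxentClassifier.train a list of
# (feature_dict, label) pairs. A minimal self-contained sketch of that
# contract, with made-up toy data for illustration:
from nltk.classify import MaxentClassifier

train_set = [
    ({'last_letter': 'a', 'length': 4}, 'female'),
    ({'last_letter': 'k', 'length': 4}, 'male'),
    ({'last_letter': 'a', 'length': 6}, 'female'),
    ({'last_letter': 'o', 'length': 5}, 'male'),
]
classifier = MaxentClassifier.train(train_set, algorithm='IIS', max_iter=5, trace=0)
print(classifier.classify({'last_letter': 'a', 'length': 5}))                   # most likely label
print(classifier.prob_classify({'last_letter': 'a', 'length': 5}).prob('female'))  # label probability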