def getGlobalFeatureVector(self):
    """Accumulate the local feature vectors along the chain of states.

    Walks backwards through ``prevState`` links, summing each state's
    ``localFV`` until a state with ``localFV`` of None (the initial
    state) is reached.

    Returns:
        FeatureVector: the sum of every local feature vector from this
        state back to the start of the chain.
    """
    globalFV = FeatureVector()
    curState = self
    # PEP 8: compare against None with identity, not equality.
    while curState.localFV is not None:
        globalFV.add(curState.localFV)
        curState = curState.prevState
    return globalFV
def getFeatureVector(self, features_list):
    """Convert feature strings to an indexed FeatureVector.

    Features absent from the indexer are silently skipped.

    :param features_list: list(string)
    :return: FeatureVector
    """
    fv = FeatureVector()
    # Hoist the mapping lookup out of the loop.
    index_of = self.featureIndexer.ObjectToIndex
    for feat in features_list:
        if feat in index_of:
            fv.add(index_of[feat])
    return fv
def main(): print ("SVM Approach") print ("Generating messages ...") feature_vector = FeatureVector(SMS_COLLECTION) feature_vector.data_process(sep='\t') messages = feature_vector.messages print "Splitting into train and cross-validation sets ..." msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.2) print len(msg_train), len(msg_test), len(msg_train) + len(msg_test) print msg_train.shape, msg_test.shape print "Creating Pipeline for the analyzing and training ..." pipeline = Pipeline([ ('bow', CountVectorizer(analyzer=split_into_lemmas)), # strings to token integer counts ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores ('classifier', SVC()), # train on TF-IDF vectors w/ Naive Bayes classifier ]) # pipeline parameters to automatically explore and tune param_svm = [ {'classifier__C': [1, 10, 100, 1000], 'classifier__kernel': ['linear']}, {'classifier__C': [1, 10, 100, 1000], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']}, ] print("pipeline:", [name for name, _ in pipeline.steps]) for name, v in pipeline.steps: print name print v grid_svm = GridSearchCV( pipeline, # pipeline from above param_grid=param_svm, # parameters to tune via cross validation refit=True, # fit using all data, on the best detected classifier n_jobs=-1, # number of cores to use for parallelization; -1 for "all cores" scoring='accuracy', # what score are we optimizing? cv=StratifiedKFold(label_train, n_folds=5), # what type of cross validation to use ) svm_detector = grid_svm.fit(msg_train, label_train) # find the best combination from param_svm print "\nScores for various cases ..." for i in xrange(len(svm_detector.grid_scores_)): print svm_detector.grid_scores_[i] curve = plot_learning_curve(pipeline, "accuracy vs. 
training set size", msg_train, label_train, cv=5) curve.savefig("./plots/acc-vs-trainSize_SVM.png") pipeline.fit(msg_train, label_train) #trained here print "Score in 20% of test dataset" test_predictions = svm_detector.predict(msg_test) print 'accuracy', accuracy_score(label_test, test_predictions) print 'confusion matrix\n', confusion_matrix(label_test, test_predictions) print '(row=expected, col=predicted)' print classification_report(label_test, test_predictions)
def Task(self, trainType, inputQueue, resultQueue):
    '''Worker-process loop for parallel training/evaluation.

    Messages read from ``inputQueue`` follow a small protocol:
      * ``None``                     -> terminate the worker.
      * ``[None, params]``           -> update the local model parameters.
      * ``[sentence, None]``         -> training decode; compute and return
                                        a gradient FeatureVector.
      * ``[key, sentence]``          -> evaluation decode; return
                                        ``(key, finalResult)``.
    Results are written to ``resultQueue``.
    '''
    while 1:
        inputOne = inputQueue.get()
        # terminal signal
        if inputOne is None:
            break
        # update model parameters
        if inputOne[0] is None:
            self.model.setParam(inputOne[1])
            continue
        # train decode
        if inputOne[1] is None:
            states = self.decodeBeamSearch(inputOne[0], trainType)
        # evaluate decode: report the result keyed by inputOne[0] and
        # skip the gradient computation entirely
        else:
            states = self.decodeBeamSearch(inputOne[1], "test")
            resultQueue.put((inputOne[0], states[1].getFinalResult()))
            continue
        gradient = FeatureVector()
        if trainType == 'MIRA':
            K = 0
            # number of candidates: count the leading non-None entries
            # after the gold state at states[0]
            for kk in xrange(1, len(states)):
                if states[kk] != None:
                    K += 1
                else:
                    break
            b = [0.0 for kk in xrange(K)]
            lam_dist = [0.0 for kk in xrange(K)]
            dist = [FeatureVector() for kk in xrange(K)]
            goldFV = states[0].getGlobalFeatureVector()
            for kk in xrange(K):
                # the score difference between the gold-standard tree
                # and the auto (candidate) tree
                lam_dist[kk] = (states[0].getScore()
                                - states[kk+1].getScore())
                # margin requirement: structural loss minus score margin
                b[kk] = self.loss(states[0], states[kk+1])
                b[kk] -= lam_dist[kk]
                # the FV difference (gold minus candidate)
                dist[kk] = FeatureVector.getDistVector(
                    goldFV, states[kk+1].getGlobalFeatureVector())
            # solve the MIRA QP for the alpha weights of each constraint
            alpha = QPSolver.hildreth(dist, b)
            for kk in xrange(K):
                gradient.add(dist[kk], alpha[kk])
        else:
            # plain perceptron update: only when the best candidate
            # is not the gold-standard analysis
            if not states[1].IsGold():
                gradient.add(states[0].getGlobalFeatureVector())
                gradient.subtract(states[1].getGlobalFeatureVector())
        resultQueue.put(gradient)
def main(): print("DecisionTree Approach") print("Generating messages ...") feature_vector = FeatureVector(SMS_COLLECTION) feature_vector.data_process(sep='\t') messages = feature_vector.messages print "Splitting into train and cross-validation sets ..." msg_train, msg_test, label_train, label_test = train_test_split( messages['message'], messages['label'], test_size=0.2) print len(msg_train), len(msg_test), len(msg_train) + len(msg_test) print msg_train.shape, msg_test.shape print "\nCreating Pipeline for the analyzing and training ..." dt_old = Pipeline([ ('bow', CountVectorizer( analyzer=split_into_lemmas)), # strings to token integer counts ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores ('classifier', DecisionTreeClassifier(min_samples_split=20, random_state=99) ), # train on TF-IDF vectors w/ DecisionTree classifier ]) print("pipeline:", [name for name, _ in dt_old.steps]) print("-- 10-fold cross-validation , without any grid search") dt_old.fit(msg_train, label_train) scores = cross_val_score(dt_old, msg_train, label_train, cv=10) print "mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std()) from sklearn.externals.six import StringIO import pydot dot_data = StringIO() classes = ["ham", "spam"] vocab = dt_old.named_steps['bow'].get_feature_names() vocab1 = [v.encode('ascii', 'ignore') for v in vocab] # print "vocab: ", vocab1 with open("./plots/heme.dot", "w") as f: export_graphviz(dt_old.named_steps['classifier'], out_file=f, max_depth=13, feature_names=vocab1) print("Creating a visualization of decision tree") # graph = pydot.graph_from_dot_data(dot_data.getvalue()) # graph.write_pdf("./plots/heme.pdf") print "\nScore in 20% of test dataset" test_predictions = dt_old.predict(msg_test) print 'accuracy', accuracy_score(label_test, test_predictions) print 'confusion matrix\n', confusion_matrix(label_test, test_predictions) print '(row=expected, col=predicted)' print classification_report(label_test, test_predictions)
def makeVector(w, lang=None):
    """Build a position-tagged feature vector for word *w*.

    Features are "<pos>-<piece>" strings over unigram characters, bigrams
    and trigrams of the word. When *lang* is given it becomes the label.
    """
    fv = FeatureVector()
    # Unigrams (the characters themselves), then bigrams, then trigrams.
    for pieces in (w, splitWord(w, 2), splitWord(w, 3)):
        for pos, piece in enumerate(pieces):
            fv.add(str(pos) + "-" + piece)
    if lang is not None:
        fv.setLabel(lang)
    return fv
def main(): feature_vector = FeatureVector(SMS_COLLECTION) feature_vector.data_process(sep='\t') messages = feature_vector.messages feature_vector.transformer() bow_transformer = feature_vector.bow_transformer messages_bow = feature_vector.messages_bow print "Describing the messages ..." print messages.groupby('label').describe() print 'sparse matrix shape:', messages_bow.shape print 'number of non-zeros:', messages_bow.nnz print 'sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1])) print "TF_IDF normalization ... " tfidf_transformer = TfidfTransformer().fit(messages_bow) messages_tfidf = tfidf_transformer.transform(messages_bow) print "transform all ham/spam ... ", messages_tfidf.shape spam_detector = MultinomialNB().fit(messages_tfidf, messages['label']) all_predictions = spam_detector.predict(messages_tfidf) print 'accuracy', accuracy_score(messages['label'], all_predictions) print 'confusion matrix\n', confusion_matrix(messages['label'], all_predictions) print '(row=expected, col=predicted)' print classification_report(messages['label'], all_predictions) test_bow_transformer(messages, bow_transformer, tfidf_transformer, spam_detector)
def main(): print("Naive-Bayes Approach") print "Generating messages ..." feature_vector = FeatureVector(SMS_COLLECTION) feature_vector.data_process(sep='\t') messages = feature_vector.messages print "Splitting into train and cross-validation sets ..." msg_train, msg_test, label_train, label_test = train_test_split( messages['message'], messages['label'], test_size=0.2) print len(msg_train), len(msg_test), len(msg_train) + len(msg_test) print msg_train.shape, msg_test.shape print "Creating Pipeline for the analyzing and training ..." pipeline = Pipeline([ ('bow', CountVectorizer( analyzer=split_into_lemmas)), # strings to token integer counts ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores ('classifier', MultinomialNB()), # train on TF-IDF vectors w/ Naive Bayes classifier ]) print(pipeline) curve = plot_learning_curve(pipeline, "accuracy vs. training set size", msg_train, label_train, cv=5) curve.savefig("./plots/acc-vs-trainSize_naive.png") pipeline.fit(msg_train, label_train) #trained here print "Score in 20% of test dataset" test_predictions = pipeline.predict(msg_test) print 'accuracy', accuracy_score(label_test, test_predictions) print 'confusion matrix\n', confusion_matrix(label_test, test_predictions) print '(row=expected, col=predicted)' print classification_report(label_test, test_predictions)
def train_sarsaLApprox_agent(n_iters, lam, record_history=False):
    """Train a Sarsa(lambda) agent with linear function approximation.

    Args:
        n_iters: number of training episodes to play.
        lam: eligibility-trace decay parameter (lambda).
        record_history: when True, log the weight vector after every step.

    Returns:
        The trained sarsaLApprox agent.
    """
    # Overlapping coarse-coded intervals over the hand values.
    # Agent features must be passed first since the agent hand is first
    # in the state representation.
    agent_features = [
        range(1, 7), range(4, 10), range(7, 13),
        range(10, 16), range(13, 19), range(16, 22),
    ]
    dealer_features = [range(1, 5), range(4, 8), range(7, 11)]
    agent_feature_vector = FeatureVector(agent_features, dealer_features)

    # Initialise the Sarsa agent with an undiscounted return (gamma=1).
    sarsa_approx_agent = sarsaLApprox(agent_feature_vector, lam,
                                      gamma=1, n0=10)

    for i in range(n_iters):
        # Fresh environment and eligibility traces for each episode.
        card_table = Environment()
        sarsa_approx_agent.init_etrace()
        sarsa_approx_agent.init_etrace_log()
        # The game ends when the terminal state is reached.
        # (Idiom fix: `while not flag` instead of `while flag == False`.)
        while not card_table.is_state_terminal:
            s = card_table.state
            # Agent takes an action and observes reward and next state.
            a = sarsa_approx_agent.choose_action(s)
            s_, r = card_table.step(a)
            sarsa_approx_agent.update_value_function(s, a, r, s_)
            if record_history:
                sarsa_approx_agent.log_weights()

    return sarsa_approx_agent
def trainForStructureLinearModel(self, trainSet, devSet, Iter, miniSize,
                                 numThreads, trainType,
                                 evaluateWhileTraining):
    '''Main training function.

    Shuffles the training sentences each iteration, farms batches out to
    worker processes (see ``Task``), averages the returned gradients, and
    optionally evaluates averaged parameters on the dev set after every
    iteration, keeping the best-scoring parameters.

    Args:
        trainSet: SentenceReader, train set in the form of SentenceReader
        devSet: SentenceReader, dev set in the form of SentenceReader
        Iter: count of iterations
        miniSize: int, num of train samples per thread  # currently unused on its own
        numThreads: int, num of worker processes
        trainType: str, MIRA or Standard or ...
        evaluateWhileTraining: bool, true for eval while training
    Returns:
        None
    Raise:
        None
    '''
    bestAccuracy = 0.0
    bestIter = 0
    bestParams = None
    # Materialise the whole training set into memory for shuffling.
    trainSet.reset()
    sentences = []
    while trainSet.hasNext():
        sentences.append(trainSet.next())
    # number of sentences for each batch
    num = miniSize * numThreads
    # number of batches for each iteration
    batchSize = int(math.ceil(1.0*len(sentences)/num))
    print('Iterate %d times, '
          'the batch size for each iteration is %d'%(Iter, batchSize))
    # build the multi-process worker pool
    resultQueue = multiprocessing.Queue()
    inputQueue = multiprocessing.Queue()
    workerPool = []
    for k in xrange(numThreads):
        worker = multiprocessing.Process(
            target=self.Task, args=(trainType, inputQueue, resultQueue))
        worker.daemon = True
        workerPool.append(worker)
    for worker in workerPool:
        worker.start()
    # training iterations
    for it in xrange(Iter):
        print "Iteration %d\n Batch:"%it
        startTime = time.time()
        random.shuffle(sentences)
        for i in xrange(batchSize):
            # send current model parameters (one message per worker)
            for worker in workerPool:
                inputQueue.put([None, self.model.getParam()])
            # send this batch's sentences
            start = num * i
            end = start + num
            end = min(end, len(sentences))
            if i%10 == 0:
                print i,
                sys.stdout.flush()
            for k in xrange(start, end):
                inputQueue.put([sentences[k], None])
            # collect the per-sentence gradients and average them
            gradient = FeatureVector()
            factor = 1.0/(end - start)
            for k in xrange(end - start):
                gradient.add(resultQueue.get(), factor)
            # weight for the averaged-perceptron update: number of
            # remaining updates plus one
            avg_upd = 1.0 * Iter * batchSize - (batchSize*(it-1)+(i+1)) + 1
            # avg_upd = 1.0
            self.model.perceptronUpdate(gradient, avg_upd)
        # batch iter end
        print '\nTrain Time: %f'%(time.time() - startTime)
        # evaluate with averaged parameters and remember the best
        if evaluateWhileTraining:
            startTime = time.time()
            averageParams = self.model.averageParam()
            # push averaged parameters to every worker
            for worker in workerPool:
                inputQueue.put([None, averageParams])
            # evaluate averageParams on the dev set
            accuracy = self.evaluate(devSet, numThreads, miniSize,
                                     inputQueue, resultQueue)
            print 'Dev Acc is %f'%accuracy
            if accuracy >= bestAccuracy:
                bestIter = it
                bestAccuracy = accuracy
                bestParams = averageParams
            print 'Eval time: %f'%(time.time() - startTime)
    # train iter end: send one termination signal per worker
    for k in xrange(numThreads):
        inputQueue.put(None)
    for worker in workerPool:
        worker.join()
    # restore the best parameters seen during evaluation
    if bestParams:
        self.model.setParam(bestParams)
    print 'The best iteration is %d'%bestIter
def parse(line):
    """Parse one ranking-format line: '<rel> qid:<id> f1:v1 ... # comment'.

    The trailing '#'-comment is stripped; the relevance label and query id
    are extracted and the remaining fields are passed through.
    """
    fields = line.split('#')[0].strip().split(' ')
    relevance = int(fields[0])
    query_id = int(fields[1].split(':')[1])
    return FeatureVector(relevance, query_id, fields[2:])
def hildreth(fv, loss):
    '''Iteratively solve for the alpha coefficient array (Hildreth's
    procedure for the MIRA quadratic program).

    Args:
        fv: list of FeatureVector, each the difference between the
            gold-standard vector and a candidate vector.
        loss: list of float, the score loss of each candidate against
            the best (gold) one.
    Return:
        alpha: the solved coefficient array (same length as loss).
    Raise:
        None
    '''
    LengthOfb = loss.__len__()
    alpha = [0.0] * LengthOfb
    F = [0.0] * LengthOfb
    kkt = [0.0] * LengthOfb
    # GramMatrix caches the inner products between the difference
    # vectors to reduce the time complexity.
    K = fv.__len__()
    GramMatrix = [[0] * K for i in range(K)]
    is_computed = [False] * K
    for i in range(K):
        GramMatrix[i][i] = FeatureVector().dotProduct(fv[i], fv[i])
    # find the largest entry of the loss array and its index
    max_kkt = float("-inf")
    max_kkt_i = -1
    for i in range(LengthOfb):
        F[i] = loss[i]
        kkt[i] = F[i]
        if kkt[i] > max_kkt:
            max_kkt = kkt[i]
            max_kkt_i = i
    circle = 0
    diff_alpha = 0.0
    try_alpha = 0.0
    add_alpha = 0.0
    while max_kkt >= QPSolver.EPS and circle < QPSolver.MAX_ITER:
        # Update the alpha of the entry with the largest KKT violation.
        # BUGFIX: guard the denominator BEFORE dividing — the original
        # divided first and only then checked `<= ZERO`, so an exactly
        # zero diagonal entry raised ZeroDivisionError.
        denom = GramMatrix[max_kkt_i][max_kkt_i]
        if denom <= QPSolver.ZERO:
            diff_alpha = 0.0
        else:
            diff_alpha = F[max_kkt_i] / denom
        # clip so alpha never goes negative
        try_alpha = alpha[max_kkt_i] + diff_alpha
        if try_alpha < 0.0:
            add_alpha = -1.0 * alpha[max_kkt_i]
        else:
            add_alpha = diff_alpha
        alpha[max_kkt_i] += add_alpha
        # lazily compute the column of inner products that is needed
        if not is_computed[max_kkt_i]:
            for i in range(K):
                GramMatrix[i][max_kkt_i] = FeatureVector().dotProduct(
                    fv[i], fv[max_kkt_i])
            is_computed[max_kkt_i] = True
        # propagate the update to every residual and KKT value
        for i in range(LengthOfb):
            F[i] -= add_alpha * GramMatrix[i][max_kkt_i]
            kkt[i] = F[i]
            if alpha[i] > QPSolver.ZERO:
                kkt[i] = abs(F[i])
        # each iteration processes the entry with the largest violation
        max_kkt = float("-inf")
        max_kkt_i = -1
        for i in range(LengthOfb):
            if kkt[i] > max_kkt:
                max_kkt = kkt[i]
                max_kkt_i = i
        circle += 1
    return alpha
def makeVector(w, lang):
    """Build a bigram feature vector for word *w* labelled with *lang*."""
    fv = FeatureVector()
    for bigram in splitWord(w, 2):
        fv.add(bigram)
    fv.setLabel(lang)
    return fv
return map(lambda lst: "".join(lst), zip(*l)) if __name__ == "__main__": # hack to allow -i and -p to be alone if "-i" in sys.argv: sys.argv.insert(sys.argv.index('-i')+1, "True") parser = argparse.ArgumentParser(description='Classify a language') parser.add_argument('-m', dest='modelfile', help="model file", default=None) parser.add_argument('-i', dest="interactive", default="False", help='display an interactive loop', type=bool) args = parser.parse_args() fv = FeatureVector() if args.modelfile is not None: m = load_model(args.modelfile) fv._featuremap.readLexicon("words.lex") # so FeatureMap will be populated else: m = parse() if args.interactive: print "Enter a word: (q to quit)" user = raw_input() while(user != 'q'): fv = makeVector(user) fv_feats = fv.getFeatDict() p_label, p_acc, p_val = predict([None], [fv_feats], m)