Example #1
 def getGlobalFeatureVector(self):
     globalFV = FeatureVector()
     curState = self
     while curState.localFV is not None:
         globalFV.add(curState.localFV)
         curState = curState.prevState
     return globalFV
 def getFeatureVector(self, features_list):
     """
     :param features_list: list(string)
     :return:FeatureVector
     """
     fv = FeatureVector()
     for feature in features_list:
         if feature in self.featureIndexer.ObjectToIndex:
             fv.add(self.featureIndexer.ObjectToIndex[feature])
     return fv
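Both methods assume a beam-search state that stores the features fired by its own transition (localFV) plus a link to the preceding state, and a featureIndexer mapping feature strings to integer indices. A minimal sketch of that assumed structure (the class below is illustrative, not taken from the original source):

class State(object):
    def __init__(self, localFV=None, prevState=None, featureIndexer=None):
        self.localFV = localFV          # features fired by this transition
        self.prevState = prevState      # preceding state in the derivation
        self.featureIndexer = featureIndexer
    # getGlobalFeatureVector / getFeatureVector would be defined here, as above

# The initial state has localFV = None, which terminates the backward walk:
root = State()
s1 = State(localFV=FeatureVector(), prevState=root)
s2 = State(localFV=FeatureVector(), prevState=s1)
# s2.getGlobalFeatureVector() sums the local vectors of s2 and s1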
Example #3
def main():
	print ("SVM Approach")
	print ("Generating messages ...")
	feature_vector = FeatureVector(SMS_COLLECTION)
	feature_vector.data_process(sep='\t')
	messages = feature_vector.messages

	print "Splitting into train and cross-validation sets ..."
	msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.2)
	print len(msg_train), len(msg_test), len(msg_train) + len(msg_test)
	print msg_train.shape, msg_test.shape

	print "Creating Pipeline for the analyzing and training ..."
	pipeline = Pipeline([
	    ('bow', CountVectorizer(analyzer=split_into_lemmas)),  # strings to token integer counts
	    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
	    ('classifier', SVC()),  # train on TF-IDF vectors w/ SVM classifier
	])

	# pipeline parameters to automatically explore and tune
	param_svm = [
	  {'classifier__C': [1, 10, 100, 1000], 'classifier__kernel': ['linear']},
	  {'classifier__C': [1, 10, 100, 1000], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']},
	]
	print("pipeline:", [name for name, _ in pipeline.steps])
	for name, v in pipeline.steps:
		print name
		print v

	grid_svm = GridSearchCV(
	    pipeline,  	# pipeline from above
	    param_grid=param_svm,  # parameters to tune via cross validation
	    refit=True,  # fit using all data, on the best detected classifier
	    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
	    scoring='accuracy',  # what score are we optimizing?
	    cv=StratifiedKFold(label_train, n_folds=5),  # what type of cross validation to use
	)
	svm_detector = grid_svm.fit(msg_train, label_train) # find the best combination from param_svm

	print "\nScores for various cases ..."
	for i in xrange(len(svm_detector.grid_scores_)):
		print svm_detector.grid_scores_[i]


	curve = plot_learning_curve(pipeline, "accuracy vs. training set size", msg_train, label_train, cv=5)
	curve.savefig("./plots/acc-vs-trainSize_SVM.png")
	pipeline.fit(msg_train, label_train)  #trained here

	print "Score in 20% of test dataset"
	test_predictions = svm_detector.predict(msg_test)
	print 'accuracy', accuracy_score(label_test, test_predictions)
	print 'confusion matrix\n', confusion_matrix(label_test, test_predictions)
	print '(row=expected, col=predicted)'
	print classification_report(label_test, test_predictions)
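split_into_lemmas is passed to CountVectorizer as the analyzer but is never defined in this snippet. A plausible sketch using TextBlob (an assumption; the original author's definition is not shown):

from textblob import TextBlob

def split_into_lemmas(message):
    # Lowercase, tokenize, and map each token to its lemma so inflected
    # forms ("messages", "message") share one feature.
    return [word.lemma for word in TextBlob(message.lower()).words]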
Example #4
    def Task(self, trainType, inputQueue, resultQueue):
        ''' Worker loop for parallelised training and decoding.
        '''
        while 1:
            inputOne = inputQueue.get()
            # terminal signal
            if inputOne is None:
                break
            # update model parameters
            if inputOne[0] is None:
                self.model.setParam(inputOne[1])
                continue
            # train decode
            if inputOne[1] is None:
                states = self.decodeBeamSearch(inputOne[0], trainType)
            # evaluate decode
            else:
                states = self.decodeBeamSearch(inputOne[1], "test")
                resultQueue.put((inputOne[0], states[1].getFinalResult()))
                continue
            gradient = FeatureVector()
            if trainType == 'MIRA':
                K = 0 # number of candidates
                for kk in xrange(1, len(states)):
                    if states[kk] is not None:
                        K += 1
                    else:
                        break
                b = [0.0 for kk in xrange(K)]
                lam_dist = [0.0 for kk in xrange(K)]
                dist = [FeatureVector() for kk in xrange(K)]

                goldFV = states[0].getGlobalFeatureVector()
                for kk in xrange(K):
                    # the score difference between 
                    # gold-standard tree and auto tree
                    lam_dist[kk] = (states[0].getScore()
                                - states[kk+1].getScore())
                    b[kk] = self.loss(states[0], states[kk+1])
                    b[kk] -= lam_dist[kk]
                    # the FV difference
                    dist[kk] = FeatureVector.getDistVector(goldFV,
                                states[kk+1].getGlobalFeatureVector())

                alpha = QPSolver.hildreth(dist, b)
                for kk in xrange(K):
                    gradient.add(dist[kk], alpha[kk])
            else:
                if not states[1].IsGold():
                    gradient.add(states[0].getGlobalFeatureVector())
                    gradient.subtract(states[1].getGlobalFeatureVector())
            resultQueue.put(gradient)
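    # The worker above distinguishes message types by which slot of the
    # two-element list is None; the producer side (see the training loop in
    # Example #10) is assumed to send:
    #     inputQueue.put([None, model.getParam()])  # broadcast fresh parameters
    #     inputQueue.put([sentence, None])          # train: return a gradient
    #     inputQueue.put([idx, sentence])           # evaluate: return (idx, result)
    #     inputQueue.put(None)                      # terminate the worker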
def main():
    print("DecisionTree Approach")
    print("Generating messages ...")
    feature_vector = FeatureVector(SMS_COLLECTION)
    feature_vector.data_process(sep='\t')
    messages = feature_vector.messages

    print "Splitting into train and cross-validation sets ..."
    msg_train, msg_test, label_train, label_test = train_test_split(
        messages['message'], messages['label'], test_size=0.2)
    print len(msg_train), len(msg_test), len(msg_train) + len(msg_test)
    print msg_train.shape, msg_test.shape

    print "\nCreating Pipeline for the analyzing and training ..."
    dt_old = Pipeline([
        ('bow', CountVectorizer(
            analyzer=split_into_lemmas)),  # strings to token integer counts
        ('tfidf',
         TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        ('classifier',
         DecisionTreeClassifier(min_samples_split=20, random_state=99)
         ),  # train on TF-IDF vectors w/ DecisionTree classifier
    ])
    print("pipeline:", [name for name, _ in dt_old.steps])
    print("-- 10-fold cross-validation , without any grid search")
    dt_old.fit(msg_train, label_train)
    scores = cross_val_score(dt_old, msg_train, label_train, cv=10)
    print "mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std())

    from sklearn.externals.six import StringIO
    import pydot

    dot_data = StringIO()
    classes = ["ham", "spam"]
    vocab = dt_old.named_steps['bow'].get_feature_names()
    vocab1 = [v.encode('ascii', 'ignore') for v in vocab]
    # print "vocab: ", vocab1
    with open("./plots/heme.dot", "w") as f:
        export_graphviz(dt_old.named_steps['classifier'],
                        out_file=f,
                        max_depth=13,
                        feature_names=vocab1)
    print("Creating a visualization of decision tree")
    # graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # graph.write_pdf("./plots/heme.pdf")

    print "\nScore in 20% of test dataset"
    test_predictions = dt_old.predict(msg_test)
    print 'accuracy', accuracy_score(label_test, test_predictions)
    print 'confusion matrix\n', confusion_matrix(label_test, test_predictions)
    print '(row=expected, col=predicted)'
    print classification_report(label_test, test_predictions)
def makeVector(w, lang=None):
    fv = FeatureVector()
    
    for ind, c in enumerate(w):
        fv.add(str(ind) + "-" + c)
    for ind, f in enumerate(splitWord(w,2)):
        fv.add(str(ind) + "-" + f)
    for ind, f in enumerate(splitWord(w, 3)):
        fv.add(str(ind) + "-" + f)
    if lang is not None:
        fv.setLabel(lang)
        
    return fv
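# For a concrete picture of the features produced (assuming splitWord(w, n)
# returns the character n-grams of w), a hypothetical call
#     makeVector("ciao", lang="it")
# adds the positional features
#     "0-c", "1-i", "2-a", "3-o"    (characters)
#     "0-ci", "1-ia", "2-ao"        (bigrams)
#     "0-cia", "1-iao"              (trigrams)
# and labels the vector "it".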
def main():
	feature_vector = FeatureVector(SMS_COLLECTION)
	feature_vector.data_process(sep='\t')
	messages = feature_vector.messages
	feature_vector.transformer()
	bow_transformer = feature_vector.bow_transformer
	messages_bow = feature_vector.messages_bow

	print "Describing the messages ..."
	print messages.groupby('label').describe()
	print 'sparse matrix shape:', messages_bow.shape
	print 'number of non-zeros:', messages_bow.nnz
	print 'sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))


	print "TF_IDF normalization ... "
	tfidf_transformer = TfidfTransformer().fit(messages_bow)
	messages_tfidf = tfidf_transformer.transform(messages_bow)
	print "transform all ham/spam ... ", messages_tfidf.shape
	spam_detector = MultinomialNB().fit(messages_tfidf, messages['label'])
	all_predictions = spam_detector.predict(messages_tfidf)
	print 'accuracy', accuracy_score(messages['label'], all_predictions)
	print 'confusion matrix\n', confusion_matrix(messages['label'], all_predictions)
	print '(row=expected, col=predicted)'
	print classification_report(messages['label'], all_predictions)

	test_bow_transformer(messages, bow_transformer, tfidf_transformer, spam_detector)
Example #8
def main():
    print("Naive-Bayes Approach")
    print "Generating messages ..."
    feature_vector = FeatureVector(SMS_COLLECTION)
    feature_vector.data_process(sep='\t')
    messages = feature_vector.messages

    print "Splitting into train and cross-validation sets ..."
    msg_train, msg_test, label_train, label_test = train_test_split(
        messages['message'], messages['label'], test_size=0.2)
    print len(msg_train), len(msg_test), len(msg_train) + len(msg_test)
    print msg_train.shape, msg_test.shape

    print "Creating Pipeline for the analyzing and training ..."
    pipeline = Pipeline([
        ('bow', CountVectorizer(
            analyzer=split_into_lemmas)),  # strings to token integer counts
        ('tfidf',
         TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        ('classifier',
         MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
    ])
    print(pipeline)
    curve = plot_learning_curve(pipeline,
                                "accuracy vs. training set size",
                                msg_train,
                                label_train,
                                cv=5)
    curve.savefig("./plots/acc-vs-trainSize_naive.png")
    pipeline.fit(msg_train, label_train)  #trained here

    print "Score in 20% of test dataset"
    test_predictions = pipeline.predict(msg_test)
    print 'accuracy', accuracy_score(label_test, test_predictions)
    print 'confusion matrix\n', confusion_matrix(label_test, test_predictions)
    print '(row=expected, col=predicted)'
    print classification_report(label_test, test_predictions)
Example #9
def train_sarsaLApprox_agent(n_iters, lam, record_history=False):

    # Create feature vector
    agent_features = [
        range(1, 7),
        range(4, 10),
        range(7, 13),
        range(10, 16),
        range(13, 19),
        range(16, 22)
    ]
    dealer_features = [range(1, 5), range(4, 8), range(7, 11)]

    # Must pass agent features first since agent hand is first in state
    agent_feature_vector = FeatureVector(agent_features, dealer_features)

    # initialise sarsa agent
    sarsa_approx_agent = sarsaLApprox(agent_feature_vector,
                                      lam,
                                      gamma=1,
                                      n0=10)

    # Train agent
    for i in range(n_iters):
        # initialise the environment
        card_table = Environment()

        sarsa_approx_agent.init_etrace()
        sarsa_approx_agent.init_etrace_log()

        # game ends when terminal state is reached
        while not card_table.is_state_terminal:
            s = card_table.state

            # agent takes action, gets reward
            a = sarsa_approx_agent.choose_action(s)

            s_, r = card_table.step(a)

            sarsa_approx_agent.update_value_function(s, a, r, s_)

        if record_history:
            sarsa_approx_agent.log_weights()

    # Return the trained agent
    return sarsa_approx_agent
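A hypothetical invocation of the trainer above (the parameter values are illustrative):

# Train for 10,000 episodes with eligibility-trace decay lambda = 0.5,
# recording the weight history for later inspection.
agent = train_sarsaLApprox_agent(n_iters=10000, lam=0.5, record_history=True)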
Example #10
 def trainForStructureLinearModel(self, trainSet, devSet, Iter, miniSize,
     numThreads, trainType, evaluateWhileTraining):
     ''' Main training function
     Args:
         trainSet: SentenceReader, train set in the form of a SentenceReader
         devSet: SentenceReader, dev set in the form of a SentenceReader
         Iter: int, number of training iterations
         miniSize: int, number of train samples per thread (unused)
         numThreads: int, number of threads
         trainType: str, MIRA or Standard or ...
         evaluateWhileTraining: bool, true to evaluate while training
     Returns:
         None
     Raise:
         None
     '''
     bestAccuracy = 0.0
     bestIter = 0
     bestParams = None
     
     trainSet.reset()
     sentences = []
     while trainSet.hasNext():
         sentences.append(trainSet.next())
     # number of sentences for each batch
     num = miniSize * numThreads
     # number of batch for each iteration
     batchSize = int(math.ceil(1.0*len(sentences)/num))
     print('Iterate %d times, '
           'the number of batches per iteration is %d' % (Iter, batchSize))
     # build multi-process pool
     resultQueue = multiprocessing.Queue()
     inputQueue = multiprocessing.Queue()
     workerPool = []
     for k in xrange(numThreads):
         worker = multiprocessing.Process(target=self.Task,
                             args=(trainType, inputQueue, resultQueue))
         worker.daemon = True
         workerPool.append(worker)
     for worker in workerPool:
         worker.start()
     # train iteration
     for it in xrange(Iter):
         print "Iteration %d\n Batch:"%it
         startTime = time.time()
         random.shuffle(sentences)
         for i in xrange(batchSize):
             # send model parameters
             for worker in workerPool:
                 inputQueue.put([None, self.model.getParam()])
             # send sentences
             start = num * i
             end = start + num
             end = min(end, len(sentences))
             if i%10 == 0:
                 print i,
                 sys.stdout.flush()
             for k in xrange(start, end):
                 inputQueue.put([sentences[k], None])
             # calculate gradient
             gradient = FeatureVector()
             factor = 1.0/(end - start)
             # parse result
             for k in xrange(end - start):
                 gradient.add(resultQueue.get(), factor)
             # weight for the averaged-perceptron update: remaining update count
             avg_upd = 1.0 * Iter * batchSize - (batchSize*it + (i+1)) + 1
             # avg_upd = 1.0
             self.model.perceptronUpdate(gradient, avg_upd)
         # batch iter end
         print '\nTrain Time: %f'%(time.time() - startTime)
         # evaluate and update model parameters
         if evaluateWhileTraining:
             startTime = time.time()
             averageParams = self.model.averageParam()
             # update model parameters
             for worker in workerPool:
                 inputQueue.put([None, averageParams])
             # evaluate averageParams
             accuracy = self.evaluate(devSet, numThreads, miniSize, inputQueue, resultQueue)
             print 'Dev Acc is %f'%accuracy
             if accuracy >= bestAccuracy:
                 bestIter = it
                 bestAccuracy = accuracy
                 bestParams = averageParams
             print 'Eval time: %f'%(time.time() - startTime)
     # train iter end
     for k in xrange(numThreads):
         inputQueue.put(None)
     for worker in workerPool:
         worker.join()
     if bestParams:
         self.model.setParam(bestParams)
     print 'The best iteration is %d'%bestIter
def parse(line):
    parts = line.split('#')[0].strip().split(' ')
    rel = int(parts[0])
    qid = int(parts[1].split(':')[1])
    return FeatureVector(rel, qid, parts[2:])
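parse expects one line of an SVMlight/LETOR-style ranking file: a relevance label, a qid: field, then raw index:value feature tokens, with anything after # discarded as a comment. A hypothetical line and the resulting call:

line = "2 qid:13 1:0.53 2:0.12 3:0.98 # doc-42"
fv = parse(line)  # FeatureVector(2, 13, ['1:0.53', '2:0.12', '3:0.98'])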
Example #12
 def hildreth(fv, loss):
     '''Iteratively solve for the array of alpha coefficients.
     Args:
         fv: array of feature vectors, each the difference between two vectors
         loss: array of loss scores: for each feature vector in a dataset, the
             scoring loss against the best feature vector
     Return:
         alpha: an array of coefficients
     Raise:
         None
     '''
     LengthOfb = len(loss)
     alpha = [0.0] * LengthOfb
     F = [0.0] * LengthOfb
     kkt = [0.0] * LengthOfb
     # GramMatrix caches the inner products of the feature vectors to reduce time complexity
     K = len(fv)
     GramMatrix = [[0] * K for i in range(K)]
     is_computed = [False] * K
     for i in range(K):
         GramMatrix[i][i] = FeatureVector().dotProduct(fv[i], fv[i])
     # find the largest entry in the loss array and its index
     max_kkt = float("-inf")
     max_kkt_i = -1
     for i in range(LengthOfb):
         F[i] = loss[i]
         kkt[i] = F[i]
         if kkt[i] > max_kkt:
             max_kkt = kkt[i]
             max_kkt_i = i
     circle = 0
     diff_alpha = 0.0
     try_alpha = 0.0
     add_alpha = 0.0
     while max_kkt >= QPSolver.EPS and circle < QPSolver.MAX_ITER:
         # update the alpha value of the largest-loss entry
         diff_alpha = F[max_kkt_i] / GramMatrix[max_kkt_i][max_kkt_i]
         if GramMatrix[max_kkt_i][max_kkt_i] <= QPSolver.ZERO:
             diff_alpha = 0.0
         try_alpha = alpha[max_kkt_i] + diff_alpha
         if try_alpha < 0.0:
             add_alpha = -1.0 * alpha[max_kkt_i]
         else:
             add_alpha = diff_alpha
         alpha[max_kkt_i] += add_alpha
         # precompute the inner products that will be needed
         if not is_computed[max_kkt_i]:
             for i in range(K):
                 GramMatrix[i][max_kkt_i] = FeatureVector().dotProduct(
                     fv[i], fv[max_kkt_i])
             is_computed[max_kkt_i] = True
         for i in range(LengthOfb):
             F[i] -= add_alpha * GramMatrix[i][max_kkt_i]
             kkt[i] = F[i]
             if alpha[i] > QPSolver.ZERO:
                 kkt[i] = abs(F[i])
         # each iteration processes the largest-loss entry
         max_kkt = float("-inf")
         max_kkt_i = -1
         for i in range(LengthOfb):
             if kkt[i] > max_kkt:
                 max_kkt = kkt[i]
                 max_kkt_i = i
         circle += 1
     return alpha
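# In the MIRA update (Example #4), hildreth receives the feature-difference
# vectors and the per-candidate margin losses, and the returned alphas weight
# each difference vector in the gradient:
#     alpha = QPSolver.hildreth(dist, b)
#     for kk in xrange(len(alpha)):
#         gradient.add(dist[kk], alpha[kk])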
def makeVector(w, lang=None):
    fv = FeatureVector()
    for f in splitWord(w, 2):
        fv.add(f)
    if lang is not None:
        fv.setLabel(lang)
    return fv

    
if __name__ == "__main__":

    parser = argparse.ArgumentParser(description='Classify a language')
    parser.add_argument('-m', dest='modelfile', help="model file", default=None)
    # store_true lets -i stand alone (no argv hack needed); argparse's
    # type=bool would treat any non-empty string, even "False", as True
    parser.add_argument('-i', dest="interactive", action='store_true',
                        help='display an interactive loop')
    
    args = parser.parse_args()

    fv = FeatureVector()

    if args.modelfile is not None:
        m = load_model(args.modelfile)
        fv._featuremap.readLexicon("words.lex") # so FeatureMap will be populated
    else:
        m = parse()

    if args.interactive:
        print "Enter a word: (q to quit)"
        user = raw_input()
        while user != 'q':
            fv = makeVector(user)
            fv_feats = fv.getFeatDict()

            p_label, p_acc, p_val = predict([None], [fv_feats], m)