def predict_third_set(gram_train, gram_test, y_label, scale=20000, max_iter=1, lambd=0.00001):
    """Predict labels by combining a KRL model and an SVM on summed Gram matrices.

    The first three entries of ``gram_train`` and of ``gram_test`` are added
    together to form a single train kernel and a single test kernel.  A KRL
    model is fitted on the train kernel divided by ``scale`` and an SVM on
    the unscaled train kernel; the returned value is the element-wise sign
    of the sum of both models' predictions.

    Args:
        gram_train: Indexable holding at least three addable train Gram
            matrices (only indices 0, 1 and 2 are read).
        gram_test: Indexable holding the matching three test Gram matrices.
        y_label: Training labels; converted with ``np.array`` before fitting.
        scale: Divisor applied to both kernels for the KRL model only.
        max_iter: Forwarded to ``KRL`` as ``max_iter``.
        lambd: Forwarded to ``KRL`` as ``lambd``.

    Returns:
        ``np.sign`` of the summed SVM and KRL predictions.

    NOTE(review): ``np.sign`` yields 0 wherever the two outputs sum to
    exactly zero -- confirm callers can handle a 0 label.
    """

    def _sum_first_three(kernels):
        # Only the first three kernels take part in the combination.
        return kernels[0] + kernels[1] + kernels[2]

    train_kernel = _sum_first_three(gram_train)
    test_kernel = _sum_first_three(gram_test)

    # KRL works on the rescaled kernels.
    ridge_model = KRL(gram_m=train_kernel / scale, max_iter=max_iter, lambd=lambd)
    ridge_model.fit(np.array(y_label))
    krl_output = ridge_model.predict(test_kernel / scale)

    # The SVM works on the raw (unscaled) kernels.
    svm_model = SVM(gram_m=train_kernel)
    svm_model.fit(np.array(y_label))
    svm_output = svm_model.predict(test_kernel)

    return np.sign(svm_output + krl_output)
def classify(
    messages_train,
    labels_train,
    messages_test,
    process_messages_train,
    process_messages_test,
    tokens_train,
    tokens_test,
    process_tokens_train,
    process_tokens_test,
    pos_tags_train,
    pos_tags_test,
    negationList,
    clusters,
    slangDictionary,
    lexicons,
    mpqa_lexicons,
):
    """Train a linear SVM on the training messages and label the test messages.

    Labels are binarised (``"negative"`` -> 0, anything else -> 1), POS-tag
    n-gram statistics are computed from the training data, features are
    extracted and regularized for both sets, and a linear SVM trained on the
    training features predicts the test set.

    All POS-tag / bigram / trigram score tables are built from the training
    data only and reused for the test set, because the test labels are not
    known.

    Returns:
        Whatever ``SVM.predict`` returns for the test features.
    """
    # 0 - negative messages, 1 - positive messages
    binary_labels = [1 if label != "negative" else 0 for label in labels_train]

    # POS-tag bigrams and trigrams for every message.
    bigrams_train = getBigrams(pos_tags_train)
    bigrams_test = getBigrams(pos_tags_test)
    trigrams_train = getTrigrams(pos_tags_train)
    trigrams_test = getTrigrams(pos_tags_test)

    # Unique POS tags, bigrams and trigrams seen in the training set.
    tag_vocab = getPosTagsSet(pos_tags_train)
    bigram_vocab = getBigramsSet(bigrams_train)
    trigram_vocab = getTrigramsSet(trigrams_train)

    # Per-class score tables, computed for class 0 (negative) first and
    # then class 1 (positive).
    tag_neg, tag_pos = [
        posTagsScore(tag_vocab, category, pos_tags_train, binary_labels)
        for category in (0, 1)
    ]
    bigram_neg, bigram_pos = [
        posBigramsScore(bigram_vocab, category, bigrams_train, binary_labels)
        for category in (0, 1)
    ]
    trigram_neg, trigram_pos = [
        posTrigramsScore(trigram_vocab, category, trigrams_train, binary_labels)
        for category in (0, 1)
    ]

    # Scores for the words of the MPQA lexicons (the original comment
    # describes them as precision and F1 scores).
    mpqa_scores = getScores(mpqa_lexicons, process_messages_train, binary_labels)

    def build_features(messages, processed, tokens, processed_tokens, pos_tags, bigrams, trigrams):
        # Extract the features for one data set and regularize them; the
        # training-derived score tables are shared by both data sets.
        raw = features.getFeatures(
            messages,
            processed,
            tokens,
            processed_tokens,
            pos_tags,
            slangDictionary,
            lexicons,
            mpqa_lexicons,
            bigrams,
            trigrams,
            bigram_neg,
            bigram_pos,
            trigram_neg,
            trigram_pos,
            tag_neg,
            tag_pos,
            mpqa_scores,
            negationList,
            clusters,
        )
        return regularization.regularize(raw)

    train_features = build_features(
        messages_train,
        process_messages_train,
        tokens_train,
        process_tokens_train,
        pos_tags_train,
        bigrams_train,
        trigrams_train,
    )
    test_features = build_features(
        messages_test,
        process_messages_test,
        tokens_test,
        process_tokens_test,
        pos_tags_test,
        bigrams_test,
        trigrams_test,
    )

    # Alternatives left disabled in the original code: feature selection
    # via selection.feature_selection(..., 1150), C = 19.3392161013, and a
    # LogisticRegression train/predict pair instead of the SVM.

    # C parameter of the SVM (equal to 2 ** -9).
    svm_c = 0.001953125

    # Train the classifier, then predict the test labels.
    trained_model = SVM.train(train_features, binary_labels, c=svm_c, k="linear")
    return SVM.predict(test_features, trained_model)
# Write the submission for the predictions computed just before this section.
make_submission_data(y_preds, 'nb_1214.csv')

# run Perceptron -----------------------
perceptron = Perceptron(r=0.1, margin=0.01, n_epoch=20)
perceptron.fit(data_train)
print("Predict the TEST set")
# The return value is discarded here; presumably predict() reports its own
# metrics -- confirm.  weights[-1] is the last stored weight vector.
perceptron.predict(data_test, perceptron.weights[-1])
print("Predict the EVAL set")
y_preds = perceptron.predict(data_eval, perceptron.weights[-1])['y_preds']
make_submission_data(y_preds, 'perceptron.csv')

# run SVM -----------------------------------
svm = SVM(r=0.01, c=1, n_epoch=17)
svm.fit(data_train)
print("Predict the TEST set")
svm.predict(data_test, svm.weights[-1])
print("Predict the EVAL set")
y_preds = svm.predict(data_eval, svm.weights[-1])['y_preds']
make_submission_data(y_preds, 'svm.csv')

# run Logistic -----------------------------
logistic = Logistic(r=0.01, sigma=100, n_epoch=10)
logistic.fit(data_train)
print("Predict the TEST set")
logistic.predict(data_test, logistic.weights[-1])
# Fixed the "Preidict" typo so this message matches the other sections.
print("Predict the EVAL set")
y_preds = logistic.predict(data_eval, logistic.weights[-1])['y_preds']
make_submission_data(y_preds, 'logistic.csv')

## run Bagging ------------------------------
# Cross validation