def stacking_cross_validate(raw_df, add_sentiment=True):
    """Cross-validate the stacked classifier and print its accuracies.

    The data is split with a shuffled 10-fold ``KFold`` (seed 90051). For
    each fold, three base models are run through ``predict``:

    1. ``LinearSVC`` with word n-grams ``[1]`` and POS features,
    2. ``SGDClassifier`` with word n-grams ``[2]`` and no POS features,
    3. ``SGDClassifier`` with word n-grams ``[1]`` and POS n-grams
       ``[1, 1000]``.

    The three predictions for each row are joined into one space-separated
    string, turned into TF-IDF features, and fed to a final ``LinearSVC``
    (``C=0.9``) whose accuracy on the held-out fold is accumulated.

    Args:
        raw_df: DataFrame holding the samples. The ``'ID'`` column is used
            as the target label; the remaining columns are consumed by
            ``predict`` (defined elsewhere in this file).
        add_sentiment: Forwarded to every ``predict`` call as
            ``addsentiment``. Previously this argument was ignored and
            ``True`` was always passed; the default keeps that behaviour.

    Returns:
        None. Per-fold accuracies and the mean accuracy are printed.
    """
    n_splits = 10
    folds = KFold(n_splits=n_splits, random_state=90051, shuffle=True)
    # Loop-invariant, so load the stop-word list once instead of per fold.
    stop_words = stopwords.words('english')

    score = 0
    for train_index, test_index in folds.split(raw_df):
        train_df = raw_df.iloc[train_index].reset_index(drop=True)
        test_df = raw_df.iloc[test_index].reset_index(drop=True)
        y_train, y_test = train_df['ID'], test_df['ID']

        # Fresh base estimators for every fold.
        sgd_model = SGDClassifier(loss='hinge', penalty="l2", max_iter=10000,
                                  n_jobs=-1, tol=1e-6)
        svm_model = svm.LinearSVC(C=0.68, max_iter=1000, tol=1e-6)

        # NOTE(review): `predict` is defined outside this block; it is
        # presumably fitting the model on train_df and returning
        # (train predictions, test predictions) -- confirm. The same
        # sgd_model instance is passed to the second and third calls.
        train_1, test_1 = predict(svm_model, train_df, test_df,
                                  wordngram=[1], pos=True, posngram=[1],
                                  addsentiment=add_sentiment, min_tf_idf=1)
        train_2, test_2 = predict(sgd_model, train_df, test_df,
                                  wordngram=[2], pos=False, posngram=[1],
                                  addsentiment=add_sentiment, min_tf_idf=1)
        train_3, test_3 = predict(sgd_model, train_df, test_df,
                                  wordngram=[1], pos=True, posngram=[1, 1000],
                                  addsentiment=add_sentiment, min_tf_idf=1)

        # Encode the three base predictions of each row as one "document".
        X_train = np.array([
            str(p1) + ' ' + str(p2) + ' ' + str(p3)
            for p1, p2, p3 in zip(train_1, train_2, train_3)
        ])
        X_test = np.array([
            str(p1) + ' ' + str(p2) + ' ' + str(p3)
            for p1, p2, p3 in zip(test_1, test_2, test_3)
        ])

        # NOTE(review): CountVectorizer's default token pattern only keeps
        # tokens of two or more word characters, and identical labels from
        # different base models map to the same token -- verify this is the
        # intended meta-feature encoding.
        vectorizer = CountVectorizer(max_df=0.57, stop_words=stop_words,
                                     decode_error='ignore')
        train_counts = vectorizer.fit_transform(X_train)
        test_counts = vectorizer.transform(X_test)

        # get tfidf
        transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
        X_train = transformer.fit_transform(train_counts)
        X_test = transformer.transform(test_counts)

        # Meta-classifier trained on the encoded base predictions.
        h_model = svm.LinearSVC(C=0.9, max_iter=1000)
        h_model.fit(X_train, y_train)
        predicted_labels = h_model.predict(X_test)
        acc = accuracy_score(predicted_labels, y_test)

        # Held-out accuracy of each base model, printed next to the stacked
        # accuracy (the values are accuracies despite the "error" label).
        sub_acc_1 = accuracy_score(test_1, y_test)
        sub_acc_2 = accuracy_score(test_2, y_test)
        sub_acc_3 = accuracy_score(test_3, y_test)
        print("####INFO test error: ", acc, sub_acc_1, sub_acc_2, sub_acc_3)

        # uncomment to print miss labeled data
        # for i in range(0, len(predicted_labels)):
        #     if predicted_labels[i] != y_test[i]:
        #         print("#" + str(i) + "; T: " + str(y_test[i]) + "; F: " + str(predicted_labels[i]) + "; Text: " + test_df.loc[i,'Text'])

        score += acc

    avg_acc = score / n_splits
    print("####INFO: trainning", 'Stacking', avg_acc)