def clf_one_one(path, events, numFold):
    """Run one-vs-one event classification fold by fold and print the results.

    For each fold index the test split comes from ``testing_data``. A
    LogisticRegression model is then built for every unordered pair of
    labels (the given events plus an extra ``'none'`` label), using the
    training file ``<numFold>Folds_<a>_<b>_training_<fold>.csv`` read via
    ``load_file(path, ...)``. ``clf_event_one_one`` supplies the predictions
    of each pair model for the test sentences; the per-pair predictions are
    concatenated element-wise as strings and handed to ``find_max_label``.
    The rows of ``confusion_matrix(pred_label, label_test)`` are printed
    tab-separated.

    Args:
        path: data location passed to ``testing_data`` and ``load_file``.
        events: sequence of event names. It is copied, so the caller's list
            is no longer extended with ``'none'`` on each call.
        numFold: number of folds to iterate over; also part of the training
            file names.

    Returns:
        None. All output goes to stdout.
    """
    # Work on a private copy: appending to the argument in place would add
    # one more 'none' class every time the function is called again.
    labels = list(events) + ['none']

    for fold in range(numFold):
        label_test, sent_test = testing_data(path, labels, fold, numFold)
        print('%s %s' % (len(label_test), len(sent_test)))

        pair_preds = []
        for j in range(len(labels)):
            for k in range(j + 1, len(labels)):
                pair = labels[j] + '_' + labels[k]
                # Use the function's own arguments here; the previous code
                # read the module globals path_ / events_, which only worked
                # when they happened to match the arguments.
                train_name = (str(numFold) + 'Folds_' + pair
                              + '_training_' + str(fold) + '.csv')
                sent_train = load_file(path, train_name)
                print('%s %s' % ('Running event: ',
                                 pair + ':Fold_index_' + str(fold)))

                list_all = load_event_x_y(pair, sent_train, command='')
                X, Y = np.array(list_all[0]), np.array(list_all[1])

                # Alternatives tried previously: MultinomialNB(),
                # LinearSVC(C=1.0, random_state=0, class_weight='auto', max_iter=100000)
                # NOTE(review): newer scikit-learn releases replace
                # class_weight='auto' with 'balanced' - confirm against the
                # installed version before changing it.
                clf = LogisticRegression(max_iter=50000, solver='liblinear',
                                         tol=0.000001, class_weight='auto')
                pair_preds.append(
                    clf_event_one_one(clf, X, Y, sent_test, j, k))

        # Concatenate the per-pair predictions position by position. With a
        # single pair the predictions are passed through unchanged; with no
        # pair at all the result is an empty list.
        joined = pair_preds[0] if pair_preds else []
        for preds in pair_preds[1:]:
            joined = [str(x) + str(y) for x, y in zip(joined, preds)]

        pred_label = find_max_label(joined, labels)
        # NOTE(review): if confusion_matrix is scikit-learn's, its signature
        # is (y_true, y_pred), so this call would print the transposed
        # matrix - confirm which function is in scope.
        for row in confusion_matrix(pred_label, label_test):
            print('\t'.join(str(value) for value in row).strip())
list_write.append(line) write_file(path, 'twitter_event_' + event, list_write) if __name__ == '__main__': path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_event/detectAllEvents/allTweets_ver3' events = ['wait', 'missing', 'skip', 'slow', 'accident', 'crowd'] path_load = 'C:/Users/vdthoang/Google Drive/LARC - NEC Project/icwsm2016/data' # name_load = 'twitter.csv' # X_ = load_data(path_load, name_load, 'preprocessText') # X_id = load_data_ID(path_load, name_load) X_id, X_ = load_data_ubicomp() # print len(X_) for event in events: list_sentences = load_file(path, event + '.csv') print 'Running event: ', event list_all = load_event_x_y(event, list_sentences, command='preprocessText') X_train, Y_train = np.array(list_all[0]), np.array(list_all[1]) # clf = LinearSVC(C=1.0, random_state=0, class_weight='auto', max_iter=100000) clf = LogisticRegression(max_iter=50000, solver='liblinear', tol=0.000001, class_weight='auto') X_pred = clf_event_predicted(X_, X_train, Y_train, clf) writing_pred(path_load, event, X_id, X_, X_pred)
clf.fit(X_convert_trans, Y) # training model y_pred = clf.predict(X_pred_trans) y_prob = clf.decision_function(X_pred_trans) max_prob, min_prob = max(y_prob), min(y_prob) list_write = list() for i in range(0, len(y_pred)): prob = (y_prob[i] - min_prob) / (max_prob - min_prob) print y_pred[i], prob, texts[i] # list_write.append(str(y_pred[i]) + '\t' + texts[i]) list_write.append(str(y_pred[i])) if command == 'twitter': path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/twitter/events_pred' write_file(path_write, event, list_write)
# NOTE(review): the line above is the tail of a definition whose header lies
# outside this excerpt. Its nesting cannot be recovered from here, so it is
# reproduced untouched.


if __name__ == '__main__':
    # ------------------------------------------------------------------
    # TWITTER
    # ------------------------------------------------------------------
    # Location of the per-event training files ('<event>.csv').
    path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_event/detectAllEvents/allTweets_ver3'
    events = ['wait', 'missing', 'skip', 'slow', 'accident', 'crowd']

    for event in events:
        list_sentences = load_file(path, event + '.csv')
        print('%s %s' % ('Running event: ', event))

        list_all = load_event_x_y(event, list_sentences, '')
        # X and clf are not passed to event_pred_model below. They stay
        # module-level names in case code outside this excerpt reads them.
        X = np.array(list_all[0])
        Y = np.array(list_all[1])
        clf = LinearSVC(C=1.0, random_state=0, class_weight='auto',
                        max_iter=100000)

        event_pred_model(event, list_all[0], Y, command='twitter')
# Alternative data sets, kept for reference.
#
# Facebook
# path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_event/detectAllEvents_ver2'
# events_ = ['complaint', 'compliment', 'skip', 'suggestion', 'wait']
# events_all(path_, events_)  # running classification for event
#
# Sgforums
# path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_classification_events/detectAllEvents_ver2'
# events_ = ['bunch', 'crowd']

# Facebook: evaluate one classifier per unordered pair of labels, where the
# labels are the events below plus an extra 'none' class.
path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_event/detectAllEvents_ver2'
events_ = ['complaint', 'compliment', 'skip', 'suggestion', 'wait']
events_.append('none')

for i in range(len(events_)):
    j = i + 1
    for k in range(j, len(events_)):
        print('%s %s' % (events_[i], events_[k]))

        # '<first>_<second>' names both the input file and the run.
        pair_name = events_[i] + '_' + events_[k]
        list_sentences = load_file(path_, pair_name + '.csv')
        print('%s %s' % ('Running event: ', pair_name))

        list_all = load_event_x_y(pair_name, list_sentences, command='')
        X = np.array(list_all[0])
        Y = np.array(list_all[1])

        # Alternative classifiers:
        # clf = MultinomialNB()
        # clf = LinearSVC(C=1.0, random_state=0, class_weight='auto', max_iter=100000)
        clf = LogisticRegression(max_iter=50000, solver='liblinear',
                                 tol=0.000001, class_weight='auto')

        # Alternative evaluation calls:
        # clf_event_running(X, Y, clf, K=5, command='KFold')
        # clf_event_running(path, event, 'LR', X, Y, clf, K=5, command='StratifiedKFold', call='PrintPredicted')
        # clf_event_running(path, event, 'LR', X, Y, clf, K=5, command='StratifiedKFold', call='ProbScore')
        clf_event_running(path_, pair_name, 'LR', X, Y, clf, K=5,
                          command='StratifiedKFold', call='')