def matching(text, command):
    """Return the tokens of every dictionary entry that matches ``text``.

    Args:
        text: The text handed to ``pattern_match`` as its second argument.
        command: ``'road'`` or ``'busstop'``; selects the dictionary file.
            Any other value leaves the dictionary empty.

    Returns:
        A flat list of whitespace-separated tokens taken from the matched
        entries, in dictionary order.
    """
    data_dir = 'D:/Project/Transportation_SMU-NEC_collaboration/Data'
    if command == 'road':
        entries = load_file(data_dir, 'road_abbrevation_all.csv')
    elif command == 'busstop':
        # Original note: delete the header of the bus stop file.
        entries = load_file(data_dir, 'bus_stop_crf.csv')
    else:
        entries = []

    tokens = []
    for entry in entries:
        # An entry holds ';'-separated variants; only the first variant
        # that matches contributes its tokens.
        for variant in entry.lower().split(';'):
            if pattern_match(variant, text) is True:
                tokens.extend(variant.split())
                break
    return tokens
def load_dict(command):
    """Load one of the CRF dictionaries selected by ``command``.

    Args:
        command: ``'svc'``, ``'road'``, ``'busstop'`` or ``'busstopCode'``.

    Returns:
        For the first three commands, whatever ``load_file`` returns for the
        matching dictionary file. For ``'busstopCode'``, a list with the
        first tab-separated column of every row of ``bus_stop.csv`` except
        the first row. For any other command, the string
        ``'You need to give correct command'``.
    """
    path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF/crf_features'
    plain_dictionaries = (
        ('svc', 'dict_bussvc.txt'),
        ('road', 'dict_road.txt'),
        ('busstop', 'dict_busstop.txt'),  # only contains bus stop name
    )
    for key, filename in plain_dictionaries:
        if command == key:
            return load_file(path, filename)

    if command == 'busstopCode':
        all_stop = load_file(path, 'bus_stop.csv')
        # Row 0 is skipped; the stop code is the first column.
        return [row.split('\t')[0] for row in all_stop[1:]]

    return 'You need to give correct command'
def kapp_sentiment_event(path_sent, path_event, sentiment, event):
    """Print the ``kappa`` score between a sentiment label file and an event file.

    Args:
        path_sent: Directory of ``allTweets_ver3_sentLabel_<sentiment>.csv``.
        path_event: Directory of ``<event>.csv``.
        sentiment: Sentiment name used to build the sentiment file name.
        event: Event name used to build the event file name.

    The sentiment label is read from column 0 of each sentiment row and the
    ground truth from column 1 of the event row at the same index.
    """
    sent_rows = load_file(path_sent, 'allTweets_ver3_sentLabel_' + sentiment + '.csv')
    event_rows = load_file(path_event, event + '.csv')

    predicted, truth = [], []
    for idx, sent_row in enumerate(sent_rows):
        sent_fields = sent_row.split('\t')
        event_fields = event_rows[idx].split('\t')
        predicted.append(int(sent_fields[0]))
        truth.append(int(event_fields[1]))

    score = kappa(truth, predicted)
    print('Kappa score of ' + sentiment + ' and ' + event + ':' + '\t' + str(score))
def count_sentiment_event(path_sent, path_event, sentiment, event):
    """Print how many rows are labelled 1 in both the sentiment and event files.

    Args:
        path_sent: Directory of ``allTweets_ver3_sentLabel_<sentiment>.csv``.
        path_event: Directory of ``<event>.csv``.
        sentiment: Sentiment name used to build the sentiment file name.
        event: Event name used to build the event file name.

    The sentiment label is column 0 of each sentiment row; the event label is
    column 1 of the event row at the same index.
    """
    sent_rows = load_file(path_sent, 'allTweets_ver3_sentLabel_' + sentiment + '.csv')
    event_rows = load_file(path_event, event + '.csv')

    predicted, truth = [], []
    for idx, sent_row in enumerate(sent_rows):
        sent_fields = sent_row.split('\t')
        event_fields = event_rows[idx].split('\t')
        predicted.append(int(sent_fields[0]))
        truth.append(int(event_fields[1]))

    count = sum(1 for gt, label in zip(truth, predicted) if gt == 1 and label == 1)
    print('Count of ' + sentiment + ' and ' + event + ':' + '\t' + str(count))
def load_predLabel(command):
    """Load the predicted-label file for one data source.

    Args:
        command: ``'twitter'``, ``'sgforums'`` or ``'facebook'``.

    Returns:
        Whatever ``load_file`` returns for the matching
        ``pred_label_<source>.csv`` file.

    Raises:
        ValueError: If ``command`` is not one of the supported sources.
            (Previously this surfaced as an UnboundLocalError on return.)
    """
    locations = (
        ('twitter',
         'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/twitter',
         'pred_label_twitter.csv'),
        ('sgforums',
         'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/sgforums',
         'pred_label_sgforums.csv'),
        ('facebook',
         'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/facebook',
         'pred_label_facebook.csv'),
    )
    for source, path, name in locations:
        if command == source:
            return load_file(path, name)
    raise ValueError('Unknown command %r; expected twitter, sgforums or facebook' % (command,))
def load_dic_token_bef(command):
    """Load the "token before" dictionary for one entity type.

    Args:
        command: ``'svc'``, ``'road'`` or ``'busstop'``.

    Returns:
        Whatever ``load_file`` returns for the matching ``tok_bef_*.txt``
        file, or an empty list for any other command.
    """
    path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF/crf_features'
    token_files = (
        ('svc', 'tok_bef_bussvc.txt'),
        ('road', 'tok_bef_road.txt'),
        ('busstop', 'tok_bef_busstop.txt'),
    )
    for key, filename in token_files:
        if command == key:
            return load_file(path, filename)
    return []
def clf_one_one(path, events, numFold):
    """Run one-vs-one event classification per fold and print a confusion matrix.

    For every fold, one LogisticRegression is built for each unordered pair
    of events, the per-pair predictions are concatenated element-wise as
    strings, ``find_max_label`` picks the final label, and the result of
    ``confusion_matrix`` is printed one tab-separated row per line.

    Args:
        path: Directory holding the ``<numFold>Folds_*`` training/testing files.
        events: List of event names. ``'none'`` is appended to this list in
            place, so the caller's list is modified.
        numFold: Number of folds to iterate over.

    The training files and pair names are now built from the ``path`` and
    ``events`` parameters; the previous body read ``path_`` and ``events_``,
    which are not defined inside this function.
    """
    events.append('none')
    for fold in range(numFold):
        label_test, sent_test = testing_data(path, events, fold, numFold)
        print('%s %s' % (len(label_test), len(sent_test)))

        list_pred = []
        for j in range(len(events)):
            for k in range(j + 1, len(events)):
                pair = events[j] + '_' + events[k]
                sent_train = load_file(
                    path,
                    str(numFold) + 'Folds_' + pair + '_training_' + str(fold) + '.csv')
                # Two spaces: the Python 2 print statement separated its two
                # arguments with a space after the literal's own trailing one.
                print('Running event:  ' + pair + ':Fold_index_' + str(fold))
                list_all = load_event_x_y(pair, sent_train, command='')
                X, Y = np.array(list_all[0]), np.array(list_all[1])
                # Alternatives tried earlier: MultinomialNB() and
                # LinearSVC(C=1.0, random_state=0, class_weight='auto', max_iter=100000).
                # NOTE(review): class_weight='auto' appears to be rejected by
                # newer scikit-learn releases - confirm the installed version.
                clf = LogisticRegression(max_iter=50000, solver='liblinear',
                                         tol=0.000001, class_weight='auto')
                list_pred.append(clf_event_one_one(clf, X, Y, sent_test, j, k))

        # Concatenate the per-pair predictions position by position.
        list_join = []
        for m, preds in enumerate(list_pred):
            if m == 0:
                list_join = preds
            else:
                list_join = [str(x) + str(y) for x, y in zip(list_join, preds)]

        pred_label = find_max_label(list_join, events)
        for row in confusion_matrix(pred_label, label_test):
            print('\t'.join(str(value) for value in row).strip())
def road_structure(path, name):
    """Build the header/route structure of roads and bus stops.

    Rows are tab-separated. A row whose second column is ``"subhead2"``
    starts a new header (first and third columns joined by a tab); a row
    whose second column is ``"route"`` adds its third column to the current
    group of routes.

    Args:
        path: Directory of the input file.
        name: Input file name.

    Returns:
        A list of ``"<header>\\t<route>"`` strings, pairing the i-th header
        with every route of the i-th non-empty route group. Each pair and the
        two list lengths are also printed.
    """
    rows = load_file(path, name)
    headers = []
    route_groups = []
    current_routes = []
    for row in rows:
        fields = row.split("\t")
        kind = fields[1]
        if kind == "subhead2":
            headers.append(fields[0] + "\t" + fields[2])
            if current_routes:
                route_groups.append(current_routes)
                current_routes = []
        elif kind == "route":
            current_routes.append(fields[2])

    # Catch the last group of routes.
    if current_routes:
        route_groups.append(current_routes)

    print(len(headers))
    print(len(route_groups))

    list_struct = []
    for index, header in enumerate(headers):
        for route in route_groups[index]:
            entry = header + "\t" + route
            list_struct.append(entry)
            print(entry)
    return list_struct
def graph_constructed(path, name):
    """Build a weighted directed graph from a tab-separated file and save it as GraphML.

    Each row is ``target \\t source \\t type \\t weight``. The first time a
    source is seen it is added with ``type_`` set to the row's type; the
    first time a target is seen it is added with ``type_='target'``. Every
    row adds a weighted edge from source to target.

    Args:
        path: Directory of the input file.
        name: Input file name; ``'.csv'`` is removed from it to form the
            output name ``d:/<name>.graphml``.

    Fix: the target branch used to call ``add_node(source, type_='target')``,
    which overwrote the source node's type and never tagged the target.
    """
    rows = load_file(path, name)
    seen_sources = set()
    seen_targets = set()
    DG = nx.DiGraph()
    for row in rows:
        fields = row.strip().split('\t')
        target, source, node_type = fields[0], fields[1], fields[2]
        weight = int(fields[3])
        if source not in seen_sources:
            seen_sources.add(source)
            DG.add_node(source, type_=node_type)
        if target not in seen_targets:
            seen_targets.add(target)
            DG.add_node(target, type_='target')
        DG.add_weighted_edges_from([(source, target, weight)])
    nx.write_graphml(DG, 'd:/' + name.replace('.csv', '') + '.graphml')
def road_abb(path, name, list_all):
    """Print each road name with its words replaced by abbreviations.

    Args:
        path: Directory of the road file.
        name: Road file name.
        list_all: Two parallel sequences; ``list_all[0]`` holds the
            abbreviations and ``list_all[1]`` the words they replace.

    Every road is lower-cased, then for each word that ``pattern_match``
    reports in it, the word is replaced by its abbreviation. The abbreviated
    road is printed, followed later by ``"<road>;<abbreviated road>"`` lines.
    Nothing is returned.
    """
    list_road = load_file(path, name)
    list_abb, list_word = list_all[0], list_all[1]

    list_new_road = []
    for road in list_road:
        new_road = road.lower()
        for index, raw_word in enumerate(list_word):
            word = raw_word.lower()
            abb = list_abb[index].lower()
            # Kept as an explicit comparison with True, as in the original.
            if pattern_match(word, new_road) == True:
                new_road = new_road.replace(word, abb)
        list_new_road.append(new_road)
        print(new_road)

    for original, shortened in zip(list_road, list_new_road):
        print(original.lower() + ";" + shortened)
def load_text_CRF_label(path, name, command):
    """Load alternating text/label lines and extract the tokens labelled ``'1'``.

    Even-indexed lines of the file are texts and odd-indexed lines are the
    matching whitespace-separated label sequences.

    Args:
        path: Directory of the labelled file.
        name: Labelled file name.
        command: Passed through to ``filter_eachToken`` for each kept token.

    Returns:
        ``[texts, labels, services]`` where ``services[i]`` is the
        space-joined filtered tokens of text ``i`` whose label is ``'1'``,
        or the string ``'None'`` when no token carries that label.
    """
    list_file = load_file(path, name)
    list_text = list_file[0::2]
    list_label = list_file[1::2]

    list_svc = []
    for i, text in enumerate(list_text):
        split_text, split_label = text.split(), list_label[i].split()
        # Get the tokens for bus services.
        picked = []
        for j, token in enumerate(split_text):
            if split_label[j] == '1':
                picked.append(filter_eachToken(token, command))
        if picked:
            list_svc.append(' '.join(picked).strip())
        else:
            list_svc.append('None')

    return [list_text, list_label, list_svc]
def insert_sql_json(path, name, data):
    """Bulk-insert the rows of a tab-separated file into the archive database.

    Args:
        path: Directory of the input file.
        name: Input file name.
        data: ``'twitter'`` (rows go to ``tweets``) or ``'facebook'`` (rows
            go to ``fb_posts``; ``'+0000'`` is removed from the third column).

    Raises:
        ValueError: If ``data`` is neither ``'twitter'`` nor ``'facebook'``.
            This is checked before the file is read or a connection opened.

    The first four columns of every row are inserted with one
    ``executemany`` call. The connection is now closed even when the insert
    or the commit fails.
    """
    if data == 'twitter':
        sql = "INSERT INTO tweets(tweetID, screenName, createdAt, tweetText) VALUE(%s,%s,%s,%s);"
    elif data == 'facebook':
        sql = "INSERT INTO fb_posts(postID, name, createdTime, post) VALUE(%s,%s,%s,%s);"
    else:
        raise ValueError('Unknown data source %r; expected twitter or facebook' % (data,))

    list_insert = []
    for value in load_file(path, name):
        fields = value.split('\t')
        record_id, author, created, body = fields[0], fields[1], fields[2], fields[3]
        if data == 'facebook':
            created = created.replace('+0000', '')
        list_insert.append((record_id, author, created, body))

    # NOTE(review): host and credentials are hard-coded in source; they
    # should be moved to configuration outside the repository.
    db = MySQLdb.connect(host='10.4.8.139',  # your host, usually localhost
                         user='******',  # your username
                         passwd='Bussense2016!',  # your password
                         db='bussense_archive')  # name of the data base
    try:
        cur = db.cursor()
        print(sql)
        cur.executemany(sql, list_insert)
        db.commit()
    finally:
        db.close()
    print('Finish insert data to sql')
def events_all(path, events):
    """Write one combined file for every pair of event classes, including 'none'.

    For each event, ``<event>.csv`` is loaded and split by ``events_none``
    into event rows and none rows. The 'none' class is the intersection of
    the none rows of all events. Every unordered pair of classes is then
    written to ``<first>_<second>``, with the second class's rows passed
    through ``convert_list_``.

    Args:
        path: Directory of the event files; also where the output is written.
        events: List of event names. ``'none'`` is appended to this list in
            place, so the caller's list is modified.

    Fix: a single event used to raise IndexError and an empty list
    UnboundLocalError. One event now uses its own (de-duplicated) none rows
    and no events yields an empty 'none' class.
    """
    list_events, list_nones = [], []
    for event in events:
        rows = load_file(path, event + '.csv')
        list_event, list_none = events_none(rows)
        list_events.append(list_event)
        list_nones.append(list_none)

    # Same sequence of set operations as before for two or more events, so
    # the order of the resulting list is unchanged.
    if not list_nones:
        shared_none = []
    elif len(list_nones) == 1:
        shared_none = list(set(list_nones[0]))
    else:
        shared_none = list(set(list_nones[0]).intersection(list_nones[1]))
        for other in list_nones[2:]:
            shared_none = list(set(shared_none).intersection(other))

    list_events.append(shared_none)
    events.append('none')
    for group in list_events:
        print(len(group))

    for i, first in enumerate(list_events):
        for k in range(i + 1, len(list_events)):
            second = convert_list_(list_events[k], events[k])
            new_list = first + second
            print(events[i] + ' ' + events[k])
            write_file(path, events[i] + '_' + events[k], new_list)
def retrieve_sents(path, name):
    """Return the third tab-separated column of every row of a tweet file.

    Args:
        path: Base directory; the file is read from ``/allTweets_ver3/<name>``
            relative to it.
        name: File name.

    Returns:
        A list with the third column of each row. The number of rows is
        printed as well.
    """
    sents = load_file(path, '/allTweets_ver3/' + name)
    sents_only = [sent.split('\t')[2] for sent in sents]
    print(len(sents))
    return sents_only
def featuers_CRF(files, path):
    """Read all CRF feature files and collect their contents.

    Args:
        files: Iterable of file names.
        path: Directory holding the files.

    Returns:
        A list with one ``load_file`` result per file, in the given order.
    """
    return [load_file(path, filename) for filename in files]
def load_data_ID(path, name):
    """Return the first tab-separated column of every row of a file.

    Args:
        path: Directory of the file.
        name: File name.

    Returns:
        A list of the first-column values, one per row.
    """
    return [row.split('\t')[0] for row in load_file(path, name)]
def testing_data(path, events, indexFold, numFold):
    """Load the testing rows of one fold for every event.

    Args:
        path: Directory holding ``<numFold>Folds_<event>_testing_<indexFold>.csv``.
        events: Sequence of event names.
        indexFold: Index of the fold to load.
        numFold: Total number of folds (part of the file name).

    Returns:
        ``(labels, sents)``: for each row, the event's position in ``events``
        as a string, and the row's third tab-separated column.
    """
    labels, sents = [], []
    for position, event in enumerate(events):
        filename = str(numFold) + 'Folds_' + event + '_testing_' + str(indexFold) + '.csv'
        label = str(position)
        for row in load_file(path, filename):
            labels.append(label)
            sents.append(row.split('\t')[2])
    return labels, sents
def event_sentiment(path, event, ftr_list):
    """Write ``<event>_sentiment`` with the text column replaced by a feature.

    Args:
        path: Base directory; files live in its ``allTweets_ver3`` subfolder.
        event: Event name; ``<event>.csv`` is read.
        ftr_list: Sequence indexed like the rows of the event file; entry
            ``i`` becomes the third column of output row ``i``.

    Each output row is the first two tab-separated columns of the input row
    followed by the matching entry of ``ftr_list``.
    """
    path_event = path + '/allTweets_ver3'
    rows = load_file(path_event, event + '.csv')
    new_list = []
    for idx, row in enumerate(rows):
        fields = row.split('\t')
        new_list.append(fields[0] + '\t' + fields[1] + '\t' + ftr_list[idx])
    write_file(path_event, event + '_sentiment', new_list)
def convert_traj(path, name):
    """Convert a trajectory file into numbered ``"<idx> <a> <b> -1"`` lines.

    The first row of the file is skipped. Each remaining row contributes its
    first two whitespace-separated fields, prefixed by a zero-based index and
    followed by ``-1``. A final ``"-1"`` entry closes the list.

    Args:
        path: Directory of the trajectory file.
        name: Trajectory file name.

    Returns:
        The list of converted lines.
    """
    traj = load_file(path, name)
    convert = []
    for idx, row in enumerate(traj[1:]):
        fields = row.split()
        convert.append(' '.join([str(idx), fields[0], fields[1], str(-1)]))
    convert.append(str(-1))
    return convert
def combine_mult_file(path, name, enum):
    """Concatenate ``<name>_1.csv`` .. ``<name>_<enum>.csv`` into one file.

    Args:
        path: Directory of the part files and of the combined output.
        name: Common file-name prefix; also the output name.
        enum: Number of part files, numbered from 1.

    Prints each part's index and length, then the combined length, and
    writes the combined rows with ``write_file``.
    """
    combined = []
    for index in range(1, enum + 1):
        part = load_file(path, name + '_' + str(index) + '.csv')
        combined.extend(part)
        print('%s %s' % (index, len(part)))
    print(len(combined))
    write_file(path, name, combined)
def check_abb(path, name):
    """Print the distinct tab-separated column counts found in a file.

    Args:
        path: Directory of the file.
        name: File name.

    The counts are printed as a list, in order of first appearance.
    """
    column_counts = []
    for row in load_file(path, name):
        width = len(row.split('\t'))
        if width not in column_counts:
            column_counts.append(width)
    print(column_counts)
def loading_reg_ftr(list_line_, command):
    """Load the regex feature files of one data source and run ``CRF_F1_reg``.

    Args:
        list_line_: Passed unchanged as the first argument of ``CRF_F1_reg``.
        command: ``'sgforums'``, ``'twitter'`` or ``'facebook'``. Any other
            value prints a message and calls ``quit()``.

    The service, road and bus stop feature files are each loaded, passed
    through ``convert_list_CRF`` and then ``convert_label_CRF`` with the
    matching tag.
    """
    feature_dirs = (
        ('sgforums', 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF/crf_features/features'),
        ('twitter', 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLink'),
        ('facebook', 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF/crf_features/features'),
    )
    path_ = None
    for source, directory in feature_dirs:
        if command == source:
            path_ = directory
            break
    if path_ is None:
        print('You need to give the correct command')
        quit()

    def _load_feature(filename, tag):
        # Load one feature file and convert it to labels for ``tag``.
        return convert_label_CRF(convert_list_CRF(load_file(path_, filename)), tag)

    list_svc = _load_feature('ftr_reg_svc.csv', 'svc')
    list_road = _load_feature('ftr_reg_match_road.csv', 'road')
    list_busstop = _load_feature('ftr_reg_match_busstop.csv', 'busstop')
    CRF_F1_reg(list_line_, list_svc, list_road, list_busstop)
def road_service_street(path, name):
    """Collect the distinct third-column values of ``"subhead2"`` rows.

    Args:
        path: Directory of the file.
        name: File name.

    Returns:
        The distinct values in order of first appearance. The list and its
        length are printed as well.
    """
    streets = []
    for row in load_file(path, name):
        fields = row.split("\t")
        # The third column is read first, as in the original condition.
        street = fields[2]
        if street not in streets and fields[1] == "subhead2":
            streets.append(street)
    print(streets)
    print(len(streets))
    return streets
def road_service(path, name):
    """Collect the distinct first-column values of a tab-separated file.

    Args:
        path: Directory of the file.
        name: File name.

    Returns:
        The distinct values in order of first appearance. The list and its
        length are printed as well.
    """
    services = []
    for row in load_file(path, name):
        service = row.split("\t")[0]
        if service not in services:
            services.append(service)
    print(services)
    print(len(services))
    return services
def load_data(path, name, command):
    """Return the fourth tab-separated column of every row of a file.

    Args:
        path: Directory of the file.
        name: File name.
        command: When equal to ``'preprocessText'``, each text is passed
            through ``clean_url`` before being collected.

    Returns:
        A list of texts, one per row.
    """
    preprocess = command == 'preprocessText'
    X = []
    for row in load_file(path, name):
        text = row.split('\t')[3]
        X.append(clean_url(text) if preprocess else text)
    return X
def construct_oldfeatures(path, files, path_write):
    """Rewrite feature files with a tab between every character of each line.

    Args:
        path: Directory of the input files.
        files: Iterable of input file names.
        path_write: Directory the converted files are written to; each output
            name is the input name with ``'.csv'`` removed.

    Each line is expanded character by character, joined with tabs, and
    stripped of leading/trailing whitespace. Every file name is printed
    before its output is written.
    """
    for filename in files:
        converted = ['\t'.join(line).strip() for line in load_file(path, filename)]
        print(filename)
        write_file(path_write, filename.replace('.csv', ''), converted)
def ftr_sentiment(path, name):
    """Append a ``sentiment_<label>`` token to every sentence of a file.

    Rows are tab-separated with the sentiment in the first column and the
    sentence in the second.

    Args:
        path: Directory of the file.
        name: File name.

    Returns:
        A list of ``"<stripped sentence> sentiment_<sentiment>"`` strings.
    """
    ftr = []
    for row in load_file(path, name):
        fields = row.split('\t')
        sentiment, sentence = fields[0], fields[1]
        ftr.append(sentence.strip() + ' sentiment_' + str(sentiment))
    return ftr
def load_sentiment(path, name, sentiment_label):
    """Write a binary-labelled copy of a sentiment file for one label.

    Rows are tab-separated with an integer sentiment in the first column and
    the sentence in the second. Each output row is ``'1'`` or ``'0'``, a tab,
    and the sentence; ``'1'`` means the row's sentiment belongs to
    ``sentiment_label``.

    Args:
        path: Directory of the input file; also where the output is written.
        name: Input file name.
        sentiment_label: One of ``'veryNeg'`` (0), ``'Neg'`` (1),
            ``'Neutral'`` (2), ``'Pos'`` (3), ``'veryPos'`` (4),
            ``'veryNeg_Neg'`` (0 or 1), ``'Pos_veryPos'`` (3 or 4).

    Raises:
        ValueError: If ``sentiment_label`` is not one of the names above.
            (Previously this failed with an unbound local inside the loop.)

    The output goes to ``allTweets_ver3_sentLabel_<sentiment_label>`` and the
    number of rows is printed.
    """
    positive_scores = {
        'veryNeg': (0,),
        'Neg': (1,),
        'Neutral': (2,),
        'Pos': (3,),
        'veryPos': (4,),
        'veryNeg_Neg': (0, 1),
        'Pos_veryPos': (3, 4),
    }
    if sentiment_label not in positive_scores:
        raise ValueError('Unknown sentiment_label %r' % (sentiment_label,))
    wanted = positive_scores[sentiment_label]

    list_write = []
    for value in load_file(path, name):
        fields = value.split('\t')
        sentiment, sentence = fields[0], fields[1]
        flag = '1' if int(sentiment) in wanted else '0'
        list_write.append(flag + '\t' + sentence)

    print(len(list_write))
    write_file(path, 'allTweets_ver3_sentLabel_' + sentiment_label, list_write)
def stop_dict():
    """Load the bus stop codes and names from ``bus_stop.csv``.

    The first row of the file is skipped; the code is the first
    tab-separated column and the name the second.

    Returns:
        ``[codes, names]``, two parallel lists.
    """
    rows = load_file('D:/Project/Transportation_SMU-NEC_collaboration/Data', 'bus_stop.csv')[1:]
    codes, names = [], []
    for row in rows:
        fields = row.split('\t')
        code, stop_name = fields[0], fields[1]
        codes.append(code)
        names.append(stop_name)
    return [codes, names]
def loading_target_CRFs(command):
    """Load the CRF target labels of one data source as a NumPy array.

    Args:
        command: ``'twitter'`` (lines filtered with ``'removeLink'``),
            ``'sgforums'`` (lines used as loaded) or ``'facebook'`` (lines
            filtered with ``'removePunc'``).

    Returns:
        ``np.array(load_target_label(lines))`` for the selected source.

    Raises:
        ValueError: If ``command`` is not one of the supported sources.
            (Previously this failed with an unbound local variable.)
    """
    if command == 'twitter':
        path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF'
        name_ = 'labeling_all.txt'
        list_line_ = filterTxt_CRF(load_file(path_, name_), command='removeLink')
    elif command == 'sgforums':
        path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF'
        name_ = 'Label_all_crf.txt'
        list_line_ = load_file(path_, name_)
    elif command == 'facebook':
        path_ = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF'
        name_ = 'label.txt'
        list_line_ = filterTxt_CRF(load_file(path_, name_), command='removePunc')
    else:
        raise ValueError('Unknown command %r; expected twitter, sgforums or facebook' % (command,))

    Y = np.array(load_target_label(list_line_))
    print('Finish loading target label ' + command)
    return Y