def matching(text, command):
    """Return the tokens of every dictionary entry that matches ``text``.

    ``command`` selects the dictionary file: ``'road'`` or ``'busstop'``.
    Any other value leaves the dictionary empty, so ``[]`` is returned.

    Every dictionary line is lower-cased and split on ``';'`` into
    alternatives.  For each line, only the first alternative for which
    ``pattern_match(alternative, text)`` returns ``True`` is kept.  The
    kept alternatives are split on whitespace and returned as one flat
    list of tokens.
    """
    data_dir = 'D:/Project/Transportation_SMU-NEC_collaboration/Data'
    if command == 'road':
        entries = load_file(data_dir, 'road_abbrevation_all.csv')
    elif command == 'busstop':
        # original note: delete the header of the bus stop file
        entries = load_file(data_dir, 'bus_stop_crf.csv')
    else:
        entries = []

    tokens = []
    for entry in entries:
        for alternative in entry.lower().split(';'):
            if pattern_match(alternative, text) is True:
                # only the first matching alternative of a line is used
                tokens.extend(alternative.split())
                break
    return tokens
def load_dict(command):
    """Load the dictionary selected by ``command`` from the CRF feature folder.

    ``'svc'``, ``'road'`` and ``'busstop'`` return the corresponding
    dictionary file as loaded by ``load_file``.  ``'busstopCode'`` reads
    ``bus_stop.csv``, skips its first row and returns the first
    tab-separated field of every remaining row.

    For any other command the string ``'You need to give correct command'``
    is returned instead of a list.
    """
    path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF/crf_features'
    plain_dictionaries = (
        ('svc', 'dict_bussvc.txt'),
        ('road', 'dict_road.txt'),
        ('busstop', 'dict_busstop.txt'),  # only contains bus stop name
    )
    for key, filename in plain_dictionaries:
        if command == key:
            return load_file(path, filename)

    if command == 'busstopCode':
        rows = load_file(path, 'bus_stop.csv')
        # row 0 is skipped; the code is the first column of each row
        return [row.split('\t')[0] for row in rows[1:]]

    return 'You need to give correct command'
def kapp_sentiment_event(path_sent, path_event, sentiment, event):
    """Print the kappa score between a sentiment file and an event file.

    The sentiment label is the first tab-separated field of each row of
    ``allTweets_ver3_sentLabel_<sentiment>.csv``; the event label is the
    second field of the row with the same index in ``<event>.csv``.  Both
    are converted to ``int`` and passed to ``kappa(event_labels,
    sentiment_labels)``.
    """
    sent_rows = load_file(path_sent, 'allTweets_ver3_sentLabel_' + sentiment + '.csv')
    event_rows = load_file(path_event, event + '.csv')

    sent_labels, event_labels = [], []
    for row_no, sent_row in enumerate(sent_rows):
        sent_fields = sent_row.split('\t')
        event_fields = event_rows[row_no].split('\t')
        sent_label, event_label = int(sent_fields[0]), int(event_fields[1])
        sent_labels.append(sent_label)
        event_labels.append(event_label)

    score = kappa(event_labels, sent_labels)
    print('Kappa score of ' + sentiment + ' and ' + event + ':' + '\t' + str(score))
def count_sentiment_event(path_sent, path_event, sentiment, event):
    """Print how many rows are labelled 1 in both the sentiment and event file.

    The sentiment label is the first tab-separated field of each row of
    ``allTweets_ver3_sentLabel_<sentiment>.csv``; the event label is the
    second field of the row with the same index in ``<event>.csv``.
    """
    sent_rows = load_file(path_sent, 'allTweets_ver3_sentLabel_' + sentiment + '.csv')
    event_rows = load_file(path_event, event + '.csv')

    # parse every row first, so a malformed row fails before anything is printed
    sent_labels, event_labels = [], []
    for row_no, sent_row in enumerate(sent_rows):
        sent_fields = sent_row.split('\t')
        event_fields = event_rows[row_no].split('\t')
        sent_label, event_label = int(sent_fields[0]), int(event_fields[1])
        sent_labels.append(sent_label)
        event_labels.append(event_label)

    count = sum(
        1
        for event_label, sent_label in zip(event_labels, sent_labels)
        if event_label == 1 and sent_label == 1
    )
    print('Count of ' + sentiment + ' and ' + event + ':' + '\t' + str(count))
def load_predLabel(command):
    """Load the predicted-label file of one data source.

    Args:
        command: ``'twitter'``, ``'sgforums'`` or ``'facebook'``.

    Returns:
        The result of ``load_file`` for
        ``Data_demo_Dec_2015/<command>/pred_label_<command>.csv``.

    Raises:
        ValueError: if ``command`` is not one of the supported sources.
            (Previously this surfaced as an UnboundLocalError on the
            return statement.)
    """
    base = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/'
    for source in ('twitter', 'sgforums', 'facebook'):
        if command == source:
            return load_file(base + source, 'pred_label_' + source + '.csv')

    raise ValueError(
        "unknown command %r; expected 'twitter', 'sgforums' or 'facebook'" % (command,)
    )
def load_dic_token_bef(command):
    """Load the ``tok_bef_*`` file selected by ``command``.

    Supported commands are ``'svc'``, ``'road'`` and ``'busstop'``; any
    other value yields an empty list.
    """
    path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF/crf_features'
    token_files = (
        ('svc', 'tok_bef_bussvc.txt'),
        ('road', 'tok_bef_road.txt'),
        ('busstop', 'tok_bef_busstop.txt'),
    )
    for key, filename in token_files:
        if command == key:
            return load_file(path, filename)
    return []
def clf_one_one(path, events, numFold):
    """Train one classifier per pair of events and print a confusion matrix per fold.

    For every fold index the test data is loaded with ``testing_data``.
    For every unordered pair of events (with an extra ``'none'`` class
    appended), the training file
    ``<numFold>Folds_<a>_<b>_training_<fold>.csv`` is loaded, a logistic
    regression is built and ``clf_event_one_one`` produces per-sentence
    predictions.  The predictions of all pairs are concatenated per
    sentence as strings, resolved with ``find_max_label`` and compared with
    the test labels through ``confusion_matrix``.  Each matrix row is
    printed tab-separated.

    Args:
        path: folder holding the fold files.
        events: event names; the list is NOT modified.
        numFold: number of folds (also part of the file names).
    """
    # Work on a copy: appending 'none' to the caller's list made repeated
    # calls accumulate several 'none' entries.
    events = list(events) + ['none']

    for i in range(numFold):
        label_test, sent_test = testing_data(path, events, i, numFold)
        print('%s %s' % (len(label_test), len(sent_test)))

        list_pred = []
        for j in range(len(events)):
            for k in range(j + 1, len(events)):
                pair = events[j] + '_' + events[k]
                # Use the function's own arguments; the previous code read
                # the names path_/events_, which are not defined here.
                sent_train = load_file(
                    path,
                    str(numFold) + 'Folds_' + pair + '_training_' + str(i) + '.csv')
                print('Running event:  ' + pair + ':Fold_index_' + str(i))
                list_all = load_event_x_y(pair, sent_train, command='')
                X, Y = np.array(list_all[0]), np.array(list_all[1])
                # alternatives tried previously: MultinomialNB, LinearSVC
                # NOTE(review): class_weight='auto' is presumably only accepted
                # by old scikit-learn releases ('balanced' in newer ones) -
                # confirm against the installed version.
                clf = LogisticRegression(max_iter=50000, solver='liblinear', tol=0.000001, class_weight='auto')
                list_pred.append(clf_event_one_one(clf, X, Y, sent_test, j, k))

        # Concatenate the per-pair predictions sentence by sentence.
        list_join = []
        for m, pred in enumerate(list_pred):
            if m == 0:
                list_join = pred
            else:
                list_join = [str(x) + str(y) for x, y in zip(list_join, pred)]

        pred_label = find_max_label(list_join, events)
        for row in confusion_matrix(pred_label, label_test):
            print('\t'.join(str(value) for value in row).strip())
def road_structure(path, name):
    """Pair every route line with the head ("subhead2") line preceding it.

    Each input line is tab-separated; field 1 is the line kind.  A
    ``"subhead2"`` line starts a new head described by ``field0 + TAB +
    field2``; a ``"route"`` line contributes ``field2`` to the most recent
    head.  Lines of any other kind are ignored.

    Routes are stored together with their head, so a head without routes
    can no longer shift the routes of later heads onto the wrong head (the
    previous index-based pairing did, and could raise IndexError).  Route
    lines that appear before the first head have no head to attach to and
    are skipped.

    Returns:
        A list of ``head + TAB + route`` strings, in file order.  The same
        strings are printed, preceded by the number of heads and the number
        of heads that have at least one route.
    """
    heads = []
    routes_by_head = []  # routes_by_head[i] belongs to heads[i]
    for line in load_file(path, name):
        fields = line.split("\t")
        kind = fields[1]
        if kind == "subhead2":
            heads.append(fields[0] + "\t" + fields[2])
            routes_by_head.append([])
        elif kind == "route" and routes_by_head:
            routes_by_head[-1].append(fields[2])

    print(len(heads))
    # a difference between the two numbers reveals heads without routes
    print(sum(1 for routes in routes_by_head if routes))

    list_struct = []
    for head, routes in zip(heads, routes_by_head):
        for route in routes:
            entry = head + "\t" + route
            list_struct.append(entry)
            print(entry)
    return list_struct
def graph_constructed(path, name):
    """Build a weighted directed graph from an edge file and save it as GraphML.

    Each input line is stripped and split on tabs into
    ``target, source, type, weight``.  The first time a source is seen it
    is added as a node with ``type_`` set to the line's type; the first
    time a target is seen it is added as a node with ``type_='target'``.
    An edge ``source -> target`` with the integer weight is added for
    every line.

    The graph is written to ``d:/<name without '.csv'>.graphml``.
    """
    graph = nx.DiGraph()
    seen_sources = set()
    seen_targets = set()

    for line in load_file(path, name):
        fields = line.strip().split('\t')
        target, source, node_type = fields[0], fields[1], fields[2]
        weight = int(fields[3])

        if source not in seen_sources:
            seen_sources.add(source)
            graph.add_node(source, type_=node_type)

        if target not in seen_targets:
            seen_targets.add(target)
            # Tag the *target* node.  The previous code passed `source`
            # here, which overwrote the source's type with 'target' and
            # left the target node without a type.
            graph.add_node(target, type_='target')

        graph.add_weighted_edges_from([(source, target, weight)])

    nx.write_graphml(graph, 'd:/' + name.replace('.csv', '') + '.graphml')
def road_abb(path, name, list_all):
    """Print abbreviated variants of every road name in a file.

    ``list_all[0]`` holds abbreviations and ``list_all[1]`` the words they
    stand for, paired by index.  Every road is lower-cased; for each word
    for which ``pattern_match(word, road)`` equals ``True`` the word is
    replaced by its abbreviation, and later words see the already
    shortened name.

    All shortened names are printed first, then one line
    ``<lower-cased road>;<shortened road>`` per road.  Nothing is returned.
    """
    roads = load_file(path, name)
    abbreviations, words = list_all[0], list_all[1]

    shortened_roads = []
    for road in roads:
        shortened = road.lower()
        for position, word in enumerate(words):
            word = word.lower()
            abbreviation = abbreviations[position].lower()
            if pattern_match(word, shortened) == True:
                shortened = shortened.replace(word, abbreviation)
        shortened_roads.append(shortened)
        print(shortened)

    for position, shortened in enumerate(shortened_roads):
        print(roads[position].lower() + ";" + shortened)
def load_text_CRF_label(path, name, command):
    """Load alternating text/label lines and extract the tokens labelled '1'.

    Even-indexed lines of the file are texts, odd-indexed lines are their
    whitespace-separated token labels.  For each text, every token whose
    label is ``'1'`` is passed through ``filter_eachToken(token, command)``
    and the results are joined with single spaces.  A text without such
    tokens contributes the string ``'None'``.

    Returns:
        ``[texts, labels, extracted]`` - three parallel lists.
    """
    lines = load_file(path, name)
    texts = lines[0::2]
    labels = lines[1::2]

    extracted = []
    for position, text in enumerate(texts):
        tokens, token_labels = text.split(), labels[position].split()
        picked = []
        for token_no, token in enumerate(tokens):
            # get the token for bus services
            if token_labels[token_no] == '1':
                picked.append(filter_eachToken(token, command))
        extracted.append(' '.join(picked).strip() if picked else 'None')

    return [texts, labels, extracted]
def insert_sql_json(path, name, data):
    """Bulk-insert the rows of a tab-separated file into MySQL.

    Args:
        path, name: location of the input file (read with ``load_file``).
        data: ``'twitter'`` inserts into ``tweets``; ``'facebook'`` inserts
            into ``fb_posts`` and strips ``'+0000'`` from the third field.

    The first four tab-separated fields of every row are inserted.

    Raises:
        ValueError: if ``data`` is neither ``'twitter'`` nor ``'facebook'``.
            This is checked before the file is read or a connection is
            opened; previously it surfaced as a NameError with the
            connection left open.
    """
    if data == 'twitter':
        sql = "INSERT INTO tweets(tweetID, screenName, createdAt, tweetText) VALUE(%s,%s,%s,%s);"
    elif data == 'facebook':
        sql = "INSERT INTO fb_posts(postID, name, createdTime, post) VALUE(%s,%s,%s,%s);"
    else:
        raise ValueError("unknown data source %r; expected 'twitter' or 'facebook'" % (data,))

    list_insert = []
    for value in load_file(path, name):
        fields = value.split('\t')
        item_id, author, created, text = fields[0], fields[1], fields[2], fields[3]
        if data == 'facebook':
            created = created.replace('+0000', '')
        list_insert.append((item_id, author, created, text))
    print(sql)

    # NOTE(review): host and credentials are hard-coded in source; they
    # should come from configuration or the environment.
    db = MySQLdb.connect(host='10.4.8.139',  # your host, usually localhost
                         user='******',  # your username
                         passwd='Bussense2016!',  # your password
                         db='bussense_archive')  # name of the data base
    try:
        cur = db.cursor()
        cur.executemany(sql, list_insert)
        db.commit()
    finally:
        # always release the connection, also when the insert fails
        db.close()
    print('Finish insert data to sql')
def events_all(path, events):
    """Write one combined file for every pair of event classes.

    For each event, ``<event>.csv`` is loaded and split by ``events_none``
    into the event's rows and its "none" rows.  The rows that are "none"
    for every event form an additional class called ``'none'``.  For every
    unordered pair of classes ``(a, b)`` the rows of ``a`` followed by
    ``convert_list_(rows of b, b)`` are written with
    ``write_file(path, a + '_' + b, ...)``.

    The size of every class and the name of every pair are printed.

    Args:
        path: folder of the event files; output is written there too.
        events: event names.  The list is NOT modified (it used to get
            ``'none'`` appended on every call).

    A single event no longer raises IndexError (its own "none" rows are
    used as the common set), and an empty ``events`` no longer raises
    NameError.
    """
    names = list(events)

    list_events, list_nones = [], []
    for event in names:
        rows = load_file(path, event + '.csv')
        list_event, list_none = events_none(rows)
        list_events.append(list_event)
        list_nones.append(list_none)

    # rows that are "none" for every event
    common_none = list_nones[0] if list_nones else []
    for other in list_nones[1:]:
        common_none = list(set(common_none).intersection(other))

    list_events.append(common_none)
    names.append('none')

    for group in list_events:
        print(len(group))

    for i in range(len(list_events)):
        for k in range(i + 1, len(list_events)):
            second = convert_list_(list_events[k], names[k])
            new_list = list_events[i] + second
            print('%s %s' % (names[i], names[k]))
            write_file(path, names[i] + '_' + names[k], new_list)
def retrieve_sents(path, name):
    """Return the third tab-separated field of every row of a file.

    The file is read from ``'/allTweets_ver3/' + name`` relative to
    ``path``.  The number of rows is printed before returning.
    """
    sents = load_file(path, '/allTweets_ver3/' + name)
    sents_only = [sent.split('\t')[2] for sent in sents]
    print(len(sents))
    return sents_only
def featuers_CRF(files, path):
    """Load every CRF feature file in ``files`` from ``path``.

    Returns a list with one ``load_file`` result per file, in the order of
    ``files``.
    """
    return [load_file(path, feature_file) for feature_file in files]
def load_data_ID(path, name):
    """Return the first tab-separated field of every row of a file."""
    return [line.split('\t')[0] for line in load_file(path, name)]
def testing_data(path, events, indexFold, numFold):
    """Collect the test sentences of one fold for all events.

    For the event at position ``p`` the file
    ``<numFold>Folds_<event>_testing_<indexFold>.csv`` is loaded; every row
    contributes the label ``str(p)`` and its third tab-separated field as
    the sentence.

    Returns:
        ``(labels, sents)`` - two parallel lists.
    """
    labels, sents = [], []
    for position, event in enumerate(events):
        filename = str(numFold) + 'Folds_' + event + '_testing_' + str(indexFold) + '.csv'
        for line in load_file(path, filename):
            fields = line.split('\t')
            labels.append(str(position))
            sents.append(fields[2])
    return labels, sents
def event_sentiment(path, event, ftr_list):
    """Write ``<event>_sentiment`` with the third column replaced by ``ftr_list``.

    The file ``<event>.csv`` is read from ``path + '/allTweets_ver3'``.
    For the row at index ``i``, its first two tab-separated fields are kept
    and ``ftr_list[i]`` is appended as third field.  The result is written
    to the same folder with ``write_file``.
    """
    path_event = path + '/allTweets_ver3'
    rows = load_file(path_event, event + '.csv')

    merged = []
    for position, row in enumerate(rows):
        fields = row.split('\t')
        merged.append('\t'.join((fields[0], fields[1], ftr_list[position])))

    write_file(path_event, event + '_sentiment', merged)
def convert_traj(path, name):
    """Convert a trajectory file to ``"<index> <a> <b> -1"`` lines.

    The first row of the file is skipped.  Every remaining row is split on
    whitespace and turned into a line made of a zero-based running index,
    the row's first two fields and ``-1``.  A final ``'-1'`` entry is
    appended to the returned list.
    """
    traj = load_file(path, name)
    converted = []
    for sequence_no, row in enumerate(traj[1:]):
        parts = row.split()
        converted.append(' '.join([str(sequence_no), parts[0], parts[1], '-1']))
    converted.append('-1')
    return converted
def combine_mult_file(path, name, enum):
    """Concatenate ``<name>_1.csv`` .. ``<name>_<enum>.csv`` into ``<name>``.

    The index and row count of every part are printed, followed by the
    total number of rows.  The combined rows are written with
    ``write_file(path, name, ...)``.
    """
    combined = []
    for part_no in range(1, enum + 1):
        part = load_file(path, name + '_' + str(part_no) + '.csv')
        combined.extend(part)
        print('%s %s' % (part_no, len(part)))
    print(len(combined))
    write_file(path, name, combined)
def check_abb(path, name):
    """Print the distinct numbers of tab-separated fields found in a file.

    The counts are listed in the order in which they are first seen.
    """
    field_counts = []
    for row in load_file(path, name):
        width = len(row.split('\t'))
        if width not in field_counts:
            field_counts.append(width)
    print(field_counts)
def loading_reg_ftr(list_line_, command):
    """Load the three regex feature files of a source and score them.

    ``command`` selects the feature folder (``'sgforums'``, ``'twitter'``
    or ``'facebook'``).  For any other value a message is printed and
    ``quit()`` is called.

    The service, road and bus-stop feature files are each loaded, passed
    through ``convert_list_CRF`` and ``convert_label_CRF`` and handed,
    together with ``list_line_``, to ``CRF_F1_reg``.
    """
    feature_dirs = {
        'sgforums': 'D:/Project/Transportation_SMU-NEC_collaboration/Data/sgforums/20152207_singaporebuses_all_posts/labeling_CRF/crf_features/features',
        'twitter': 'D:/Project/Transportation_SMU-NEC_collaboration/Data/twitter/labeling_CRF/crf_features/features_rmLink',
        'facebook': 'D:/Project/Transportation_SMU-NEC_collaboration/Data/facebook/BusNews/labeling_CRF/crf_features/features',
    }
    if command not in feature_dirs:
        print('You need to give the correct command')
        quit()
    path_ = feature_dirs[command]

    feature_files = (
        ('ftr_reg_svc.csv', 'svc'),
        ('ftr_reg_match_road.csv', 'road'),
        ('ftr_reg_match_busstop.csv', 'busstop'),
    )
    converted = []
    for filename, kind in feature_files:
        raw = load_file(path_, filename)
        converted.append(convert_label_CRF(convert_list_CRF(raw), kind))

    list_svc, list_road, list_busstop = converted
    CRF_F1_reg(list_line_, list_svc, list_road, list_busstop)
def road_service_street(path, name):
    """Return the distinct third fields of all "subhead2" rows of a file.

    Rows are tab-separated; a row counts when its second field is
    ``"subhead2"``.  Values are kept in first-seen order.  The list and
    its length are printed before returning.
    """
    streets = []
    for row in load_file(path, name):
        fields = row.split("\t")
        street, kind = fields[2], fields[1]
        if kind == "subhead2" and street not in streets:
            streets.append(street)

    print(streets)
    print(len(streets))
    return streets
def road_service(path, name):
    """Return the distinct first tab-separated fields of a file.

    Values are kept in first-seen order.  The list and its length are
    printed before returning.
    """
    services = []
    for row in load_file(path, name):
        service = row.split("\t")[0]
        if service not in services:
            services.append(service)

    print(services)
    print(len(services))
    return services
def load_data(path, name, command):
    """Return the fourth tab-separated field of every row of a file.

    When ``command`` is ``'preprocessText'`` every value is passed through
    ``clean_url`` first.
    """
    preprocess = command == 'preprocessText'
    X = []
    for line in load_file(path, name):
        text = line.split('\t')[3]
        X.append(clean_url(text) if preprocess else text)
    return X
def construct_oldfeatures(path, files, path_write):
    """Rewrite feature files with a tab between every character.

    For every file in ``files`` each line is expanded so that its
    characters are separated by tabs, then stripped of leading and
    trailing whitespace.  The result is written to ``path_write`` under
    the file name without ``'.csv'``.  Each file name is printed before
    it is written.
    """
    for feature_file in files:
        expanded = [
            '\t'.join(line).strip()
            for line in load_file(path, feature_file)
        ]
        print(feature_file)
        write_file(path_write, feature_file.replace('.csv', ''), expanded)
def ftr_sentiment(path, name):
    """Append a ``sentiment_<label>`` token to every sentence of a file.

    Rows are tab-separated with the sentiment in the first field and the
    sentence in the second.  Each result is the stripped sentence followed
    by ``' sentiment_' + sentiment``.
    """
    features = []
    for row in load_file(path, name):
        fields = row.split('\t')
        sentiment, sentence = fields[0], fields[1]
        features.append(sentence.strip() + ' sentiment_' + str(sentiment))
    return features
def load_sentiment(path, name, sentiment_label):
    """Write a binary label file for one sentiment class (or class pair).

    Rows of the input file are tab-separated with an integer sentiment in
    the first field and the sentence in the second.  Each output row is
    ``'1' + TAB + sentence`` when the sentiment belongs to the classes
    selected by ``sentiment_label`` and ``'0' + TAB + sentence`` otherwise.
    The number of rows is printed and the rows are written to
    ``allTweets_ver3_sentLabel_<sentiment_label>`` in ``path``.

    Args:
        path, name: location of the input file.
        sentiment_label: one of ``'veryNeg'`` (0), ``'Neg'`` (1),
            ``'Neutral'`` (2), ``'Pos'`` (3), ``'veryPos'`` (4),
            ``'veryNeg_Neg'`` (0 or 1), ``'Pos_veryPos'`` (3 or 4).

    Raises:
        ValueError: if ``sentiment_label`` is not one of the names above.
            This is checked before the file is read; previously it
            surfaced as a NameError on the first row.
    """
    positive_classes = {
        'veryNeg': (0,),
        'Neg': (1,),
        'Neutral': (2,),
        'Pos': (3,),
        'veryPos': (4,),
        'veryNeg_Neg': (0, 1),
        'Pos_veryPos': (3, 4),
    }
    if sentiment_label not in positive_classes:
        raise ValueError(
            'unknown sentiment_label %r; expected one of %s'
            % (sentiment_label, ', '.join(sorted(positive_classes))))
    wanted = positive_classes[sentiment_label]

    list_write = []
    for value in load_file(path, name):
        fields = value.split('\t')
        sentiment, sentence = fields[0], fields[1]
        flag = '1' if int(sentiment) in wanted else '0'
        list_write.append(flag + '\t' + sentence)

    print(len(list_write))
    write_file(path, 'allTweets_ver3_sentLabel_' + sentiment_label, list_write)
def stop_dict():
    """Load the bus-stop file and return ``[codes, names]``.

    The first row of ``bus_stop.csv`` is skipped.  For every remaining row
    the first tab-separated field goes into ``codes`` and the second into
    ``names``; the two lists are parallel.
    """
    path = 'D:/Project/Transportation_SMU-NEC_collaboration/Data'
    rows = load_file(path, 'bus_stop.csv')[1:]

    codes, names = [], []
    for row in rows:
        fields = row.split('\t')
        code, stop_name = fields[0], fields[1]
        codes.append(code)
        names.append(stop_name)
    return [codes, names]
def loading_target_CRFs(command):
    """Load the target CRF labels of one data source as a numpy array.

    Args:
        command: ``'twitter'``, ``'sgforums'`` or ``'facebook'``.  The
            twitter lines are passed through ``filterTxt_CRF`` with
            ``command='removeLink'`` and the facebook lines with
            ``command='removePunc'``; the sgforums lines are used as
            loaded.

    Returns:
        ``np.array(load_target_label(lines))``.

    Raises:
        ValueError: if ``command`` is not a supported source.  (Previously
            this surfaced as an UnboundLocalError.)
    """
    data_root = 'D:/Project/Transportation_SMU-NEC_collaboration/Data'
    if command == 'twitter':
        raw = load_file(data_root + '/twitter/labeling_CRF', 'labeling_all.txt')
        list_line_ = filterTxt_CRF(raw, command='removeLink')
    elif command == 'sgforums':
        list_line_ = load_file(
            data_root + '/sgforums/20152207_singaporebuses_all_posts/labeling_CRF',
            'Label_all_crf.txt')
    elif command == 'facebook':
        raw = load_file(data_root + '/facebook/BusNews/labeling_CRF', 'label.txt')
        list_line_ = filterTxt_CRF(raw, command='removePunc')
    else:
        raise ValueError(
            "unknown command %r; expected 'twitter', 'sgforums' or 'facebook'" % (command,))

    Y = np.array(load_target_label(list_line_))
    print('Finish loading target label ' + command)
    return Y