def event_pred_model(event, X, Y, command):  # note: X is a list and Y is an array
    texts = load_demo_text(command)
    total_X = X + texts
    X_convert, X_pred, total_X_convert = np.array(X), np.array(texts), np.array(total_X)

    MIN_DF = 2
    vec = CountVectorizer(lowercase=True, min_df=MIN_DF)
    vec = vec.fit(total_X_convert)

    X_convert_trans, X_pred_trans = vec.transform(X_convert), vec.transform(X_pred)

    clf.fit(X_convert_trans, Y)  # train the module-level classifier on the labelled texts
    y_pred = clf.predict(X_pred_trans)
    y_prob = clf.decision_function(X_pred_trans)

    # min-max normalise the decision scores to [0, 1] (assumes max_prob != min_prob)
    max_prob, min_prob = max(y_prob), min(y_prob)
    list_write = list()
    for i in range(0, len(y_pred)):
        prob = (y_prob[i] - min_prob) / (max_prob - min_prob)
        print y_pred[i], prob, texts[i]

        # list_write.append(str(y_pred[i]) + '\t' + texts[i])
        list_write.append(str(y_pred[i]))

    if command == 'twitter':
        path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/twitter/events_pred'
        write_file(path_write, event, list_write)
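# Note: event_pred_model references two names it never defines: a module-level
# classifier `clf` (it must expose decision_function(), so a margin-based model)
# and the write_file helper used throughout these examples. A minimal sketch of
# plausible definitions, as assumptions rather than the original code:
from sklearn.svm import LinearSVC

clf = LinearSVC()  # assumption: any sklearn classifier with decision_function() would do

def write_file(path, name, lines):
    # assumed helper: write one list entry per line to path/name
    with open(path + '/' + name, 'w') as f:
        for line in lines:
            f.write(line + '\n')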
def matching_eventText(X, Y, event, call):
    pred = list()
    for value in X:
        if event in value:  # label '1' when the event name occurs in the text
            pred.append('1')
        else:
            pred.append('0')

    matrix = confusion_matrix(pred, Y)  # note: sklearn's convention is confusion_matrix(y_true, y_pred); passing pred first transposes the matrix
    for value in matrix:
        line = ''
        for each in value:
            line = line + str(each) + '\t'
        print line.strip()
    print '----------------'

    list_print = list()
    if call == 'PrintPredicted':
        for index in range(0, len(pred)):
            tweet, pred_value, truth_value = X[index], pred[index], Y[index]
            list_ = [index, pred_value, truth_value, tweet]
            list_print.append(list_)

        list_print = sorted(list_print, key=itemgetter(0))  # sort by index (requires: from operator import itemgetter)
        list_write = list()
        for value in list_print:
            print str(value[0]) + '\t' + str(value[1]) + '\t' + str(value[2]) + '\t' + str(value[3])
            list_write.append(str(value[0]) + '\t' + str(value[1]) + '\t' + str(value[2]) + '\t' + str(value[3]))
        write_file(path, event + '_match', list_write)  # note: `path` is not a parameter here; presumably a module-level global
def writing_pred(path, event, X_id, X, X_pred):
    list_write = list()
    for i in range(0, len(X_id)):
        line = X_id[i] + '\t' + X_pred[i] + '\t' + X[i]
        list_write.append(line)

    write_file(path, 'twitter_event_' + event, list_write)
Example 4
def distinct_list(list_, path, name):
    new_list = list()
    for value in list_:
        if value not in new_list:
            new_list.append(value)
    print len(new_list)
    write_file(path, name.replace('.txt', '') + '_new', new_list)
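# A hypothetical O(n) variant of distinct_list: the membership test on a list
# makes the loop above quadratic; a seen-set keeps the same order-preserving
# behaviour at constant cost per element.
def distinct_list_fast(list_, path, name):
    seen, new_list = set(), list()
    for value in list_:
        if value not in seen:
            seen.add(value)
            new_list.append(value)
    print len(new_list)
    write_file(path, name.replace('.txt', '') + '_new', new_list)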
def events_all(path, events):
    list_events, list_nones = list(), list()
    for event in events:
        list__ = load_file(path, event + '.csv')
        list_event, list_none = events_none(list__)
        list_events.append(list_event), list_nones.append(list_none)
        # print len(list_event)

    # running intersection of the 'none' lists across all events
    first = list_nones[0]
    for i in range(1, len(list_nones)):
        first = list(set(first).intersection(list_nones[i]))

    list_none = first
    list_events.append(list_none)
    events.append('none')

    for event in list_events:
        print len(event)

    for i in range(0, len(list_events)):
        j = i + 1
        for k in range(j, len(list_events)):
            first, second = list_events[i], list_events[k]
            second = convert_list_(second, events[k])
            new_list = first + second
            print events[i], events[k]
            write_file(path, events[i] + '_' + events[k], new_list)
def wordVec_facebook(sents, path_w, name_w, win_size):
    list_all = list()
    for i in range(0, len(sents)):
        split_sent = sents[i].split()
        tokens = list()
        for token in split_sent:
            token_filter = filter_eachTok_rmLinks(token, 'model')
            if len(token_filter) > 0:
                tokens.append(token_filter.lower())
        print i
        list_all.append(tokens)

    # note: gensim's `size` argument is the embedding dimensionality, not the
    # window size, so `win_size` is a misleading name; window=5 is the context window
    model = gensim.models.Word2Vec(list_all, size=win_size, window=5, min_count=1, workers=5)
    print model.most_similar(['bus'])  # side effect: populates syn0norm, used below

    list_write = list()
    for i in range(0, len(model.index2word)):
        # print model.index2word[i], model.syn0norm[i]
        line = model.index2word[i]
        for value in model.syn0norm[i]:
            line += '\t' + str(value)
        line = line.strip()
        list_write.append(line)
        print line
    write_file(path_w, name_w + '_%i' % win_size, list_write)
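# Version note: model.index2word and model.syn0norm are pre-1.0 gensim
# attributes; on gensim 1.x-3.x the vocabulary and vectors live on model.wv.
# A hypothetical port of the export loop under that assumed API:
def export_wordVec_gensim1(model, path_w, name_w, win_size):
    model.init_sims()  # populate the L2-normalised vectors explicitly
    list_write = list()
    for i in range(0, len(model.wv.index2word)):
        line = model.wv.index2word[i]
        for value in model.wv.syn0norm[i]:
            line += '\t' + str(value)
        list_write.append(line.strip())
    write_file(path_w, name_w + '_%i' % win_size, list_write)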
def construct_ftr_wordVector(dict_w, vec_w, lines, path_write, name_write):
    word_lines = list()
    for i in xrange(0, len(lines), 3):  # presumably every third line of the input holds a text
        split_line = lines[i].lower().split('\t')
        word_lines.append(split_line)

    nftr_wordVec = len(vec_w[0])  # number of features in word vector
    for nfr in xrange(nftr_wordVec):
        frt_wordVec = list()
        for i in xrange(0, len(word_lines)):
            wordvec_score = ''
            w_line = word_lines[i]
            for j in xrange(0, len(w_line)):
                word = w_line[j]
                if word in dict_w:
                    index_ = dict_w.index(word)
                    scores_ = vec_w[index_]
                    ftr_score = scores_[nfr]
                    # print index_, word, ftr_score
                    wordvec_score += ftr_score + '\t'
                else:
                    # retry with an '@' prefix (presumably to recover Twitter handles)
                    word = '@' + word
                    if word in dict_w:
                        index_ = dict_w.index(word)
                        scores_ = vec_w[index_]
                        ftr_score = scores_[nfr]
                        # print index_, word, ftr_score
                        wordvec_score += ftr_score + '\t'
                    else:
                        # print word, '0'
                        wordvec_score += '0' + '\t'
            frt_wordVec.append(wordvec_score)
        # print len(frt_wordVec)
        # all_ftrWordVec.append(frt_wordVec)
        write_file(path_write, name_write + '_%i' % nfr, frt_wordVec)
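# Note: dict_w.index(word) above is a linear scan per token, so the triple loop
# is slow on a large vocabulary. A hypothetical one-off index map keeps the rest
# of the function unchanged while making each lookup O(1):
def build_word_index(dict_w):
    index_of = dict()
    for i in xrange(0, len(dict_w)):
        index_of[dict_w[i]] = i
    return index_of
# usage sketch: index_of = build_word_index(dict_w), then
# index_ = index_of.get(word, -1) instead of dict_w.index(word)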
def pattern_services(path, name, list_bus_services, list_posts_checkBusServices):
    list_write = []
    with open(path + '/' + name) as f:        
        for line in f:
            split_line = line.split('\t')
            if (name == 'posts_filter_v2.csv'):
                list_pattern_services = pattern_bus_service(split_line[1], list_bus_services)
                list_match_services = []
                if (split_line[0] in list_posts_checkBusServices):
                    list_match_services = match_bus_service(split_line[1], list_bus_services)
            else:
                list_pattern_services = pattern_bus_service(split_line[2], list_bus_services)
                list_match_services = []
                if (split_line[1] in list_posts_checkBusServices):
                    list_match_services = match_bus_service(split_line[2], list_bus_services)
                
            list_total = list(set(list_pattern_services) | set(list_match_services))
            
            #print (split_line[1] + '\t' + str(len(list_total)))
            
            if (len(list_total) != 0):
                for each in list_total:
                    if (name == 'posts_filter_v2.csv'):
                        print (split_line[0] + '\t' + each)
                        list_write.append(split_line[0] + '\t' + each)
                    else:
                        print (split_line[1] + '\t' + each)
                        list_write.append(split_line[1] + '\t' + each)
                    
    write_file(path, 'posts_busService', list_write)
    return list_write
def groupedEvents(path_, list_lbl, events, names, number, command):
    list_sents = get_sentence(list_lbl, number)
    for index in range(0, len(events)):
        event, name = events[index], names[index]
        list_lbl_event = give_label_sents_groupEvent(list_sents, event, name, command)
        write_file(path_, name, list_lbl_event)
        print name, len(list_lbl_event)
def originial_token(path, name, original_texts, filtering_texts, labels, command):
    texts_correct, labels_correct = list(), list()
    for index in range(0, len(original_texts)):
        text_org, text_fil, label = original_texts[index], filtering_texts[index], labels[index]
        split_textOrg, split_textFil, split_textLabel = text_org.split(), text_fil.split('\t'), label.split('\t')

        k = 0  # index of text labels
        line_correct, label_correct = '', ''
        for j in range(0, len(split_textOrg)):
            flag = check_token(split_textOrg[j], command)
            if flag is True:
                line_correct += split_textOrg[j] + ' '
                label_correct += '0 '
            else:
                line_correct += split_textOrg[j] + ' '
                if split_textLabel[k] == '1':
                    flag_int = RepresentsInt(filter_eachToken(split_textOrg[j], command))
                    if flag_int is True:
                        label_correct += split_textLabel[k] + ' '
                    else:
                        label_correct += '0 '
                else:
                    label_correct += split_textLabel[k] + ' '
                k += 1
        texts_correct.append(line_correct.strip()), labels_correct.append(label_correct.strip())

    list_write = list()
    for i in range(0, len(texts_correct)):
        list_write.append(texts_correct[i])
        list_write.append(labels_correct[i])
        # list_write.append('\n')

    write_file(path, name + '_' + command, list_write)
Example 11
def add_hour_dof():
    file = 'C:/Users/vdthoang/Google Drive/LARC - NEC Project/icwsm2016/data/tweet_short_event_tagged_for_icwsm2016.json'

    start = time.time()
    df = pd.read_json(file)
    end = time.time()
    print end - start  # seconds spent loading the JSON

    df['hour'] = df['createAtMilis'].map(lambda x: (pd.to_datetime(x, unit='ms').hour + 8) % 24)
    df['dow'] = df['createAtMilis'].map(lambda x: pd.to_datetime(x, unit='ms').dayofweek)
    # df['woy'] = df['createAtMilis'].map(lambda x: pd.to_datetime(x, unit='ms').weekofyear)

    list_id = df['id']
    list_hour = df['hour']
    list_dw = df['dow']


    list_write = list()
    for i in range(0, len(list_dw)):
        print str(list_id[i]) + '\t' + str(list_hour[i]) + '\t' + str(list_dw[i])
        list_write.append(str(list_id[i]) + '\t' + str(list_hour[i]) + '\t' + str(list_dw[i]))

    path_write = 'C:/Users/vdthoang/Google Drive/LARC - NEC Project/icwsm2016/data'
    name_write = 'twitter_hour_dow'
    write_file(path_write, name_write, list_write)
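# A hypothetical vectorised form of the two .map() calls above: convert the
# column once, then use the .dt accessor; the +8 presumably shifts UTC to
# UTC+8 (Singapore) local time.
def add_hour_dow_fast(df):
    ts = pd.to_datetime(df['createAtMilis'], unit='ms')
    df['hour'] = (ts.dt.hour + 8) % 24
    df['dow'] = ts.dt.dayofweek
    return df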
def pattern_plate(path, name):
    list_write = []
    with open(path + '/' + name) as f:
        i = 0
        for line in f:
            if (i != 0):  # skip the header row
                split_line = line.split('\t')
                list_busPlate = pattern_busPlate(check_busPlate(split_line[1]))
                if (len(list_busPlate) != 0):
                    for each in list_busPlate:
                        list_write.append(split_line[0] + '\t' + each)
                    
                    #print (split_line[0] + '\t' + str(len(pattern_busPlate(split_line[1]))))
            i += 1
            
    for value in list_write:
        print (value)
        
    # write_file(path, 'posts_busPlate', list_write)
    # write_file(path, 'posts_busPlate_v2', list_write)
    # write_file(path, 'tweet_2015_filtering_busPlate', list_write)
    # write_file(path, 'facebook_2015_filtering_busPlate', list_write)
    # write_file(path, 'facebook_2015_BusTransport_filtering_busPlate', list_write)
    write_file(path, 'facebook_2015_BusNews_filtering_busPlate', list_write)
        
    print (len(list_write))
Example 13
def combine_text_event(path_write, list_text, list_event, name_write):
    print len(list_text), len(list_event)
    list_all = list()
    for i in range(0, len(list_text)):
        combine = list_text[i] + '\t' + list_event[i]
        list_all.append(combine)

    write_file(path_write, name_write, list_all)
Example 14
def event_sentiment(path, event, ftr_list):
    path_event = path + '/allTweets_ver3'
    list_ = load_file(path_event, event + '.csv')
    new_list = list()
    for i in range(0, len(list_)):
        split_value = list_[i].split('\t')
        new_list.append(split_value[0] + '\t' + split_value[1] + '\t' + ftr_list[i])

    write_file(path_event, event + '_sentiment', new_list)
def write_file_training(totalFold, numFold, training, path, events):
    for i in range(0, len(training)):
        j = i + 1
        for k in range(j, len(training)):
            first, second = training[i], training[k]
            second = convert_list_(second, events[k])
            new_list = first + second
            print events[i], events[k]
            write_file(path, str(totalFold) + 'Folds_' + events[i] + '_' + events[k] + '_training_' + str(numFold), new_list)
def write_pred_event(path, id_, text_, pred, event_1, event_2):
    list_write = list()
    for i in range(0, len(id_)):
        line = ''
        if int(pred[i]) == 0:
            line = id_[i] + '\t' + text_[i] + '\t' + event_2
        else:
            line = id_[i] + '\t' + text_[i] + '\t' + event_1
        list_write.append(line)
    write_file(path, 'pred_' + event_1 + '_' + event_2, list_write)
Example 17
def combine_mult_file(path, name, enum):
    list_files = list()

    for index in range(1, (enum + 1)):
        file = load_file(path, name + '_' + str(index) + '.csv')
        # file = load_file(path, name + str(index) + '.csv')
        list_files = list_files + file
        print index, len(file)
    print len(list_files)
    write_file(path, name, list_files)
def extract_road_busstop_expression(list_line, list_dict):
    y_label = []
    y_reg = []
    list_svc = []
    cnt = 1

    list_write = []
    for line in list_line:
        split_line = line.split('\t')

        index = split_line[0]
        label = split_line[1].strip()
        y_label.append(label)
        svc = split_line[2].strip()
        list_svc.append(svc)
        text = split_line[3].strip().lower()  # this is a text for road or bus stop
        # print index, label, svc

        list_road_match = []
        for d_index in range(0, len(list_dict)):
            road = list_dict[d_index]
            split_road = road.split(';')
            for token in split_road:
                if pattern_match(token.lower(), text) is True:
                    split_token = token.split()
                    for value in split_token:
                        if value not in list_road_match:
                            list_road_match.append(value.lower())
                    break

        flag = 'TRUE' if svc in list_road_match else 'FALSE'
        y_reg.append(flag)

        print '-- finished this line -- %i' % cnt + '\t' + flag
        list_write.append('-- finished this line -- %i' % cnt + '\t' + flag)
        cnt += 1

    # for value in y_reg:
    #     print value

    # for i in range(0, len(y_reg)):
    #     if y_label[i] != y_reg[i]:
    #         print list_svc[i]

    write_file('d:/', 'busstop', list_write)

    print metrics.accuracy_score(y_label, y_reg)
    print metrics.classification_report(y_label, y_reg)
    print metrics.confusion_matrix(y_label, y_reg)
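# pattern_match is not defined in this snippet; one plausible whole-word
# implementation (an assumption, not the original helper):
import re

def pattern_match(token, text):
    # true when `token` occurs in `text` as a whole word
    return re.search(r'\b' + re.escape(token) + r'\b', text) is not None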
Example 19
def stemming_stopWords_text(list_, path, name):
    new_list = list()
    for i in range(0, len(list_)):
        line = list_[i]
        split_line = line.split('\t')
        event, label, text = split_line[0], split_line[1], split_line[2]
        new_text = stemming_text(remove_stopWords(text)).strip()
        new_line = event + '\t' + label + '\t' + new_text
        new_list.append(new_line)
        print i
    write_file(path, name + '_stemming_removeStop', new_list)
Example 20
def subset_tweetID(list_icwsm, list_time):
    list_ = extract_tweet(list_icwsm)
    list_time = extract_tweet(list_time)

    list_union = set(list_) & set(list_time)  # despite the name, this is the intersection of the two ID sets
    print len(list_union)

    for value in list_union:
        print value

    write_file('C:/Users/vdthoang/Google Drive/LARC - NEC Project/icwsm2016/data', 'twitter_correct', list_union)
def construct_oldfeatures(path, files, path_write):
    for f in files:
        list_ = load_file(path, f)
        list_convert = list()

        for line in list_:
            string = ''
            for c in line:
                string += c + '\t'  # one tab-separated column per character
            list_convert.append(string.strip())
        print f
        write_file(path_write, f.replace('.csv', ''), list_convert)
Example 22
def filter_data(path, name):
    # clean the text: strip the first and last characters when they are punctuation

    list_write = []
    with open(path + "/" + name) as f:
        for line in f:

            split_line = line.split("\t")
            print(split_line[0] + "\t" + filter_token(split_line[1]))
            list_write.append(split_line[0] + "\t" + filter_token(split_line[1]))

    write_file(path, "posts_filter", list_write)  # extract texts and write it on csv file
def load_sentiment(path, name, sentiment_label):
    # map each binary task to the sentiment score(s) it treats as positive
    positive_scores = {'veryNeg': (0,), 'Neg': (1,), 'Neutral': (2,), 'Pos': (3,),
                       'veryPos': (4,), 'veryNeg_Neg': (0, 1), 'Pos_veryPos': (3, 4)}
    scores = positive_scores[sentiment_label]

    list_ = load_file(path, name)
    list_write = list()
    for value in list_:
        split_value = value.split('\t')
        sentiment, sentence = split_value[0], split_value[1]
        label = '1' if int(sentiment) in scores else '0'
        list_write.append(label + '\t' + sentence)
    print len(list_write)
    write_file(path, 'allTweets_ver3_sentLabel_' + sentiment_label, list_write)
def bus_stop(path, name):
    # extract the stop number and name of each bus stop
    with open(path + '/' + name) as data_file:    
        data = json.load(data_file)
    
#     print (data)
    list_stop = []
    for stop in data:
        no = stop['no']
        name = stop['name']
        list_stop.append(str(no) + '\t' + name)
        print (str(no) + '\t' + name)
    print (len(list_stop))
    
    write_file(path, 'bus_stop', list_stop)  # write the extracted stops to a csv file
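# The loop above assumes the JSON file holds an array of objects with 'no' and
# 'name' keys, e.g. (illustrative only): [{"no": 1, "name": "Stop A"}, ...]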
def convert_CRF_pred(path_write, name_write, list_pred):
    # smooth the CRF predictions: a token tagged '2' or '3' whose neighbours
    # are all '0'/'1' is reset to '0'; single-token predictions are forced to '0'
    list_write = list()

    for j in range(0, len(list_pred)):
        label = list_pred[j]
        convert_label = ''
        split_label = label.split('\t')
        for i in range(0, len(split_label)):
            token = split_label[i]
            if len(split_label) > 1:
                if i == 0:
                    if (token == '2') or (token == '3'):
                        next_token = split_label[i + 1]
                        if (next_token == '0') or (next_token == '1'):
                            convert_label += '0' + '\t'
                        else:
                            convert_label += token + '\t'
                    else:
                        convert_label += token + '\t'
                else:
                    if i == (len(split_label) - 1):
                        if (token == '2') or (token == '3'):
                            prev_token = split_label[i - 1]
                            if (prev_token == '0') or (prev_token == '1'):
                                convert_label += '0' + '\t'
                            else:
                                convert_label += token + '\t'
                        else:
                            convert_label += token + '\t'

                    else:
                        if (token == '2') or (token == '3'):
                            prev_token, next_token = split_label[i - 1], split_label[i + 1]
                            if ((prev_token == '0') or (prev_token == '1')) \
                                    and ((next_token == '0') or (next_token == '1')):
                                convert_label += '0' + '\t'
                            else:
                                convert_label += token + '\t'
                        else:
                            convert_label += token + '\t'
            else:
                convert_label += '0' + '\t'
        list_write.append(convert_label.strip())

    write_file(path_write, name_write, list_write)
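# Example of the smoothing rule in convert_CRF_pred: an isolated continuation
# tag gets cleared, e.g. the prediction '0\t2\t1' is rewritten as '0\t0\t1'.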
Example 26
def split_sentence_CRF(list_post, path_write, name):
    list_sent_split, list_sent_origin = list(), list()
    for i in range(0, len(list_post)):
        post = list_post[i].replace('"', "")
        split_post, sentence = post.split(), ""

        for value in split_post:
            sentence = sentence + value + "\t"
        # print sentence.strip()
        # print post
        # print '\n'
        list_sent_split.append(sentence.strip()), list_sent_split.append("\n")
        list_sent_origin.append(post), list_sent_origin.append("\n")
    print len(list_sent_split), len(list_sent_origin)
    write_file(path_write, name + "_CRF", list_sent_split)
    write_file(path_write, name + "_origin", list_sent_origin)
    return None
def detectEvent(path, name, name_write, list_event):
    loadText = load_file(path, name)
    port = PorterStemmer()

    list_write = []
    for text in loadText:
        split_text = text.strip().split('\t')
        if (len(split_text) == 2):
            print text
            events = eventRecg(port, split_text[1].strip().lower(), list_event)

            if (len(events) > 0):
                print split_text[0], '\t', events
                for event in events:
                    list_write.append(split_text[0] + '\t' + event)

    write_file(path, name_write, list_write)
def road_extract(path, name, list_road, list_road_original):
    list_extract = []
    cnt = 0
    with open(path + '/' + name) as f:
        for line in f:
            cnt += 1
            split_line = line.split('\t')
            list_index = match_road(split_line[1].lower(), list_road)  # match against the lowercased text
            if len(list_index) > 0:
                for index in list_index:
                    print (split_line[0] + '\t' + list_road_original[index])
                    list_extract.append(split_line[0] + '\t' + list_road_original[index])
            print (cnt)

    # write_file(path, 'posts_roads.csv', list_extract)
    # write_file(path, 'tweet_2015_filtering_roads.csv', list_extract)
    write_file(path, 'facebook_2015_BusNews_filtering_roads.csv', list_extract)
def bus_stop_services(path, name):
    # extract the list of stops served by each bus service
    with open(path + '/' + name) as data_file:    
        data = json.load(data_file)
    
#     print (data)
    list_write = []
    for service in data:
        list_service = data[str(service)]
#         print (str(service) + '\t' + str(list_service))
        
        for value in list_service:
            print (str(service) + '\t' + str(value))
            list_write.append(str(service) + '\t' + str(value))
            
    write_file(path, 'bus_stop_service', list_write)  # write the service-stop pairs to a csv file
Example 30
def filtering_json_facebook_ver2(path, name, name_write):
    with open(path + '/' + name) as data_file:
        data = json.load(data_file)
    cnt, list_write = 0, list()
    for element in data:
        from_data = element['from']
        # build the tab-separated line once, then both print and store it
        line = (element['id']
                + '\t' + from_data['name'].replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip()
                + '\t' + element['created_time']
                + '\t' + element['message'].replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip())
        print (line)
        list_write.append(line)
        cnt += 1
    print cnt
    write_file(path, name_write, list_write)