Ejemplo n.º 1
0
def _bow_svm(train_bow_file_name, train_label_file_name, test_bow_file_name,
             test_label_file_name):
    print 'loading file ...'
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(train_bow_file_name, False)
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(test_bow_file_name, False)
    print num_words, 'words'
    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list, num_words)
    print idfs

    print 'to sparse ...'
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list, train_word_cnts_list, num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list, test_word_cnts_list, num_words, idfs)
    # print train_cm[0]

    train_y = ioutils.load_labels_file(train_label_file_name)
    test_y = ioutils.load_labels_file(test_label_file_name)

    print 'training svm ...'
    # clf = svm.SVC(decision_function_shape='ovo')
    clf = svm.LinearSVC()
    clf.fit(train_cm, train_y)
    print 'done.'

    y_pred = clf.predict(test_cm)
    ftmp = open('e:/dc/20ng_data/tmp_labels.txt', 'wb')
    for i in xrange(len(y_pred)):
        ftmp.write(str(y_pred[i]) + '\t' + str(test_y[i]) + '\n')
    ftmp.close()
    print 'accuracy', accuracy_score(test_y, y_pred)
    print 'precision', precision_score(test_y, y_pred, average='macro')
    print 'recall', recall_score(test_y, y_pred, average='macro')
    print 'f1', f1_score(test_y, y_pred, average='macro')
Ejemplo n.º 2
0
def __bow_svm(train_bow_file_name, train_label_file_name, test_bow_file_name,
              test_label_file_name):
    print 'loading file ...'
    uint16_cnts = True
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(
        train_bow_file_name, uint16_cnts)
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(
        test_bow_file_name, uint16_cnts)
    print num_words, 'words'
    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list,
                           num_words)
    print idfs

    train_y = ioutils.load_labels_file(train_label_file_name)
    # print train_y[:100]
    print Counter(train_y)
    test_y = ioutils.load_labels_file(test_label_file_name)
    # print test_y[:100]
    print Counter(test_y)

    print 'to sparse ...'
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list,
                                      train_word_cnts_list, num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list,
                                     test_word_cnts_list, num_words, idfs)
    # print train_cm[0]

    print 'training svm ...'
    # clf = svm.SVC(decision_function_shape='ovo', kernel='poly', degree=2)
    clf = svm.SVC(decision_function_shape='ovo', kernel='linear')
    # clf = svm.LinearSVC()
    clf.fit(train_cm, train_y)
    print 'done.'

    y_pred = clf.predict(test_cm)
    print y_pred[:100]
    print Counter(y_pred)
    # ftmp = open('e:/data/emadr/20ng_data/tmp_labels.txt', 'wb')
    # for i in xrange(len(y_pred)):
    #     ftmp.write(str(y_pred[i]) + '\t' + str(test_y[i]) + '\n')
    # ftmp.close()
    # print 'accuracy', accuracy_score(test_y, y_pred)
    # print 'precision', precision_score(test_y, y_pred, average='macro')
    # print 'recall', recall_score(test_y, y_pred, average='macro')
    # print 'f1', f1_score(test_y, y_pred, average='macro')

    acc = accuracy_score(test_y, y_pred)
    prec = precision_score(test_y, y_pred, average='macro')
    recall = recall_score(test_y, y_pred, average='macro')
    f1 = f1_score(test_y, y_pred, average='macro')
    print 'accuracy', acc
    print 'precision', prec
    print 'recall', recall
    print 'macro f1', f1
    print '%f\t%f\t%f\t%f' % (acc, prec, recall, f1)

    return acc, prec, recall, f1
Ejemplo n.º 3
0
def __bow_lr(train_bow_file_name, train_label_file_name, test_bow_file_name,
             test_label_file_name):
    print 'loading file ...'
    uint16_cnts = True
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(
        train_bow_file_name, uint16_cnts)
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(
        test_bow_file_name, uint16_cnts)
    print num_words, 'words'
    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list,
                           num_words)
    print idfs

    train_y = ioutils.load_labels_file(train_label_file_name)
    # print train_y[:100]
    print Counter(train_y)
    test_y = ioutils.load_labels_file(test_label_file_name)
    # print test_y[:100]
    print Counter(test_y)

    print 'to sparse ...'
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list,
                                      train_word_cnts_list, num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list,
                                     test_word_cnts_list, num_words, idfs)
    # print train_cm[0]

    print 'training lr ...'
    lr = LogisticRegression(C=1000,
                            multi_class='multinomial',
                            solver='newton-cg')
    lr.fit(train_cm, train_y)
    print 'done.'

    y_pred = lr.predict(test_cm)
    # print y_pred[:100]
    print Counter(y_pred)
    # ftmp = open('e:/data/emadr/20ng_data/tmp_labels.txt', 'wb')
    # for i in xrange(len(y_pred)):
    #     ftmp.write(str(y_pred[i]) + '\t' + str(test_y[i]) + '\n')
    # ftmp.close()

    acc = accuracy_score(test_y, y_pred)
    prec = precision_score(test_y, y_pred, average='macro')
    recall = recall_score(test_y, y_pred, average='macro')
    f1 = f1_score(test_y, y_pred, average='macro')
    print 'accuracy', acc
    print 'precision', prec
    print 'recall', recall
    print 'macro f1', f1
    print '%f\t%f\t%f\t%f' % (acc, prec, recall, f1)
Ejemplo n.º 4
0
def __get_bow_vecs(dw_file):
    print 'loading file ...'
    word_indices_list, word_cnts_list, num_words = ioutils.load_bow_file(dw_file)
    print num_words, 'words'

    idfs = _get_idf_values(word_indices_list, word_cnts_list, num_words)

    print 'to sparse ...'
    bow_vecs = _word_cnts_to_bow_vecs(word_indices_list, word_cnts_list, num_words, idfs)
    return bow_vecs
Ejemplo n.º 5
0
def __get_bow_vecs(dw_file):
    print 'loading file ...'
    word_indices_list, word_cnts_list, num_words = ioutils.load_bow_file(
        dw_file)
    print num_words, 'words'

    idfs = _get_idf_values(word_indices_list, word_cnts_list, num_words)

    print 'to sparse ...'
    bow_vecs = _word_cnts_to_bow_vecs(word_indices_list, word_cnts_list,
                                      num_words, idfs)
    return bow_vecs
Ejemplo n.º 6
0
def close_words_of_docs():
    word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt'
    words = ioutils.load_words_dict_to_list(word_dict_file_name)

    bow_docs_file_name = 'e:/dc/20ng_bydate/all_docs_dw_net.bin'
    word_indices_list, word_cnts_list, num_words = ioutils.load_bow_file(bow_docs_file_name)
    print num_words, 'words'

    doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/doc_vec_cpp_100.bin'
    word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_cpp_100.bin'
    doc_vecs = ioutils.load_vec_list_file(doc_vec_file_name)
    word_vecs = ioutils.load_vec_list_file(word_vec_file_name)

    def show_close_words(doc_vec, word_indices):
        dist_list = list()
        for word_idx in word_indices:
            dist_list.append((np.dot(doc_vec, word_vecs[word_idx]), word_idx))
        dist_list.sort(key=lambda tup: tup[0])
        # closest_words = heapq.nlargest(10, dist_list, key=lambda tup: tup[0])
        for dist, idx in dist_list:
            print dist, words[idx]

    show_close_words(doc_vecs[0], word_indices_list[0])
Ejemplo n.º 7
0
def close_words_of_docs():
    word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt'
    words = ioutils.load_words_dict_to_list(word_dict_file_name)

    bow_docs_file_name = 'e:/dc/20ng_bydate/all_docs_dw_net.bin'
    word_indices_list, word_cnts_list, num_words = ioutils.load_bow_file(
        bow_docs_file_name)
    print num_words, 'words'

    doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/doc_vec_cpp_100.bin'
    word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_cpp_100.bin'
    doc_vecs = ioutils.load_vec_list_file(doc_vec_file_name)
    word_vecs = ioutils.load_vec_list_file(word_vec_file_name)

    def show_close_words(doc_vec, word_indices):
        dist_list = list()
        for word_idx in word_indices:
            dist_list.append((np.dot(doc_vec, word_vecs[word_idx]), word_idx))
        dist_list.sort(key=lambda tup: tup[0])
        # closest_words = heapq.nlargest(10, dist_list, key=lambda tup: tup[0])
        for dist, idx in dist_list:
            print dist, words[idx]

    show_close_words(doc_vecs[0], word_indices_list[0])