def __doc_classification(classifier,
                         train_vec_file,
                         train_label_file,
                         test_vec_file,
                         test_label_file,
                         vec_beg=0,
                         vec_end=-1):
    train_x = load_features(train_vec_file)
    train_y = load_labels_file(train_label_file)

    test_x = load_features(test_vec_file)
    test_y = load_labels_file(test_label_file)
    # print train_y[1000:1100]
    # print test_y[1000:1100]

    print train_x[0][50:60]

    if vec_beg != 0 or vec_end != -1:
        __truncate_vecs(train_x, vec_beg, vec_end)
        __truncate_vecs(test_x, vec_beg, vec_end)

    print 'training model ...'
    classifier.fit(train_x, train_y)
    print 'done.'

    y_pred = classifier.predict(test_x)
    get_scores(test_y, y_pred)
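
The __truncate_vecs helper is not shown on this page. A minimal sketch, assuming it mirrors the inline trunc_vecs helper of Example #13 further down (slice every vector to the [vec_beg, vec_end) dimensions, in place):

# Hypothetical sketch of __truncate_vecs; the real implementation is not
# listed here. Mirrors trunc_vecs from Example #13.
def __truncate_vecs(vec_list, vec_beg, vec_end):
    for idx, vec in enumerate(vec_list):
        if vec_end != -1:
            vec_list[idx] = vec[vec_beg:vec_end]
        else:
            vec_list[idx] = vec[vec_beg:]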
Example #2
def __emadr_vs_pvdbow():
    datadir = 'e:/data/emadr/20ng_bydate'
    doc_paths_file = os.path.join(datadir, 'all_doc_path_list.txt')
    dataset_split_labels_file = os.path.join(
        datadir, 'bindata/dataset-split-labels.bin')
    y_true_file = os.path.join(datadir, 'bindata/test-labels.bin')
    y_pred_pvdbow_file = os.path.join(datadir, 'bindata/ypred-pvdbow.bin')
    y_pred_emadr_file = os.path.join(datadir, 'bindata/ypred-emadr.bin')
    dst_file = os.path.join(datadir, 'example-candidates.txt')

    test_doc_paths = __get_test_doc_paths(doc_paths_file,
                                          dataset_split_labels_file)

    y_true = ioutils.load_labels_file(y_true_file)
    y_pvdbow = ioutils.load_labels_file(y_pred_pvdbow_file)
    y_emadr = ioutils.load_labels_file(y_pred_emadr_file)
    fout = open(dst_file, 'wb')
    cnt = 0
    for i, tup in enumerate(izip(y_true, y_pvdbow, y_emadr)):
        yt, y0, y1 = tup
        # keep documents that PV-DBOW misclassifies but EMADR gets right
        if yt != y0 and yt == y1:
            print test_doc_paths[i], doc_classes[yt], doc_classes[y0]
            doc_path = os.path.join(datadir, test_doc_paths[i][20:])
            fout.write('%s\t%s\t%s\n' %
                       (doc_path, doc_classes[yt], doc_classes[y0]))
            doc_text = __get_doc_text(doc_path)
            fout.write('%s\n' % doc_text)
            cnt += 1
    print cnt, len(y_true), float(cnt) / len(y_true)
    fout.close()
Example #3
def _bow_svm(train_bow_file_name, train_label_file_name, test_bow_file_name,
             test_label_file_name):
    print 'loading file ...'
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(train_bow_file_name, False)
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(test_bow_file_name, False)
    print num_words, 'words'
    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list, num_words)
    print idfs

    print 'to sparse ...'
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list, train_word_cnts_list, num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list, test_word_cnts_list, num_words, idfs)
    # print train_cm[0]

    train_y = ioutils.load_labels_file(train_label_file_name)
    test_y = ioutils.load_labels_file(test_label_file_name)

    print 'training svm ...'
    # clf = svm.SVC(decision_function_shape='ovo')
    clf = svm.LinearSVC()
    clf.fit(train_cm, train_y)
    print 'done.'

    y_pred = clf.predict(test_cm)
    ftmp = open('e:/dc/20ng_data/tmp_labels.txt', 'wb')
    for i in xrange(len(y_pred)):
        ftmp.write(str(y_pred[i]) + '\t' + str(test_y[i]) + '\n')
    ftmp.close()
    print 'accuracy', accuracy_score(test_y, y_pred)
    print 'precision', precision_score(test_y, y_pred, average='macro')
    print 'recall', recall_score(test_y, y_pred, average='macro')
    print 'f1', f1_score(test_y, y_pred, average='macro')
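
_get_idf_values is not among the listed snippets. A hedged sketch, assuming the standard idf(w) = log(N / df(w)) formula with add-one smoothing (the original may use a different variant):

# Assumed implementation of _get_idf_values: document frequencies over the
# training set turned into smoothed idf weights. word_cnts_list is unused
# because idf only depends on how many documents contain each word.
import numpy as np

def _get_idf_values(word_indices_list, word_cnts_list, num_words):
    num_docs = len(word_indices_list)
    dfs = np.zeros(num_words, np.float32)
    for word_indices in word_indices_list:
        for idx in word_indices:
            dfs[idx] += 1
    return np.log(float(num_docs) / (dfs + 1.0))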
Example #4
def __bow_svm(train_bow_file_name, train_label_file_name, test_bow_file_name,
              test_label_file_name):
    print 'loading file ...'
    uint16_cnts = True
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(
        train_bow_file_name, uint16_cnts)
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(
        test_bow_file_name, uint16_cnts)
    print num_words, 'words'
    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list,
                           num_words)
    print idfs

    train_y = ioutils.load_labels_file(train_label_file_name)
    # print train_y[:100]
    print Counter(train_y)
    test_y = ioutils.load_labels_file(test_label_file_name)
    # print test_y[:100]
    print Counter(test_y)

    print 'to sparse ...'
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list,
                                      train_word_cnts_list, num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list,
                                     test_word_cnts_list, num_words, idfs)
    # print train_cm[0]

    print 'training svm ...'
    # clf = svm.SVC(decision_function_shape='ovo', kernel='poly', degree=2)
    clf = svm.SVC(decision_function_shape='ovo', kernel='linear')
    # clf = svm.LinearSVC()
    clf.fit(train_cm, train_y)
    print 'done.'

    y_pred = clf.predict(test_cm)
    print y_pred[:100]
    print Counter(y_pred)
    # ftmp = open('e:/data/emadr/20ng_data/tmp_labels.txt', 'wb')
    # for i in xrange(len(y_pred)):
    #     ftmp.write(str(y_pred[i]) + '\t' + str(test_y[i]) + '\n')
    # ftmp.close()
    # print 'accuracy', accuracy_score(test_y, y_pred)
    # print 'precision', precision_score(test_y, y_pred, average='macro')
    # print 'recall', recall_score(test_y, y_pred, average='macro')
    # print 'f1', f1_score(test_y, y_pred, average='macro')

    acc = accuracy_score(test_y, y_pred)
    prec = precision_score(test_y, y_pred, average='macro')
    recall = recall_score(test_y, y_pred, average='macro')
    f1 = f1_score(test_y, y_pred, average='macro')
    print 'accuracy', acc
    print 'precision', prec
    print 'recall', recall
    print 'macro f1', f1
    print '%f\t%f\t%f\t%f' % (acc, prec, recall, f1)

    return acc, prec, recall, f1
Example #5
def __bow_lr(train_bow_file_name, train_label_file_name, test_bow_file_name,
             test_label_file_name):
    print 'loading file ...'
    uint16_cnts = True
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(
        train_bow_file_name, uint16_cnts)
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(
        test_bow_file_name, uint16_cnts)
    print num_words, 'words'
    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list,
                           num_words)
    print idfs

    train_y = ioutils.load_labels_file(train_label_file_name)
    # print train_y[:100]
    print Counter(train_y)
    test_y = ioutils.load_labels_file(test_label_file_name)
    # print test_y[:100]
    print Counter(test_y)

    print 'to sparse ...'
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list,
                                      train_word_cnts_list, num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list,
                                     test_word_cnts_list, num_words, idfs)
    # print train_cm[0]

    print 'training lr ...'
    lr = LogisticRegression(C=1000,
                            multi_class='multinomial',
                            solver='newton-cg')
    lr.fit(train_cm, train_y)
    print 'done.'

    y_pred = lr.predict(test_cm)
    # print y_pred[:100]
    print Counter(y_pred)
    # ftmp = open('e:/data/emadr/20ng_data/tmp_labels.txt', 'wb')
    # for i in xrange(len(y_pred)):
    #     ftmp.write(str(y_pred[i]) + '\t' + str(test_y[i]) + '\n')
    # ftmp.close()

    acc = accuracy_score(test_y, y_pred)
    prec = precision_score(test_y, y_pred, average='macro')
    recall = recall_score(test_y, y_pred, average='macro')
    f1 = f1_score(test_y, y_pred, average='macro')
    print 'accuracy', acc
    print 'precision', prec
    print 'recall', recall
    print 'macro f1', f1
    print '%f\t%f\t%f\t%f' % (acc, prec, recall, f1)
Example #6
def __test():
    doc_labels_file = 'e:/data/emadr/nyt-world-full/processed/doc-labels.bin'
    data_split_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/data-split-labels.bin'
    train_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/train-labels.bin'
    test_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/test-labels.bin'

    doc_labels = load_labels_file(doc_labels_file)
    print len(doc_labels), doc_labels[:20]
    split_labels = load_labels_file(data_split_labels_file)
    print len(split_labels), split_labels[:20]
    train_labels = load_labels_file(train_labels_file)
    print len(train_labels), train_labels[:20]
    test_labels = load_labels_file(test_labels_file)
    print len(test_labels), test_labels[:20]
Example #7
def cluster_nyt():
    num_clusters_list = [5, 10, 15, 20]
    method = 'RSM'

    result_file = 'd:/documents/lab/paper-data/plot/%s-results-ri.csv' % method.lower()

    labels_file_name = 'e:/dc/nyt-world-full/processed/test/doc-labels.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/de-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/de-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/glove-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw2-vecs-ner.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw2-vecs-ner-200.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw4-vecs-015.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw5-vecs-ner.bin'
    doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/rsm-vecs-20.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/drbm-vecs-30.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/pvdm-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/pvdbow-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/nvdm-nyt.bin'

    # doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/test-dedw-vecs.bin'
    # labels_file_name = 'e:/dc/20ng_bydate/test_labels.bin'

    perf_list = list()
    # for num_clusters in [5, 10, 15, 20]:
    vec_list = ioutils.load_vec_list_file(doc_vec_file_name)
    labels = ioutils.load_labels_file(labels_file_name)
    for num_clusters in num_clusters_list:
        print '%d clusters' % num_clusters
        # nmi_score, purity_score, ri_score = clustering(doc_vec_file_name, labels_file_name, num_clusters)
        nmi_score, purity_score, ri_score = cluster_and_eval(vec_list, labels, num_clusters)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
        # break
    write_clustering_perf_to_csv(method, perf_list, result_file)
Example #8
def lda():
    num_clusters_list = [5, 10, 15, 20]
    result_file = 'd:/documents/lab/paper-data/plot/lda-results-ri.csv'

    text_doc_file = 'e:/dc/nyt-world-full/processed/docs_tokenized_lc.txt'
    dict_file = 'e:/dc/nyt-world-full/processed/lda/all-docs.dict'
    mm_file = 'e:/dc/nyt-world-full/processed/lda/all-docs.mm'
    lda_model_file = 'e:/dc/nyt-world-full/processed/lda/lda-model'
    gold_label_file = 'e:/dc/nyt-world-full/processed/test/doc-labels.bin'

    # __text_file_to_mm_corpus(text_doc_file, dict_file, mm_file)

    perf_list = list()
    gold_labels = ioutils.load_labels_file(gold_label_file)
    word_dict = gensim.corpora.Dictionary.load(dict_file)
    mm_corpus = gensim.corpora.MmCorpus(mm_file)
    for num_clusters in num_clusters_list:
        print num_clusters, 'clusters'
        lda_model = gensim.models.ldamodel.LdaModel(mm_corpus, id2word=word_dict, num_topics=num_clusters)
        lda_model_file = 'e:/dc/nyt-world-full/processed/lda/lda-model-%d' % num_clusters
        lda_model.save(lda_model_file)

        nmi_score, purity_score, ri_score = __eval_lda_clustering(lda_model, mm_corpus, gold_labels)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))

    write_clustering_perf_to_csv('LDA', perf_list, result_file)
Example #9
def __lda_clustering_nyt():
    num_clusters_list = [5, 10, 15, 20]
    min_occurrence = 30
    datadir = 'e:/data/emadr/nyt-less-docs/world/'
    result_file = 'd:/documents/lab/paper-data/plot/lda-results-ri.csv'

    dict_file = os.path.join(datadir, 'lda/all-docs-%d.dict' % min_occurrence)
    mm_file = os.path.join(datadir, 'lda/all-docs-%d.mm' % min_occurrence)
    gold_label_file = 'e:/data/emadr/nyt-less-docs/world/bindata/test-labels.bin'

    # __text_file_to_mm_corpus(text_doc_file, dict_file, mm_file)

    perf_list = list()
    gold_labels = ioutils.load_labels_file(gold_label_file)
    word_dict = gensim.corpora.Dictionary.load(dict_file)
    mm_corpus = gensim.corpora.MmCorpus(mm_file)
    for num_clusters in num_clusters_list:
        print num_clusters, 'clusters'
        lda_model = gensim.models.ldamodel.LdaModel(mm_corpus,
                                                    id2word=word_dict,
                                                    num_topics=num_clusters)
        lda_model_file = 'e:/dc/nyt-world-full/processed/lda/lda-model-%d' % num_clusters
        lda_model.save(lda_model_file)

        nmi_score, purity_score, ri_score = __eval_lda_clustering(
            lda_model, mm_corpus, gold_labels)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))

    write_clustering_perf_to_csv('LDA', perf_list, result_file)
Example #10
def __split_vecs(all_vecs_file_name, split_labels_file_name,
                 dst_train_vecs_file_name, dst_test_vecs_file_name):
    all_vec_list = ioutils.load_vec_list_file(all_vecs_file_name)
    split_labels = ioutils.load_labels_file(split_labels_file_name)

    train_vec_list = list()
    test_vec_list = list()
    for vec, split_label in zip(all_vec_list, split_labels):
        # vec = np.random.uniform(0, 1, len(vec)).astype(np.float32)
        # print split_label
        if split_label == 1:
            test_vec_list.append(vec)
        else:
            train_vec_list.append(vec)

    print len(train_vec_list), 'training samples'
    print len(test_vec_list), 'testing samples'

    def save_vecs(vec_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout)
        for cur_vec in vec_list:
            cur_vec.tofile(fout)
        fout.close()

    save_vecs(train_vec_list, dst_train_vecs_file_name)
    save_vecs(test_vec_list, dst_test_vecs_file_name)
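
save_vecs writes an int32 [count, dimension] header followed by the raw vectors. A matching reader, as an illustrative sketch only (the page does not show ioutils.load_vec_list_file; float32 vectors are assumed, consistent with the np.float32 casts elsewhere in these examples):

# Illustrative counterpart to save_vecs: read the int32 [count, dim]
# header, then count float32 vectors of length dim.
import numpy as np

def load_vecs(file_name):
    fin = open(file_name, 'rb')
    num_vecs, dim = np.fromfile(fin, np.int32, 2)
    vec_list = [np.fromfile(fin, np.float32, dim) for _ in xrange(num_vecs)]
    fin.close()
    return vec_list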
Example #11
def __gen_class_labels():
    doc_paths_file = 'e:/data/emadr/20ng_bydate/docpaths.txt'
    dataset_labels_file = 'e:/data/emadr/20ng_bydate/bindata/dataset-split-labels.bin'
    all_docs_class_labels_file = 'e:/data/emadr/20ng_bydate/bindata/labels.bin'
    training_class_labels_file = 'e:/data/emadr/20ng_bydate/bindata/train-labels.bin'
    validation_class_labels_file = 'e:/data/emadr/20ng_bydate/bindata/val-labels.bin'
    testing_class_labels_file = 'e:/data/emadr/20ng_bydate/bindata/test-labels.bin'

    fin = open(doc_paths_file, 'r')
    docpaths = list()
    for line in fin:
        docpaths.append(line.strip())
    fin.close()

    all_labels, train_labels, val_labels, test_labels = list(), list(), list(), list()
    dataset_split_labels = ioutils.load_labels_file(dataset_labels_file)
    for dataset_split_label, docpath in izip(dataset_split_labels, docpaths):
        class_label_idx = 0
        for lidx, cl in enumerate(doc_classes):
            if cl in docpath:
                class_label_idx = lidx
        print dataset_split_label, docpath, class_label_idx
        all_labels.append(class_label_idx)
        if dataset_split_label == 0:
            train_labels.append(class_label_idx)
        elif dataset_split_label == 1:
            val_labels.append(class_label_idx)
        else:
            test_labels.append(class_label_idx)

    ioutils.save_labels(all_labels, all_docs_class_labels_file)
    ioutils.save_labels(train_labels, training_class_labels_file)
    ioutils.save_labels(val_labels, validation_class_labels_file)
    ioutils.save_labels(test_labels, testing_class_labels_file)
Example #12
def split_vecs(all_vecs_file_name,
               split_labels_file_name,
               dst_train_vecs_file_name,
               dst_test_vecs_file_name,
               train_label=0,
               test_label=1):
    all_vec_list = ioutils.load_vec_list_file(all_vecs_file_name)
    split_labels = ioutils.load_labels_file(split_labels_file_name)

    train_vec_list = list()
    test_vec_list = list()
    for vec, split_label in zip(all_vec_list, split_labels):
        # vec = np.random.uniform(0, 1, len(vec)).astype(np.float32)
        # print split_label
        if split_label == test_label:
            test_vec_list.append(vec)
        elif split_label == train_label:
            train_vec_list.append(vec)

    print len(train_vec_list), 'training samples'
    print len(test_vec_list), 'testing samples'

    def save_vecs(vec_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout)
        for cur_vec in vec_list:
            cur_vec.tofile(fout)
        fout.close()

    save_vecs(train_vec_list, dst_train_vecs_file_name)
    save_vecs(test_vec_list, dst_test_vecs_file_name)
Example #13
def doc_classification_svm(train_vec_file,
                           train_label_file,
                           test_vec_file,
                           vec_beg=0,
                           vec_end=-1):
    train_x = load_features(train_vec_file)
    train_y = load_labels_file(train_label_file)

    test_x = load_features(test_vec_file)
    # print train_y[1000:1100]
    # print test_y[1000:1100]

    print train_x[0][50:60]

    def trunc_vecs(vec_list):
        for idx, vec in enumerate(vec_list):
            if vec_end != -1:
                vec_list[idx] = vec[vec_beg:vec_end]
            else:
                vec_list[idx] = vec[vec_beg:]

    if vec_beg != 0 or vec_end != -1:
        trunc_vecs(train_x)
        trunc_vecs(test_x)

    print 'training svm ...'
    clf = svm.SVC(decision_function_shape='ovo')
    # clf = svm.SVC(decision_function_shape='ovo', kernel='linear')
    # clf = svm.LinearSVC(dual=False)
    clf.fit(train_x, train_y)
    print 'done.'

    y_pred = clf.predict(test_x)
    return y_pred
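
A possible way to combine this with get_scores_label_file from the final example on this page (the file names below are placeholders, not paths from the original code):

# Hypothetical usage; replace the placeholder paths with real files.
y_pred = doc_classification_svm('train-vecs.bin', 'train-labels.bin',
                                'test-vecs.bin')
get_scores_label_file('test-labels.bin', y_pred)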
Example #14
def __cluster_nyt():
    # num_clusters_list = [5, 10, 15, 20]
    num_clusters_list = [10, 15, 20]
    # num_clusters_list = [5]
    method = 'RSM'

    datadir = 'e:/data/emadr/nyt-less-docs/world'
    result_file = 'd:/documents/lab/paper-data/plot/%s-results-ri.csv' % method.lower()

    labels_file_name = os.path.join(datadir, 'bindata/test-labels.bin')
    # doc_vec_file_name = os.path.join(datadir, 'vecs/test-dedw-vecs.bin')
    doc_vec_file_name = os.path.join(datadir, 'bindata/test-pvdbow-vecs.bin')
    # doc_vec_file_name = os.path.join(datadir, 'rsm/test-rsm-vecs.bin')

    perf_list = list()
    # for num_clusters in [5, 10, 15, 20]:
    vec_list = ioutils.load_vec_list_file(doc_vec_file_name)
    labels = ioutils.load_labels_file(labels_file_name)
    for num_clusters in num_clusters_list:
        print '%d clusters' % num_clusters
        # nmi_score, purity_score, ri_score = clustering(doc_vec_file_name, labels_file_name, num_clusters)
        nmi_score, purity_score, ri_score = cluster_and_eval(
            vec_list, labels, num_clusters)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
Example #15
def __gen_data_split_labels_tvt():
    # doc_labels_file = 'e:/data/emadr/nyt-world-full/processed/doc-labels.bin'
    # data_split_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/data-split-labels.bin'
    # train_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/train-labels.bin'
    # val_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/val-labels.bin'
    # test_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/test-labels.bin'

    main_class = 'business'
    doc_labels_file = 'e:/data/emadr/nyt-less-docs/%s/bindata/labels.bin' % main_class
    data_split_labels_file = 'e:/data/emadr/nyt-less-docs/%s/bindata/dataset-split-labels.bin' % main_class
    train_labels_file = 'e:/data/emadr/nyt-less-docs/%s/bindata/train-labels.bin' % main_class
    val_labels_file = 'e:/data/emadr/nyt-less-docs/%s/bindata/val-labels.bin' % main_class
    test_labels_file = 'e:/data/emadr/nyt-less-docs/%s/bindata/test-labels.bin' % main_class

    all_labels = load_labels_file(doc_labels_file)
    num_labels = len(all_labels)

    # randomly assign each document to train (0), val (1) or test (2);
    # cast to int32 so the dtype matches the int32 count header below
    split_labels = np.random.randint(0, 3, num_labels).astype(np.int32)
    fout_data_split = open(data_split_labels_file, 'wb')
    np.asarray([num_labels], np.int32).tofile(fout_data_split)
    split_labels.tofile(fout_data_split)
    fout_data_split.close()

    def write_labels(cur_labels, filename):
        fout = open(filename, 'wb')
        np.asarray([len(cur_labels)], dtype=np.int32).tofile(fout)
        np.asarray(cur_labels, dtype=np.int32).tofile(fout)
        fout.close()

    train_labels = [cl for cl, sl in izip(all_labels, split_labels) if sl == 0]
    val_labels = [cl for cl, sl in izip(all_labels, split_labels) if sl == 1]
    test_labels = [cl for cl, sl in izip(all_labels, split_labels) if sl == 2]
    write_labels(train_labels, train_labels_file)
    write_labels(val_labels, val_labels_file)
    write_labels(test_labels, test_labels_file)
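
write_labels stores an int32 count followed by the int32 labels, so load_labels_file presumably reads the same layout. A minimal sketch under that assumption:

# Minimal reader for the label format produced by write_labels above:
# an int32 count, then that many int32 label values.
import numpy as np

def load_labels(filename):
    fin = open(filename, 'rb')
    num_labels = np.fromfile(fin, np.int32, 1)[0]
    labels = np.fromfile(fin, np.int32, num_labels)
    fin.close()
    return list(labels)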
Example #16
def __gen_lda_features(data_split_labels_file, mm_file, lda_model_file,
                       dst_train_vecs_file, dst_val_vecs_file,
                       dst_test_vecs_file):
    data_split_labels = ioutils.load_labels_file(data_split_labels_file)
    lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_file)
    mm_corpus = gensim.corpora.MmCorpus(mm_file)
    train_vecs, val_vecs, test_vecs = list(), list(), list()
    for i, (l, doc) in enumerate(izip(data_split_labels, mm_corpus)):
        topic_dist = lda_model[doc]
        vec = np.zeros(lda_model.num_topics, np.float32)
        for tup in topic_dist:
            vec[tup[0]] = tup[1]

        if l == 0:
            train_vecs.append(vec)
        elif l == 1:
            val_vecs.append(vec)
        else:
            test_vecs.append(vec)
        # print topic_dist
        # print vec
        if i % 1000 == 0:
            print i
            # break
    # print train_vecs[:5]
    __save_vecs(train_vecs, dst_train_vecs_file)
    __save_vecs(val_vecs, dst_val_vecs_file)
    __save_vecs(test_vecs, dst_test_vecs_file)
Example #17
def __get_test_doc_paths(doc_paths_file, dataset_split_labels_file):
    test_doc_paths = list()
    doc_paths = __load_doc_paths(doc_paths_file)
    split_labels = ioutils.load_labels_file(dataset_split_labels_file)
    assert len(doc_paths) == len(split_labels)
    for doc_path, sl in izip(doc_paths, split_labels):
        if sl == 2:
            test_doc_paths.append(doc_path)
    return test_doc_paths
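
__load_doc_paths is not listed on this page; a plausible sketch, mirroring the path-reading loop of Example #11 (one path per line, whitespace stripped):

# Plausible sketch of __load_doc_paths; the actual helper is not shown.
def __load_doc_paths(doc_paths_file):
    with open(doc_paths_file, 'r') as fin:
        return [line.strip() for line in fin]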
Example #18
def bow_clustering():
    num_clusters_list = [5, 10, 15, 20]
    dw_file = 'e:/dc/nyt-world-full/processed/bin/dw-30.bin'
    gold_labels_file = 'e:/dc/nyt-world-full/processed/test/doc-labels.bin'
    result_file = 'd:/documents/lab/paper-data/plot/bow-results-ri.csv'

    perf_list = list()
    gold_labels = ioutils.load_labels_file(gold_labels_file)
    bow_vecs = __get_bow_vecs(dw_file)
    for num_clusters in num_clusters_list:
        print num_clusters, 'clusters'
Example #19
def __eval_lda_clustering_20ng():
    text_doc_file = 'e:/dc/20ng_bydate/twe/docs-nl.txt'
    dict_file = 'e:/dc/20ng_bydate/lda/all-docs.dict'
    mm_file = 'e:/dc/20ng_bydate/lda/all-docs.mm'
    lda_model_file = 'e:/dc/20ng_bydate/lda/lda-model'

    dataset_label_file = 'e:/dc/20ng_bydate/doc_split_labels.bin'
    test_label_file = 'e:/dc/20ng_bydate/test_labels.bin'

    __text_file_to_mm_corpus(text_doc_file, dict_file, mm_file)

    __train_lda_model(dict_file, mm_file, lda_model_file)

    dataset_labels = ioutils.load_labels_file(dataset_label_file)
    lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_file)
    mm_corpus = gensim.corpora.MmCorpus(mm_file)
    sys_labels = list()
    for i, doc in enumerate(mm_corpus):
        # skip training documents; only the test split is clustered
        if dataset_labels[i] == 0:
            continue

        topic_dist = lda_model[doc]
        # print topic_dist
        # pick the highest-probability topic as the cluster label
        cluster_idx = 0
        max_dist = 0
        for tup in topic_dist:
            if tup[1] > max_dist:
                cluster_idx = tup[0]
                max_dist = tup[1]
        sys_labels.append(cluster_idx)
        if len(sys_labels) % 1000 == 0:
            print len(sys_labels)
        # if i > 10:
        #     break
    print len(sys_labels)
    gold_labels = ioutils.load_labels_file(test_label_file)
    print len(gold_labels)
    print normalized_mutual_info_score(gold_labels, sys_labels)
    print cluster_accuracy(gold_labels, sys_labels)
Example #20
def __cluster_20ng():
    num_clusters = 20
    labels_file = 'e:/data/emadr/20ng_bydate/bindata/test-labels.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-dedw-vecs.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/vecs/dew-vecs-0_8-50.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/vecs/dew-vecs-cluster-0_15-50.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-pvdbow-vecs.bin'
    doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-pvdm-vecs.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/rsm/test-rsm-vecs.bin'

    vec_list = ioutils.load_vec_list_file(doc_vec_file)
    labels = ioutils.load_labels_file(labels_file)
    nmi_score, purity_score, ri_score = cluster_and_eval(
        vec_list, labels, num_clusters)
    print '%f\t%f\t%f' % (nmi_score, purity_score, ri_score)
Example #21
def split_docs_text_file_by_dataset_labels(doc_text_file, dataset_split_file,
                                           dst_train_doc_text_file,
                                           dst_test_doc_text_file):
    data_split_labels = load_labels_file(dataset_split_file)
    print data_split_labels[:10]
    print len(data_split_labels)
    fin = open(doc_text_file, 'r')
    ftrain = open(dst_train_doc_text_file, 'wb')
    ftest = open(dst_test_doc_text_file, 'wb')
    for l, line in izip(data_split_labels, fin):
        if l == 0:
            ftrain.write(line)
        else:
            ftest.write(line)
    fin.close()
    ftrain.close()
    ftest.close()
Example #22
def __bow_clustering():
    # num_clusters_list = [5, 10, 15, 20]
    num_clusters_list = [5]

    dw_file = 'e:/data/emadr/nyt-less-docs/world/bindata/dw-test-30.bin'
    gold_labels_file = 'e:/data/emadr/nyt-less-docs/world/bindata/test-labels.bin'
    result_file = 'd:/documents/lab/paper-data/plot/bow-results-ri-bak.csv'

    # dw_file = 'e:/data/emadr/20ng_bydate/bindata/dw-test-30.bin'
    # gold_labels_file = 'e:/data/emadr/20ng_bydate/bindata/test-labels.bin'
    # result_file = 'd:/documents/lab/paper-data/plot/bow-results-20ng.csv'

    perf_list = list()
    gold_labels = ioutils.load_labels_file(gold_labels_file)
    print len(gold_labels), gold_labels[:10]
    bow_vecs = __get_bow_vecs(dw_file)
    print bow_vecs.shape
    for num_clusters in num_clusters_list:
        print num_clusters, 'clusters'
        nmi_score, purity_score, ri_score = bow_kmeans(bow_vecs, gold_labels,
                                                       num_clusters)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
Example #23
def split_vectors(all_vec_file_name, all_labels_file_name,
                  dst_train_vec_file_name, dst_train_labels_file_name,
                  dst_test_vec_file_name, dst_test_labels_file_name):
    all_vec_list = ioutils.load_vec_list_file(all_vec_file_name)
    all_labels = ioutils.load_labels_file(all_labels_file_name)

    train_vec_list = list()
    train_labels = list()
    test_vec_list = list()
    test_labels = list()
    for vec, label in zip(all_vec_list, all_labels):
        # hold out roughly one tenth of the samples for testing
        rand_val = random.randint(1, 10)
        if rand_val == 1:
            test_vec_list.append(vec)
            test_labels.append(label)
        else:
            train_vec_list.append(vec)
            train_labels.append(label)

    print len(train_labels), 'training samples'
    print len(test_labels), 'testing samples'

    def save_vecs(vec_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout)
        for cur_vec in vec_list:
            cur_vec.tofile(fout)
        fout.close()

    def save_labels(labels_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(labels_list)], np.int32).tofile(fout)
        np.asarray(labels_list, np.int32).tofile(fout)
        fout.close()

    save_vecs(train_vec_list, dst_train_vec_file_name)
    save_labels(train_labels, dst_train_labels_file_name)
    save_vecs(test_vec_list, dst_test_vec_file_name)
    save_labels(test_labels, dst_test_labels_file_name)
Example #24
def __lda_clustering():
    num_topics = 20
    min_occurrence = 30
    # datadir = 'e:/data/emadr/20ng_bydate/'
    # labels_file = os.path.join(datadir, 'bindata/test-labels.bin')
    # topic_vecs_file = os.path.join(datadir, 'lda/test-vecs-%d-%d.bin' % (num_topics, min_occurrence))
    datadir = 'e:/data/emadr/nyt-less-docs/world'
    labels_file = os.path.join(datadir, 'bindata/test-labels.bin')
    topic_vecs_file = os.path.join(
        datadir, 'lda/test-vecs-%d-%d.bin' % (num_topics, min_occurrence))

    topic_vecs = ioutils.load_vec_list_file(topic_vecs_file)
    gold_labels = ioutils.load_labels_file(labels_file)
    sys_labels = list()
    for i, topic_vec in enumerate(topic_vecs):
        # assign the document to its highest-weight topic
        cluster_idx = 0
        max_dist = 0
        for j, v in enumerate(topic_vec):
            if v > max_dist:
                cluster_idx = j
                max_dist = v
        # print cluster_idx, max_dist
        sys_labels.append(cluster_idx)
        if len(sys_labels) % 5000 == 0:
            print len(sys_labels)

    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    # ri_score = rand_index(gold_labels, sys_labels)
    ri_score = 0

    print 'NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score,
                                                 ri_score)
    # print 'Accuracy: %f' % cluster_accuracy(labels, model.labels_)

    print '%f\t%f\t%f' % (nmi_score, purity_score, ri_score)
Example #25
def get_scores_label_file(true_label_file, y_pred):
    y_true = load_labels_file(true_label_file)
    return get_scores(y_true, y_pred)
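
get_scores itself is not among these examples. A sketch consistent with the accuracy and macro-averaged precision/recall/F1 reporting of Examples #3 and #4 (the actual function may print or return something different):

# Assumed get_scores, mirroring the metric reporting in Examples #3 and #4.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_scores(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro')
    rec = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    print 'accuracy', acc
    print 'precision', prec
    print 'recall', rec
    print 'macro f1', f1
    return acc, prec, rec, f1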