def main():
    """Merge train and validation doc-code JSON files into one file.

    Reads both mappings, overlays the validation entries onto the train
    entries (val wins on duplicate keys), and writes the result to
    ``<out_dir>/new_<basename of train file>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes',
                        type=str,
                        help='path to the train doc code file')
    parser.add_argument('val_doc_codes',
                        type=str,
                        help='path to the valid doc code file')
    parser.add_argument('out_dir', type=str, help='path to the output dir')
    args = parser.parse_args()

    train_doc_codes_path = args.train_doc_codes
    train_doc_codes = load_json(train_doc_codes_path)
    val_doc_codes = load_json(args.val_doc_codes)

    # val entries overwrite any duplicate keys from the train set
    train_doc_codes.update(val_doc_codes)

    out_dir = args.out_dir
    dump_json(
        train_doc_codes,
        os.path.join(out_dir, 'new_' + os.path.basename(train_doc_codes_path)))
# Ejemplo n.º 2
# 0
def train(args):
    vocab = load_json(args.vocab)
    # import pdb;pdb.set_trace()
    # load corpus
    corpus = CorpusIter20News(args.corpus[0],
                              recursive=True,
                              stem=True,
                              with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    corpus_iter = lambda: (TaggedDocument(
        [word for word in sentence if word in vocab], tag)
                           for sentence, tag in corpus)

    d2v = MyDoc2Vec(args.n_dim, window=args.window_size, \
        negative=args.negative, epoches=args.n_epoch, dm_concat=1)

    start = timeit.default_timer()
    d2v.train(corpus_iter)
    print 'runtime: %ss' % (timeit.default_timer() - start)

    save_doc2vec(d2v.model, args.save_model)
    import pdb
    pdb.set_trace()
def get_words(args):
    corpus = load_corpus(args.input_corpus)
    filename_corpus_dict = corpus['docs']
    vocab_dict = corpus['vocab']
    
    # we have to revort the dict
    dictionary = dict((v,k) for k, v in vocab_dict.iteritems())

    filename_label_dict = load_json(args.input_label)

    print 'Finish loading data'

    label_vocab_dict = {}

    # start counting words
    for filename in filename_corpus_dict:
        vocab_num_dict = filename_corpus_dict[filename]
        label = filename_label_dict[filename]
        try:
            label_vocab_dict[label]
        except:
            label_vocab_dict[label] = {}
        for vocab in vocab_num_dict:
            num = vocab_num_dict[vocab]
            # print 'If num is a int? : ', isinstance(num, int)
            try:
                label_vocab_dict[label][vocab] += num
            except:
                label_vocab_dict[label][vocab] = num

    print 'Finish counting word frequence'

    label_topword_dict = {}
    label_num = len(label_topword_dict)
    print 'Label num is ', label_num
    topn = args.topn
    for label in label_vocab_dict:
        vocab_num_dict = label_vocab_dict[label]
        label_topword_dict[label] = sorted(vocab_num_dict, key = vocab_num_dict.__getitem__, reverse = True)[:topn]

    print 'Finish sorting the top n word'

    dump_json(label_topword_dict, args.output_json)
    print 'Finish write the json file'

    for label in label_topword_dict:
        filename_o = args.output_dir + 'label-' + str(label) + '.txt'
        print 'filename =' , filename_o
        file_o = open(filename_o, 'w')
        for word_index in label_topword_dict[label]:
            # print 'Is word_index a int:', isinstance(word_index, int)
            text = dictionary[int(word_index)]
            text += '\n'
            file_o.write(text.encode('utf-8'))
        file_o.close()
    print 'Finish writing files!'
# Ejemplo n.º 4
# 0
def main():
    """CLI entry point: extract labels for a constructed corpus.

    Requires a label file, a corpus file, and an output path; delegates the
    actual work to ``extract_labels``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--label', type=str, required=True,
                        help='path to the input label file')
    parser.add_argument('-c', '--corpus', type=str, required=True,
                        help='path to the constructed corpus file')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='path to the output file')
    args = parser.parse_args()

    corpus_docs = load_json(args.corpus)['docs']
    labels = load_json(args.label)
    extract_labels(corpus_docs, labels, args.output)
# Ejemplo n.º 5
# 0
def main():
    """CLI entry point: convert an XML corpus dir to text.

    Optionally restricts output to documents listed in a whitelist JSON file.
    """
    # BUG FIX: the original line read `parser T= argparse.ArgumentParser()`,
    # which is a SyntaxError (stray `T`).
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, required=True, help='path to the input corpus dir')
    parser.add_argument('-o', '--output', type=str, default='./', help='path to the output dir')
    parser.add_argument('-wl', '--whitelist', type=str, help='path to the whitelist file')
    args = parser.parse_args()

    if args.whitelist:
        white_list = load_json(args.whitelist)
    else:
        white_list = None

    xml2text(args.input, args.output, white_list)
# Ejemplo n.º 6
# 0
def test(args):
    """Infer doc codes with a trained doc2vec model and dump them as JSON.

    Loads the vocab and corpus, filters sentences to in-vocab words, runs
    ``predict`` with the loaded model, and writes the codes to ``args.output``.
    """
    vocab = load_json(args.vocab)
    # load corpus
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    # keep only in-vocab words; each document keeps its tag for prediction
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag) for sentence, tag in corpus)

    d2v = load_doc2vec(args.load_model)
    doc_codes = predict(d2v, corpus_iter)
    dump_json(doc_codes, args.output)
# Ejemplo n.º 7
# 0
def main():
    """Split a BoW corpus into pickled batches of vectors and class indices.

    Converts each doc to a dense BoW vector, writes the attribute (word)
    list, the batch boundary list, one ``bow_batch_<end>.p`` pickle per
    batch, and one ``class_indices_batch_<end>.p`` pickle per batch.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus', type=str, help='path to the corpus file')
    parser.add_argument('labels', type=str, help='path to the labels file')
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=100,
                        help='batch size (default 100)')
    parser.add_argument('out_dir', type=str, help='path to the output dir')
    args = parser.parse_args()

    corpus = load_corpus(args.corpus)
    doc_labels = load_json(args.labels)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)
    doc_names = docs.keys()
    X_docs = [doc2vec(x, n_vocab) for x in docs.values()]

    out_dir = args.out_dir
    # attributes, sorted by vocab index so word order matches the BoW columns.
    # BUG FIX: the original used key=lambda d: [1] (a constant list), which
    # left the items unsorted; d[1] sorts by each word's vocab index.
    attrs = zip(*sorted(vocab.items(), key=lambda d: d[1]))[0]
    dump_pickle(attrs, os.path.join(out_dir, 'attributes.p'))

    # batches: cumulative end indices, last batch absorbs the remainder
    bs = args.batch_size
    batches = [bs * (x + 1) for x in range(int(len(docs) / bs) - 1)]
    batches.append(len(docs))
    dump_pickle(batches, os.path.join(out_dir, 'batches.p'))

    # bow_batch_x
    for i in range(len(batches)):
        dump_pickle(X_docs[batches[i - 1] if i > 0 else 0:batches[i]],
                    os.path.join(out_dir, 'bow_batch_%s.p' % batches[i]))

    # # docs_names_batch_x
    # for i in range(len(batches)):
    #     dump_pickle(doc_names[batches[i - 1] if i > 0 else 0: batches[i]], os.path.join(out_dir, 'docs_names_batch_%s.p' % batches[i]))

    # class_indices_batch_x
    for i in range(len(batches)):
        data = [
            doc_labels[doc_names[idx]]
            for idx in range(batches[i - 1] if i > 0 else 0, batches[i])
        ]
        dump_pickle(
            data, os.path.join(out_dir,
                               'class_indices_batch_%s.p' % batches[i]))
# Ejemplo n.º 8
# 0
def train(args):
    vocab = load_json(args.vocab)
    # import pdb;pdb.set_trace()
    # load corpus
    corpus = CorpusIter20News(args.corpus[0],
                              recursive=True,
                              stem=True,
                              with_docname=False)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=False)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=False)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=False)
    # print len([1 for x in corpus])
    corpus_iter = lambda: ([word for word in sentence if word in vocab]
                           for sentence in corpus)
    w2v = Word2Vec(args.n_dim, window=args.window_size, \
        negative=args.negative, epoches=args.n_epoch)

    start = timeit.default_timer()
    w2v.train(corpus_iter)
    print 'runtime: %ss' % (timeit.default_timer() - start)

    save_w2v(w2v.model, args.save_model)
    import pdb
    pdb.set_trace()
# Ejemplo n.º 9
# 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes',
                        type=str,
                        help='path to the train doc codes file')
    parser.add_argument('train_doc_labels',
                        type=str,
                        help='path to the train doc labels file')
    parser.add_argument('test_doc_codes',
                        type=str,
                        help='path to the test doc codes file')
    parser.add_argument('test_doc_labels',
                        type=str,
                        help='path to the test doc labels file')
    parser.add_argument('-nv',
                        '--n_val',
                        type=int,
                        default=1000,
                        help='size of validation set (default 1000)')
    parser.add_argument('-ne',
                        '--n_epoch',
                        type=int,
                        default=100,
                        help='num of epoches (default 100)')
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=100,
                        help='batch size (default 100)')
    parser.add_argument('-cv',
                        '--cross_validation',
                        type=int,
                        help='k-fold cross validation')
    args = parser.parse_args()

    # autoencoder
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    X_train = np.r_[train_doc_codes.values()]
    Y_train = np.array([train_doc_labels[i] for i in train_doc_codes])
    X_test = np.r_[test_doc_codes.values()]
    Y_test = np.array([test_doc_labels[i] for i in test_doc_codes])

    # # DBN
    # X_train = np.array(load_pickle(args.train_doc_codes))
    # Y_train = load_pickle(args.train_doc_labels)
    # X_test = np.array(load_pickle(args.test_doc_codes))
    # Y_test = load_pickle(args.test_doc_labels)

    seed = 7
    np.random.seed(seed)
    if not args.cross_validation:
        val_idx = np.random.choice(range(X_train.shape[0]),
                                   args.n_val,
                                   replace=False)
        train_idx = list(set(range(X_train.shape[0])) - set(val_idx))
        X_new_train = X_train[train_idx]
        Y_new_train = Y_train[train_idx]
        X_new_val = X_train[val_idx]
        Y_new_val = Y_train[val_idx]
        print 'train: %s, val: %s, test: %s' % (
            X_new_train.shape[0], X_new_val.shape[0], X_test.shape[0])

        results = neural_regression(X_new_train, Y_new_train, X_new_val, Y_new_val, \
                X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed)
        print 'r2 score on test set: %s' % results
    else:
        X = np.concatenate((X_train, X_test), axis=0)
        Y = np.concatenate((Y_train, Y_test), axis=0)
        ss = ShuffleSplit(n_splits=int(args.cross_validation),
                          test_size=X_test.shape[0],
                          random_state=seed)
        results = []
        for train_idx, test_idx in ss.split(X):
            val_idx = np.random.choice(train_idx, args.n_val, replace=False)
            new_train_idx = list(set(train_idx) - set(val_idx))
            X_new_train = X[new_train_idx]
            Y_new_train = Y[new_train_idx]
            X_new_val = X[val_idx]
            Y_new_val = Y[val_idx]
            results.append(neural_regression(X_new_train, Y_new_train, X_new_val, Y_new_val, \
                X[test_idx], Y[test_idx], nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed))
        mean = np.mean(results)
        std = np.std(results)
        print 'r2 score on %s cross validation: %s (%s)' % (int(
            args.cross_validation), mean, std)
    import pdb
    pdb.set_trace()
# Ejemplo n.º 10
# 0
def main():
    """Classify doc codes with explicit train/val/test splits.

    Encodes labels (MultiLabelBinarizer for multilabel, LabelEncoder +
    one-hot otherwise), trains the matching classifier, and prints f1
    (multilabel) or accuracy (multiclass) on the test set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes',
                        type=str,
                        help='path to the train doc codes file')
    parser.add_argument('train_doc_labels',
                        type=str,
                        help='path to the train doc codes file')
    parser.add_argument('val_doc_codes',
                        type=str,
                        help='path to the train doc codes file')
    parser.add_argument('val_doc_labels',
                        type=str,
                        help='path to the train doc labels file')
    parser.add_argument('test_doc_codes',
                        type=str,
                        help='path to the test doc codes file')
    parser.add_argument('test_doc_labels',
                        type=str,
                        help='path to the test doc labels file')
    parser.add_argument('-ne',
                        '--n_epoch',
                        type=int,
                        default=100,
                        help='num of epoches (default 100)')
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=100,
                        help='batch size (default 100)')
    parser.add_argument('-mlc',
                        '--multilabel_clf',
                        action='store_true',
                        help='multilabel classification flag')

    args = parser.parse_args()

    # autoencoder
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    val_doc_codes = load_json(args.val_doc_codes)
    val_doc_labels = load_json(args.val_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    X_train = np.r_[train_doc_codes.values()]
    Y_train = [train_doc_labels[i] for i in train_doc_codes]
    X_val = np.r_[val_doc_codes.values()]
    Y_val = [val_doc_labels[i] for i in val_doc_codes]
    X_test = np.r_[test_doc_codes.values()]
    Y_test = [test_doc_labels[i] for i in test_doc_codes]

    # # DBN
    # X_train = np.array(load_pickle(args.train_doc_codes))
    # Y_train = load_pickle(args.train_doc_labels)
    # X_val = np.array(load_pickle(args.val_doc_codes))
    # Y_val = load_pickle(args.val_doc_labels)
    # X_test = np.array(load_pickle(args.test_doc_codes))
    # Y_test = load_pickle(args.test_doc_labels)

    if args.multilabel_clf:
        # fit on all splits so every label has a column in the binarized matrix
        encoder = MultiLabelBinarizer()
        encoder.fit(Y_train + Y_val + Y_test)
        Y_train = encoder.transform(Y_train)
        Y_val = encoder.transform(Y_val)
        Y_test = encoder.transform(Y_test)
    else:
        # encode all labels together, then slice back into the three splits
        Y = Y_train + Y_val + Y_test
        n_train = len(Y_train)
        n_val = len(Y_val)
        n_test = len(Y_test)
        encoder = LabelEncoder()
        Y = np_utils.to_categorical(encoder.fit_transform(Y))
        Y_train = Y[:n_train]
        Y_val = Y[n_train:n_train + n_val]
        Y_test = Y[-n_test:]

    seed = 7
    print('train: %s, val: %s, test: %s' %
          (X_train.shape[0], X_val.shape[0], X_test.shape[0]))
    if args.multilabel_clf:
        results = multilabel_classifier(X_train, Y_train, X_val, Y_val, \
                X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed)
        print('f1 score on test set: macro_f1: %s, micro_f1: %s' %
              tuple(results))
    else:
        results = multiclass_classifier(X_train, Y_train, X_val, Y_val, \
                X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed)
        print('acc on test set: %s' % results)
def get_word_relationship(args):
    corpus = load_corpus(args.input_corpus)
    doc_vec_dict = corpus['docs']
    vocab_dict = corpus['vocab']
    print 'Load corpus'

    # we have to revort the dict
    dictionary = dict((v,k) for k, v in vocab_dict.iteritems())

    # Here the input top words path is the json file of the label-topwords_ls 
    # should be a dict, each key is a label and its value is the list of top words
    top_words_path = args.input_topwords
    label_topwordls = load_json(top_words_path)
    print 'Load top words of each label'

    label_topwords_vocabnum_dict = {}
    label_topwordindexls_dict = {}
    for label in label_topwordls:
        label_topwords_vocabnum_dict[label] = {}
        topwords_index_ls = []
        for word in label_topwordls[label]:
            topwords_index_ls.append(word)
            label_topwords_vocabnum_dict[label][word] = {}
        label_topwordindexls_dict[label] = topwords_index_ls

    print 'Finish change words into index'

    # in order to save memory and speed it up, I only calculate the word-words frequency of those 
    # in the top word list

    for label in label_topwordindexls_dict:
        print 'Doing label', str(label)
        topwords_idx_set = set(label_topwordindexls_dict[label])

        for filename in doc_vec_dict:
            word_vec_dict = doc_vec_dict[filename]
            result_word_ls = get_word_list(word_vec_dict, topwords_idx_set)
            for word in result_word_ls:
                for doc_word in word_vec_dict:
                    try:
                        label_topwords_vocabnum_dict[label][word][doc_word] += word_vec_dict[doc_word]
                    except:
                        label_topwords_vocabnum_dict[label][word][doc_word] = word_vec_dict[doc_word]

    print 'Finish building the dict of label-topwords-words-num!'

    # now we should get the top of words

    topn = args.topn

    # it is a dict-dict-ls ({label:{words:[top_relative words]}})
    label_topwords_relativewords = {}
    for label in label_topwords_vocabnum_dict:
        label_topwords_relativewords[label] = {}
        for word in label_topwords_vocabnum_dict[label]:
            vocab_num_dict = label_topwords_vocabnum_dict[label][word]
            label_topwords_relativewords[label][word] = sorted(vocab_num_dict,
                key=vocab_num_dict.__getitem__, reverse = True)[:topn]

    print 'Finish sorting the top n word'

    dump_json(label_topwords_relativewords, args.output_json)
    print 'Finish write the json file'

    for label in label_topwords_relativewords:
        filename_o = args.output_dir + 'label-' + str(label) + '.txt'
        print 'filename =' , filename_o
        file_o = open(filename_o, 'w')
        for word_index in label_topwords_relativewords[label]:
            # print 'Is word_index a int:', isinstance(word_index, int)
            text = dictionary[int(word_index)]
            text += ': '
            for top_relative_wordidx in label_topwords_relativewords[label][word_index]:
                text += dictionary[int(top_relative_wordidx)]
                text += ', '
            text += '\n'
            file_o.write(text.encode('utf-8'))
        file_o.close()
    print 'Finish writing files!'
# Ejemplo n.º 12
# 0
def main():
    """Visualize doc codes in 3D via PCA or t-SNE (bank_topic variant).

    Builds 'F'/'NF' (failed / not-failed) labels from the labels file —
    either directly, or per bank-year record when ``bank_year`` is set —
    then renders the requested plot to ``args.output``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('doc_codes_file', type=str, help='path to the input corpus file')
    parser.add_argument('doc_labels_file', type=str, help='path to the output doc codes file')
    parser.add_argument('cmd', choices=['pca', 'tsne'], help='plot cmd')
    parser.add_argument('-o', '--output', type=str, default='out.png', help='path to the output file')
    args = parser.parse_args()

    cmd = args.cmd.lower()

    # classes_to_visual = ["rec.sport.hockey", "comp.graphics", "sci.crypt", \
    #                         "soc.religion.christian", "talk.politics.mideast", \
    #                         "talk.politics.guns"]

    # classes_to_visual = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
                        # 'comp.sys.mac.hardware', 'comp.windows.x']
    # classes_to_visual = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

    doc_codes = load_json(args.doc_codes_file)
    # doc_labels = load_json(args.doc_labels_file)

    # 20news
    # if cmd == 'pca':
    #     visualize_pca_2d(doc_codes, doc_labels, classes_to_visual, args.output)
    # elif cmd == 'tsne':
    #     plot_tsne(doc_codes, doc_labels, classes_to_visual, args.output)

    # # 8k
    # classes_to_visual = ["1", "2", "3", "4", "5", "7", "8"]
    # for k in doc_labels:
    #     doc_labels[k] = doc_labels[k].split('.')[0]

    # 10k
    # classes_to_visual = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"]
    # for k in doc_labels:
    #     doc_labels[k] = ''.join([y for y in list(doc_labels[k]) if y.isdigit()])


    # bank_topic
    # NOTE: the original did a function-local `import numpy as np` here that
    # was never used; it has been removed.
    doc_labels = {}

    bank_year = True
    if not bank_year:
        # labels file rows: "<doc>,<value>"; 'NA' means not failed
        with open(args.doc_labels_file, 'r') as f:
            for each in f:
                tmp = each.strip().split(',')
                doc_labels[tmp[0]] = 'NF' if tmp[1] == 'NA' else 'F'
    else:
        safe_threshold = 0
        bank_record = {}
        with open(args.doc_labels_file, 'r') as f:
            for each in f:
                tmp = each.strip().split(',')
                bank_record[tmp[0]] = tmp[1]
        # doc keys look like "<bank>_<year>"; a bank-year is 'F' (failed) only
        # if the recorded failure year is within safe_threshold of that year
        for key in doc_codes:
            bank, year = key.split('_')
            doc_labels[key] = 'NF' if bank_record[bank] == 'NA' or (int(bank_record[bank]) - safe_threshold > int(year)) else 'F'
    # dump_json(doc_labels, 'bank_year.labels')

    classes_to_visual = ["NF", "F"]
    maker_size = [10, 120]
    opaque = [.2, 1]
    if cmd == 'pca':
        visualize_pca_3d(doc_codes, doc_labels, classes_to_visual, args.output, maker_size, opaque)
    elif cmd == 'tsne':
        plot_tsne_3d(doc_codes, doc_labels, classes_to_visual, args.output)
# Ejemplo n.º 13
# 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes',
                        type=str,
                        help='path to the train doc codes file')
    parser.add_argument('train_doc_labels',
                        type=str,
                        help='path to the train doc labels file')
    parser.add_argument('test_doc_codes',
                        type=str,
                        help='path to the test doc codes file')
    parser.add_argument('test_doc_labels',
                        type=str,
                        help='path to the test doc labels file')
    parser.add_argument('-nv',
                        '--n_val',
                        type=int,
                        default=1000,
                        help='size of validation set (default 1000)')
    parser.add_argument(
        '-qi',
        '--query_info',
        type=str,
        help='path to the query corpus (for geting doc length info)')
    parser.add_argument('-ml',
                        '--multilabel',
                        action='store_true',
                        help='multilabel flag')
    args = parser.parse_args()

    # autoencoder
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    X_train = np.r_[train_doc_codes.values()]
    Y_train = np.array([train_doc_labels[i] for i in train_doc_codes])
    X_test = np.r_[test_doc_codes.values()]
    Y_test = np.array([test_doc_labels[i] for i in test_doc_codes])

    # # DocNADE
    # train_doc_codes = load_json(args.train_doc_codes)
    # train_doc_labels = load_json(args.train_doc_labels)
    # test_doc_codes = load_json(args.test_doc_codes)
    # test_doc_labels = load_json(args.test_doc_labels)
    # X_train = []
    # for each in train_doc_codes.values():
    #     X_train.append([float(x) for x in each])
    # X_test = []
    # for each in test_doc_codes.values():
    #     X_test.append([float(x) for x in each])

    # X_train = np.r_[X_train]
    # Y_train = np.array([train_doc_labels[i] for i in train_doc_codes])
    # X_test = np.r_[X_test]
    # Y_test = np.array([test_doc_labels[i] for i in test_doc_codes])

    # # DBN
    # X_train = np.array(load_marshal(args.train_doc_codes))
    # Y_train = np.array(load_marshal(args.train_doc_labels))
    # X_test = np.array(load_marshal(args.test_doc_codes))
    # Y_test = np.array(load_marshal(args.test_doc_labels))

    seed = 7
    np.random.seed(seed)
    val_idx = np.random.choice(range(X_train.shape[0]),
                               args.n_val,
                               replace=False)
    train_idx = list(set(range(X_train.shape[0])) - set(val_idx))
    X_new_train = X_train[train_idx]
    Y_new_train = Y_train[train_idx]
    X_new_val = X_train[val_idx]
    Y_new_val = Y_train[val_idx]
    print 'train: %s, val: %s, test: %s' % (
        X_new_train.shape[0], X_new_val.shape[0], X_test.shape[0])

    results = retrieval(X_new_train, Y_new_train, X_new_val, Y_new_val,\
                        fractions=[0.001], multilabel=args.multilabel)
    print 'precision on val set: %s' % results

    if not args.query_info:
        results = retrieval(X_train, Y_train, X_test, Y_test,\
                        fractions=[0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0], multilabel=args.multilabel)
    else:
        query_docs = load_corpus(args.query_info)['docs']
        len_test = [sum(query_docs[i].values()) for i in test_doc_codes]
        results = retrieval_by_doclength(X_train,
                                         Y_train,
                                         X_test,
                                         Y_test,
                                         len_test,
                                         fraction=0.001,
                                         multilabel=args.multilabel)
    print 'precision on test set: %s' % results
    import pdb
    pdb.set_trace()
# Ejemplo n.º 14
# 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes',
                        type=str,
                        help='path to the train doc codes file')
    parser.add_argument('train_doc_labels',
                        type=str,
                        help='path to the train doc labels file')
    parser.add_argument('test_doc_codes',
                        type=str,
                        help='path to the test doc codes file')
    parser.add_argument('test_doc_labels',
                        type=str,
                        help='path to the test doc labels file')
    parser.add_argument('-nv',
                        '--n_val',
                        type=int,
                        default=1000,
                        help='size of validation set (default 1000)')
    parser.add_argument('-ne',
                        '--n_epoch',
                        type=int,
                        default=100,
                        help='num of epoches (default 100)')
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=100,
                        help='batch size (default 100)')
    parser.add_argument('-cv',
                        '--cross_validation',
                        type=int,
                        help='k-fold cross validation')
    parser.add_argument('-mlc',
                        '--multilabel_clf',
                        action='store_true',
                        help='multilabel classification flag')

    args = parser.parse_args()

    # autoencoder
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    X_train = np.r_[train_doc_codes.values()]
    Y_train = [train_doc_labels[i] for i in train_doc_codes]
    X_test = np.r_[test_doc_codes.values()]
    Y_test = [test_doc_labels[i] for i in test_doc_codes]

    # # DBN
    # X_train = np.array(load_pickle(args.train_doc_codes))
    # Y_train = load_pickle(args.train_doc_labels)
    # X_test = np.array(load_pickle(args.test_doc_codes))
    # Y_test = load_pickle(args.test_doc_labels)
    # import pdb;pdb.set_trace()

    if args.multilabel_clf:
        encoder = MultiLabelBinarizer()
        encoder.fit(Y_train + Y_test)
        Y_train = encoder.transform(Y_train)
        Y_test = encoder.transform(Y_test)
    else:
        Y = Y_train + Y_test
        n_train = len(Y_train)
        n_test = len(Y_test)
        encoder = LabelEncoder()
        Y = np_utils.to_categorical(encoder.fit_transform(Y))
        Y_train = Y[:n_train]
        Y_test = Y[-n_test:]

    seed = 7
    np.random.seed(seed)
    if not args.cross_validation:
        val_idx = np.random.choice(range(X_train.shape[0]),
                                   args.n_val,
                                   replace=False)
        train_idx = list(set(range(X_train.shape[0])) - set(val_idx))
        X_new_train = X_train[train_idx]
        Y_new_train = Y_train[train_idx]
        X_new_val = X_train[val_idx]
        Y_new_val = Y_train[val_idx]
        print 'train: %s, val: %s, test: %s' % (
            X_new_train.shape[0], X_new_val.shape[0], X_test.shape[0])
        if args.multilabel_clf:
            results = multilabel_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val, \
                    X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed)
            print 'f1 score on test set: macro_f1: %s, micro_f1: %s' % tuple(
                results)
        else:
            results = multiclass_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val, \
                    X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed)
            print 'acc on test set: %s' % results
    else:
        X = np.concatenate((X_train, X_test), axis=0)
        Y = np.concatenate((Y_train, Y_test), axis=0)
        ss = ShuffleSplit(n_splits=int(args.cross_validation),
                          test_size=X_test.shape[0],
                          random_state=seed)
        results = []
        for train_idx, test_idx in ss.split(X):
            val_idx = np.random.choice(train_idx, args.n_val, replace=False)
            new_train_idx = list(set(train_idx) - set(val_idx))
            X_new_train = X[new_train_idx]
            Y_new_train = Y[new_train_idx]
            X_new_val = X[val_idx]
            Y_new_val = Y[val_idx]
            if args.multilabel_clf:
                results.append(multilabel_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val, \
                        X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed))
            else:
                results.append(multiclass_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val, \
                    X[test_idx], Y[test_idx], nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed))

        if args.multilabel_clf:
            macro_f1, micro_f1 = zip(*results)
            macro_mean = np.mean(macro_f1)
            macro_std = np.std(macro_f1)
            micro_mean = np.mean(micro_f1)
            micro_std = np.std(micro_f1)
            print 'f1 score on %s-fold cross validation: macro_f1: %s (%s), micro_f1: %s (%s)' \
                    % (int(args.cross_validation), macro_mean, macro_std, micro_mean, micro_std)
        else:
            mean = np.mean(results)
            std = np.std(results)
            print 'acc on %s-fold cross validation: %s (%s)' % (int(
                args.cross_validation), mean, std)
    import pdb
    pdb.set_trace()
# Ejemplo n.º 15
# 0
# def main():
# parser = argparse.ArgumentParser()
# parser.add_argument('train_doc_codes', type=str, help='path to the train doc codes file')
# parser.add_argument('train_doc_labels', type=str, help='path to the train doc codes file')
# parser.add_argument('val_doc_codes', type=str, help='path to the train doc codes file')
# parser.add_argument('val_doc_labels', type=str, help='path to the train doc labels file')
# parser.add_argument('test_doc_codes', type=str, help='path to the test doc codes file')
# parser.add_argument('test_doc_labels', type=str, help='path to the test doc labels file')
# parser.add_argument('-ne', '--n_epoch', type=int, default=100, help='num of epoches (default 100)')
# parser.add_argument('-bs', '--batch_size', type=int, default=100, help='batch size (default 100)')
# parser.add_argument('-mlc', '--multilabel_clf', action='store_true', help='multilabel classification flag')
#
# args = parser.parse_args()

# autoencoder
train_doc_codes = load_json(
    '/home/sgnbx/Downloads/projects/KATE-master/output/output.train')
# train_doc_labels = load_json('/home/sgnbx/Downloads/projects/KATE-master/output/output.train')
val_doc_codes = load_json(
    '/home/sgnbx/Downloads/projects/KATE-master/output/output.val')
# val_doc_labels = load_json('/home/sgnbx/Downloads/projects/KATE-master/output/output.val')

# test_doc_codes = load_json(args.test_doc_codes)
# test_doc_labels = load_json(args.test_doc_labels)
X_train = np.r_[train_doc_codes.values()]
print X_train.shape
train_labels = generate_20news_doc_labels(
    train_doc_codes.keys(),
    '/home/sgnbx/Downloads/projects/KATE-master/output/train.labels')
Y_train = [train_labels[i] for i in train_doc_codes]
print Y_train
X_val = np.r_[val_doc_codes.values()]