def main():
    """Merge train and validation doc-code dicts and dump the result.

    The merged dict is written to ``out_dir`` under the name
    ``new_<basename of the train doc-code file>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes', type=str,
                        help='path to the train doc code file')
    parser.add_argument('val_doc_codes', type=str,
                        help='path to the valid doc code file')
    parser.add_argument('out_dir', type=str, help='path to the output dir')
    args = parser.parse_args()

    train_doc_codes_path = args.train_doc_codes
    train_doc_codes = load_json(train_doc_codes_path)
    val_doc_codes = load_json(args.val_doc_codes)
    # validation codes override train codes on key collision
    train_doc_codes.update(val_doc_codes)

    # fix: removed leftover `import pdb; pdb.set_trace()` debug trap
    dump_json(train_doc_codes,
              os.path.join(args.out_dir,
                           'new_' + os.path.basename(train_doc_codes_path)))
def train(args):
    """Train a doc2vec model on the vocab-filtered corpus and save it.

    args must provide: vocab, corpus, n_dim, window_size, negative,
    n_epoch, save_model.
    """
    vocab = load_json(args.vocab)

    # load corpus; alternative iterators for other datasets kept for reference
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True,
                              with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)

    # re-iterable factory: keep only in-vocab words, preserve doc tags
    corpus_iter = lambda: (TaggedDocument(
        [word for word in sentence if word in vocab], tag)
        for sentence, tag in corpus)

    d2v = MyDoc2Vec(args.n_dim, window=args.window_size,
                    negative=args.negative, epoches=args.n_epoch, dm_concat=1)

    start = timeit.default_timer()
    d2v.train(corpus_iter)
    print('runtime: %ss' % (timeit.default_timer() - start))

    # fix: removed leftover `import pdb; pdb.set_trace()` debug trap
    save_doc2vec(d2v.model, args.save_model)
def get_words(args):
    """Count word frequencies per label and dump the top-n words per label.

    Writes a label -> top-word-index-list JSON (args.output_json), plus one
    plain-text word list per label under args.output_dir.
    """
    corpus = load_corpus(args.input_corpus)
    filename_corpus_dict = corpus['docs']
    vocab_dict = corpus['vocab']
    # invert word -> index into index -> word for the final text output
    dictionary = dict((v, k) for k, v in vocab_dict.items())
    filename_label_dict = load_json(args.input_label)
    print('Finish loading data')

    # label -> {vocab index -> aggregated count}; fix: replaced the bare
    # try/except (which swallowed every exception) with setdefault/get
    label_vocab_dict = {}
    for filename in filename_corpus_dict:
        vocab_num_dict = filename_corpus_dict[filename]
        label = filename_label_dict[filename]
        counts = label_vocab_dict.setdefault(label, {})
        for vocab in vocab_num_dict:
            counts[vocab] = counts.get(vocab, 0) + vocab_num_dict[vocab]
    print('Finish counting word frequence')

    label_topword_dict = {}
    # fix: original measured len(label_topword_dict), which is always 0 at
    # this point; the populated dict is label_vocab_dict
    print('Label num is %s' % len(label_vocab_dict))
    topn = args.topn
    for label in label_vocab_dict:
        vocab_num_dict = label_vocab_dict[label]
        label_topword_dict[label] = sorted(
            vocab_num_dict, key=vocab_num_dict.__getitem__,
            reverse=True)[:topn]
    print('Finish sorting the top n word')

    dump_json(label_topword_dict, args.output_json)
    print('Finish write the json file')

    for label in label_topword_dict:
        filename_o = args.output_dir + 'label-' + str(label) + '.txt'
        print('filename = %s' % filename_o)
        # `with` guarantees the handle is closed even if a lookup fails
        with open(filename_o, 'w') as file_o:
            for word_index in label_topword_dict[label]:
                text = dictionary[int(word_index)] + '\n'
                file_o.write(text.encode('utf-8'))
    print('Finish writing files!')
def main():
    """CLI entry point: extract labels for the docs of a constructed corpus."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-l', '--label', type=str, required=True,
                            help='path to the input label file')
    arg_parser.add_argument('-c', '--corpus', type=str, required=True,
                            help='path to the constructed corpus file')
    arg_parser.add_argument('-o', '--output', type=str, required=True,
                            help='path to the output file')
    args = arg_parser.parse_args()

    corpus_docs = load_json(args.corpus)['docs']
    label_map = load_json(args.label)
    extract_labels(corpus_docs, label_map, args.output)
def main():
    """CLI entry point: convert an XML corpus dir to text files."""
    # fix: original read `parser T= argparse.ArgumentParser()` — a stray
    # 'T' making the line a SyntaxError
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='path to the input corpus dir')
    parser.add_argument('-o', '--output', type=str, default='./',
                        help='path to the output dir')
    parser.add_argument('-wl', '--whitelist', type=str,
                        help='path to the whitelist file')
    args = parser.parse_args()

    white_list = load_json(args.whitelist) if args.whitelist else None
    xml2text(args.input, args.output, white_list)
def test(args):
    """Infer doc codes with a trained doc2vec model and dump them as JSON."""
    vocab = load_json(args.vocab)

    # load corpus; alternative iterators for other datasets kept for reference
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True,
                              with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)

    # re-iterable factory: keep only in-vocab words, preserve doc tags
    corpus_iter = lambda: (TaggedDocument(
        [word for word in sentence if word in vocab], tag)
        for sentence, tag in corpus)

    d2v = load_doc2vec(args.load_model)
    doc_codes = predict(d2v, corpus_iter)
    # fix: removed leftover `import pdb; pdb.set_trace()` debug trap
    dump_json(doc_codes, args.output)
def main():
    """Dump BOW doc batches, vocab attributes, batch bounds and class indices."""
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus', type=str, help='path to the corpus file')
    parser.add_argument('labels', type=str, help='path to the labels file')
    parser.add_argument('-bs', '--batch_size', type=int, default=100,
                        help='batch size (default 100)')
    parser.add_argument('out_dir', type=str, help='path to the output dir')
    args = parser.parse_args()

    corpus = load_corpus(args.corpus)
    doc_labels = load_json(args.labels)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)
    doc_names = docs.keys()
    X_docs = [doc2vec(x, n_vocab) for x in docs.values()]
    out_dir = args.out_dir

    # attributes: words ordered by their vocab index.
    # fix: the original sort key was `lambda d: [1]` — a constant list, so
    # the items were never actually ordered; sort by the index value d[1]
    attrs = tuple(w for w, _ in sorted(vocab.items(), key=lambda d: d[1]))
    dump_pickle(attrs, os.path.join(out_dir, 'attributes.p'))

    # batch end offsets; the last (possibly short) batch ends at len(docs)
    bs = args.batch_size
    batches = [bs * (x + 1) for x in range(int(len(docs) / bs) - 1)]
    batches.append(len(docs))
    dump_pickle(batches, os.path.join(out_dir, 'batches.p'))

    # bow_batch_x: BOW vectors sliced per batch
    for i in range(len(batches)):
        dump_pickle(X_docs[batches[i - 1] if i > 0 else 0:batches[i]],
                    os.path.join(out_dir, 'bow_batch_%s.p' % batches[i]))

    # class_indices_batch_x: labels aligned with each batch's docs
    # fix: removed leftover `import pdb; pdb.set_trace()` debug trap
    for i in range(len(batches)):
        data = [doc_labels[doc_names[idx]]
                for idx in range(batches[i - 1] if i > 0 else 0, batches[i])]
        dump_pickle(data, os.path.join(out_dir,
                                       'class_indices_batch_%s.p' % batches[i]))
def train(args):
    """Train a word2vec model on the vocab-filtered corpus and save it.

    args must provide: vocab, corpus, n_dim, window_size, negative,
    n_epoch, save_model.
    """
    vocab = load_json(args.vocab)

    # load corpus; alternative iterators for other datasets kept for reference
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True,
                              with_docname=False)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=False)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=False)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=False)

    # re-iterable factory: keep only in-vocab words
    corpus_iter = lambda: ([word for word in sentence if word in vocab]
                           for sentence in corpus)

    w2v = Word2Vec(args.n_dim, window=args.window_size,
                   negative=args.negative, epoches=args.n_epoch)

    start = timeit.default_timer()
    w2v.train(corpus_iter)
    print('runtime: %ss' % (timeit.default_timer() - start))

    # fix: removed leftover `import pdb; pdb.set_trace()` debug trap
    save_w2v(w2v.model, args.save_model)
def main():
    """Train/evaluate a neural regressor on doc codes (optional k-fold CV)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes', type=str,
                        help='path to the train doc codes file')
    parser.add_argument('train_doc_labels', type=str,
                        help='path to the train doc labels file')
    parser.add_argument('test_doc_codes', type=str,
                        help='path to the test doc codes file')
    parser.add_argument('test_doc_labels', type=str,
                        help='path to the test doc labels file')
    parser.add_argument('-nv', '--n_val', type=int, default=1000,
                        help='size of validation set (default 1000)')
    parser.add_argument('-ne', '--n_epoch', type=int, default=100,
                        help='num of epoches (default 100)')
    parser.add_argument('-bs', '--batch_size', type=int, default=100,
                        help='batch size (default 100)')
    parser.add_argument('-cv', '--cross_validation', type=int,
                        help='k-fold cross validation')
    args = parser.parse_args()

    # autoencoder inputs: JSON dict of doc name -> code vector
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    X_train = np.r_[train_doc_codes.values()]
    Y_train = np.array([train_doc_labels[i] for i in train_doc_codes])
    X_test = np.r_[test_doc_codes.values()]
    Y_test = np.array([test_doc_labels[i] for i in test_doc_codes])

    # # DBN inputs (pickled arrays) kept for reference:
    # X_train = np.array(load_pickle(args.train_doc_codes))
    # Y_train = load_pickle(args.train_doc_labels)
    # X_test = np.array(load_pickle(args.test_doc_codes))
    # Y_test = load_pickle(args.test_doc_labels)

    seed = 7
    np.random.seed(seed)
    if not args.cross_validation:
        # hold out a random validation slice from the training data
        val_idx = np.random.choice(range(X_train.shape[0]), args.n_val,
                                   replace=False)
        train_idx = list(set(range(X_train.shape[0])) - set(val_idx))
        X_new_train = X_train[train_idx]
        Y_new_train = Y_train[train_idx]
        X_new_val = X_train[val_idx]
        Y_new_val = Y_train[val_idx]
        print('train: %s, val: %s, test: %s' % (
            X_new_train.shape[0], X_new_val.shape[0], X_test.shape[0]))
        results = neural_regression(X_new_train, Y_new_train,
                                    X_new_val, Y_new_val,
                                    X_test, Y_test,
                                    nb_epoch=args.n_epoch,
                                    batch_size=args.batch_size, seed=seed)
        print('r2 score on test set: %s' % results)
    else:
        # pool train+test and re-split k times, test size fixed to len(test)
        X = np.concatenate((X_train, X_test), axis=0)
        Y = np.concatenate((Y_train, Y_test), axis=0)
        ss = ShuffleSplit(n_splits=int(args.cross_validation),
                          test_size=X_test.shape[0], random_state=seed)
        results = []
        for train_idx, test_idx in ss.split(X):
            val_idx = np.random.choice(train_idx, args.n_val, replace=False)
            new_train_idx = list(set(train_idx) - set(val_idx))
            results.append(neural_regression(X[new_train_idx], Y[new_train_idx],
                                             X[val_idx], Y[val_idx],
                                             X[test_idx], Y[test_idx],
                                             nb_epoch=args.n_epoch,
                                             batch_size=args.batch_size,
                                             seed=seed))
        mean = np.mean(results)
        std = np.std(results)
        # fix: removed leftover `import pdb; pdb.set_trace()` debug trap
        print('r2 score on %s cross validation: %s (%s)' % (
            int(args.cross_validation), mean, std))
def main():
    """Train/evaluate a classifier on pre-split train/val/test doc codes."""
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes', type=str,
                        help='path to the train doc codes file')
    # fix: help strings for the label/val args were copy-pasted from
    # train_doc_codes; corrected to describe each argument
    parser.add_argument('train_doc_labels', type=str,
                        help='path to the train doc labels file')
    parser.add_argument('val_doc_codes', type=str,
                        help='path to the val doc codes file')
    parser.add_argument('val_doc_labels', type=str,
                        help='path to the val doc labels file')
    parser.add_argument('test_doc_codes', type=str,
                        help='path to the test doc codes file')
    parser.add_argument('test_doc_labels', type=str,
                        help='path to the test doc labels file')
    parser.add_argument('-ne', '--n_epoch', type=int, default=100,
                        help='num of epoches (default 100)')
    parser.add_argument('-bs', '--batch_size', type=int, default=100,
                        help='batch size (default 100)')
    parser.add_argument('-mlc', '--multilabel_clf', action='store_true',
                        help='multilabel classification flag')
    args = parser.parse_args()

    # autoencoder inputs: JSON dict of doc name -> code vector
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    val_doc_codes = load_json(args.val_doc_codes)
    val_doc_labels = load_json(args.val_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    X_train = np.r_[train_doc_codes.values()]
    Y_train = [train_doc_labels[i] for i in train_doc_codes]
    X_val = np.r_[val_doc_codes.values()]
    Y_val = [val_doc_labels[i] for i in val_doc_codes]
    X_test = np.r_[test_doc_codes.values()]
    Y_test = [test_doc_labels[i] for i in test_doc_codes]

    # # DBN inputs (pickled arrays) kept for reference:
    # X_train = np.array(load_pickle(args.train_doc_codes))
    # Y_train = load_pickle(args.train_doc_labels)
    # X_val = np.array(load_pickle(args.val_doc_codes))
    # Y_val = load_pickle(args.val_doc_labels)
    # X_test = np.array(load_pickle(args.test_doc_codes))
    # Y_test = load_pickle(args.test_doc_labels)

    if args.multilabel_clf:
        # binarize label sets consistently over all three splits
        encoder = MultiLabelBinarizer()
        encoder.fit(Y_train + Y_val + Y_test)
        Y_train = encoder.transform(Y_train)
        Y_val = encoder.transform(Y_val)
        Y_test = encoder.transform(Y_test)
    else:
        # one-hot encode labels consistently over all three splits
        Y = Y_train + Y_val + Y_test
        n_train = len(Y_train)
        n_val = len(Y_val)
        n_test = len(Y_test)
        encoder = LabelEncoder()
        Y = np_utils.to_categorical(encoder.fit_transform(Y))
        Y_train = Y[:n_train]
        Y_val = Y[n_train:n_train + n_val]
        Y_test = Y[-n_test:]

    seed = 7
    print('train: %s, val: %s, test: %s' % (X_train.shape[0], X_val.shape[0],
                                            X_test.shape[0]))
    if args.multilabel_clf:
        results = multilabel_classifier(X_train, Y_train, X_val, Y_val,
                                        X_test, Y_test,
                                        nb_epoch=args.n_epoch,
                                        batch_size=args.batch_size, seed=seed)
        print('f1 score on test set: macro_f1: %s, micro_f1: %s' %
              tuple(results))
    else:
        results = multiclass_classifier(X_train, Y_train, X_val, Y_val,
                                        X_test, Y_test,
                                        nb_epoch=args.n_epoch,
                                        batch_size=args.batch_size, seed=seed)
        # fix: removed leftover `import pdb; pdb.set_trace()` debug trap
        print('acc on test set: %s' % results)
def get_word_relationship(args):
    """For each label's top words, find their most co-occurring words.

    Reads the corpus and a label -> top-word-list JSON, accumulates
    co-occurrence counts of each top word with every word of the documents
    containing it, keeps the top-n co-occurring words, then dumps the
    result as JSON (args.output_json) and as per-label text files under
    args.output_dir.
    """
    corpus = load_corpus(args.input_corpus)
    doc_vec_dict = corpus['docs']
    vocab_dict = corpus['vocab']
    print('Load corpus')
    # invert word -> index into index -> word for the final text output
    dictionary = dict((v, k) for k, v in vocab_dict.items())

    # the input top-words path is a JSON dict: label -> list of top words
    label_topwordls = load_json(args.input_topwords)
    print('Load top words of each label')

    # label -> {top word -> {co-occurring word -> count}}
    label_topwords_vocabnum_dict = {}
    # label -> list of top-word indices
    label_topwordindexls_dict = {}
    for label in label_topwordls:
        label_topwords_vocabnum_dict[label] = {}
        topwords_index_ls = []
        for word in label_topwordls[label]:
            topwords_index_ls.append(word)
            label_topwords_vocabnum_dict[label][word] = {}
        label_topwordindexls_dict[label] = topwords_index_ls
    print('Finish change words into index')

    # to save memory and time, only accumulate word-word frequencies for
    # words that appear in the top-word lists.
    # fix: replaced bare try/except accumulation (which swallowed every
    # exception) with dict.get
    for label in label_topwordindexls_dict:
        print('Doing label %s' % label)
        topwords_idx_set = set(label_topwordindexls_dict[label])
        for filename in doc_vec_dict:
            word_vec_dict = doc_vec_dict[filename]
            result_word_ls = get_word_list(word_vec_dict, topwords_idx_set)
            for word in result_word_ls:
                cooc = label_topwords_vocabnum_dict[label][word]
                for doc_word in word_vec_dict:
                    cooc[doc_word] = cooc.get(doc_word, 0) + word_vec_dict[doc_word]
    print('Finish building the dict of label-topwords-words-num!')

    # keep only the top-n co-occurring words per (label, top word)
    topn = args.topn
    # {label: {word: [top relative words]}}
    label_topwords_relativewords = {}
    for label in label_topwords_vocabnum_dict:
        label_topwords_relativewords[label] = {}
        for word in label_topwords_vocabnum_dict[label]:
            vocab_num_dict = label_topwords_vocabnum_dict[label][word]
            label_topwords_relativewords[label][word] = sorted(
                vocab_num_dict, key=vocab_num_dict.__getitem__,
                reverse=True)[:topn]
    print('Finish sorting the top n word')

    dump_json(label_topwords_relativewords, args.output_json)
    print('Finish write the json file')

    for label in label_topwords_relativewords:
        filename_o = args.output_dir + 'label-' + str(label) + '.txt'
        print('filename = %s' % filename_o)
        # `with` guarantees the handle is closed even if a lookup fails
        with open(filename_o, 'w') as file_o:
            for word_index in label_topwords_relativewords[label]:
                text = dictionary[int(word_index)] + ': '
                for top_relative_wordidx in label_topwords_relativewords[label][word_index]:
                    text += dictionary[int(top_relative_wordidx)] + ', '
                text += '\n'
                file_o.write(text.encode('utf-8'))
    print('Finish writing files!')
def main():
    """Plot 3D PCA/t-SNE of doc codes, colored by bank failed/not-failed."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('doc_codes_file', type=str,
                            help='path to the input corpus file')
    arg_parser.add_argument('doc_labels_file', type=str,
                            help='path to the output doc codes file')
    arg_parser.add_argument('cmd', choices=['pca', 'tsne'], help='plot cmd')
    arg_parser.add_argument('-o', '--output', type=str, default='out.png',
                            help='path to the output file')
    args = arg_parser.parse_args()
    plot_cmd = args.cmd.lower()

    doc_codes = load_json(args.doc_codes_file)

    import numpy as np  # kept from the original body

    # derive an 'F' (failed) / 'NF' (not failed) label per document
    doc_labels = {}
    bank_year = True  # True: compare each doc's year against the failure year
    if bank_year:
        safe_threshold = 0
        bank_record = {}
        with open(args.doc_labels_file, 'r') as labels_file:
            for line in labels_file:
                fields = line.strip().split(',')
                bank_record[fields[0]] = fields[1]
        for key in doc_codes:
            bank, year = key.split('_')
            # 'NF' when the bank never failed ('NA') or failed after this year
            not_failed = (bank_record[bank] == 'NA' or
                          int(bank_record[bank]) - safe_threshold > int(year))
            doc_labels[key] = 'NF' if not_failed else 'F'
    else:
        with open(args.doc_labels_file, 'r') as labels_file:
            for line in labels_file:
                fields = line.strip().split(',')
                doc_labels[fields[0]] = 'NF' if fields[1] == 'NA' else 'F'

    classes_to_visual = ["NF", "F"]
    maker_size = [10, 120]
    opaque = [.2, 1]
    if plot_cmd == 'pca':
        visualize_pca_3d(doc_codes, doc_labels, classes_to_visual,
                         args.output, maker_size, opaque)
    elif plot_cmd == 'tsne':
        plot_tsne_3d(doc_codes, doc_labels, classes_to_visual, args.output)
def main():
    """Evaluate retrieval precision of doc codes on val and test sets."""
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes', type=str,
                        help='path to the train doc codes file')
    parser.add_argument('train_doc_labels', type=str,
                        help='path to the train doc labels file')
    parser.add_argument('test_doc_codes', type=str,
                        help='path to the test doc codes file')
    parser.add_argument('test_doc_labels', type=str,
                        help='path to the test doc labels file')
    parser.add_argument('-nv', '--n_val', type=int, default=1000,
                        help='size of validation set (default 1000)')
    parser.add_argument('-qi', '--query_info', type=str,
                        help='path to the query corpus (for geting doc length info)')
    parser.add_argument('-ml', '--multilabel', action='store_true',
                        help='multilabel flag')
    args = parser.parse_args()

    # autoencoder inputs: JSON dict of doc name -> code vector
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    X_train = np.r_[train_doc_codes.values()]
    Y_train = np.array([train_doc_labels[i] for i in train_doc_codes])
    X_test = np.r_[test_doc_codes.values()]
    Y_test = np.array([test_doc_labels[i] for i in test_doc_codes])

    # # DocNADE inputs (codes are lists of stringified floats):
    # X_train = np.r_[[[float(x) for x in each] for each in train_doc_codes.values()]]
    # X_test = np.r_[[[float(x) for x in each] for each in test_doc_codes.values()]]
    # # DBN inputs (marshalled arrays):
    # X_train = np.array(load_marshal(args.train_doc_codes))
    # Y_train = np.array(load_marshal(args.train_doc_labels))
    # X_test = np.array(load_marshal(args.test_doc_codes))
    # Y_test = np.array(load_marshal(args.test_doc_labels))

    seed = 7
    np.random.seed(seed)
    # hold out a random validation slice from the training data
    val_idx = np.random.choice(range(X_train.shape[0]), args.n_val,
                               replace=False)
    train_idx = list(set(range(X_train.shape[0])) - set(val_idx))
    X_new_train = X_train[train_idx]
    Y_new_train = Y_train[train_idx]
    X_new_val = X_train[val_idx]
    Y_new_val = Y_train[val_idx]
    print('train: %s, val: %s, test: %s' % (
        X_new_train.shape[0], X_new_val.shape[0], X_test.shape[0]))

    results = retrieval(X_new_train, Y_new_train, X_new_val, Y_new_val,
                        fractions=[0.001], multilabel=args.multilabel)
    print('precision on val set: %s' % results)

    if not args.query_info:
        results = retrieval(X_train, Y_train, X_test, Y_test,
                            fractions=[0.001, 0.002, 0.005, 0.01, 0.02, 0.05,
                                       0.1, 0.2, 0.5, 1.0],
                            multilabel=args.multilabel)
    else:
        # bucket retrieval precision by query-document length
        query_docs = load_corpus(args.query_info)['docs']
        len_test = [sum(query_docs[i].values()) for i in test_doc_codes]
        results = retrieval_by_doclength(X_train, Y_train, X_test, Y_test,
                                         len_test, fraction=0.001,
                                         multilabel=args.multilabel)
    # fix: removed leftover `import pdb; pdb.set_trace()` debug trap
    print('precision on test set: %s' % results)
def main():
    """Train/evaluate a classifier on doc codes, optionally with k-fold CV."""
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes', type=str,
                        help='path to the train doc codes file')
    parser.add_argument('train_doc_labels', type=str,
                        help='path to the train doc labels file')
    parser.add_argument('test_doc_codes', type=str,
                        help='path to the test doc codes file')
    parser.add_argument('test_doc_labels', type=str,
                        help='path to the test doc labels file')
    parser.add_argument('-nv', '--n_val', type=int, default=1000,
                        help='size of validation set (default 1000)')
    parser.add_argument('-ne', '--n_epoch', type=int, default=100,
                        help='num of epoches (default 100)')
    parser.add_argument('-bs', '--batch_size', type=int, default=100,
                        help='batch size (default 100)')
    parser.add_argument('-cv', '--cross_validation', type=int,
                        help='k-fold cross validation')
    parser.add_argument('-mlc', '--multilabel_clf', action='store_true',
                        help='multilabel classification flag')
    args = parser.parse_args()

    # autoencoder inputs: JSON dict of doc name -> code vector
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    X_train = np.r_[train_doc_codes.values()]
    Y_train = [train_doc_labels[i] for i in train_doc_codes]
    X_test = np.r_[test_doc_codes.values()]
    Y_test = [test_doc_labels[i] for i in test_doc_codes]

    # # DBN inputs (pickled arrays) kept for reference:
    # X_train = np.array(load_pickle(args.train_doc_codes))
    # Y_train = load_pickle(args.train_doc_labels)
    # X_test = np.array(load_pickle(args.test_doc_codes))
    # Y_test = load_pickle(args.test_doc_labels)

    if args.multilabel_clf:
        # binarize label sets consistently over train + test
        encoder = MultiLabelBinarizer()
        encoder.fit(Y_train + Y_test)
        Y_train = encoder.transform(Y_train)
        Y_test = encoder.transform(Y_test)
    else:
        # one-hot encode labels consistently over train + test
        Y = Y_train + Y_test
        n_train = len(Y_train)
        n_test = len(Y_test)
        encoder = LabelEncoder()
        Y = np_utils.to_categorical(encoder.fit_transform(Y))
        Y_train = Y[:n_train]
        Y_test = Y[-n_test:]

    seed = 7
    np.random.seed(seed)
    if not args.cross_validation:
        # hold out a random validation slice from the training data
        val_idx = np.random.choice(range(X_train.shape[0]), args.n_val,
                                   replace=False)
        train_idx = list(set(range(X_train.shape[0])) - set(val_idx))
        X_new_train = X_train[train_idx]
        Y_new_train = Y_train[train_idx]
        X_new_val = X_train[val_idx]
        Y_new_val = Y_train[val_idx]
        print('train: %s, val: %s, test: %s' % (
            X_new_train.shape[0], X_new_val.shape[0], X_test.shape[0]))
        if args.multilabel_clf:
            results = multilabel_classifier(X_new_train, Y_new_train,
                                            X_new_val, Y_new_val,
                                            X_test, Y_test,
                                            nb_epoch=args.n_epoch,
                                            batch_size=args.batch_size,
                                            seed=seed)
            print('f1 score on test set: macro_f1: %s, micro_f1: %s' %
                  tuple(results))
        else:
            results = multiclass_classifier(X_new_train, Y_new_train,
                                            X_new_val, Y_new_val,
                                            X_test, Y_test,
                                            nb_epoch=args.n_epoch,
                                            batch_size=args.batch_size,
                                            seed=seed)
            print('acc on test set: %s' % results)
    else:
        # pool train+test and re-split k times, test size fixed to len(test)
        X = np.concatenate((X_train, X_test), axis=0)
        Y = np.concatenate((Y_train, Y_test), axis=0)
        ss = ShuffleSplit(n_splits=int(args.cross_validation),
                          test_size=X_test.shape[0], random_state=seed)
        results = []
        for train_idx, test_idx in ss.split(X):
            val_idx = np.random.choice(train_idx, args.n_val, replace=False)
            new_train_idx = list(set(train_idx) - set(val_idx))
            X_new_train = X[new_train_idx]
            Y_new_train = Y[new_train_idx]
            X_new_val = X[val_idx]
            Y_new_val = Y[val_idx]
            if args.multilabel_clf:
                # fix: evaluate on this fold's test split X[test_idx] /
                # Y[test_idx], matching the multiclass branch below; the
                # original reused the fixed X_test/Y_test for every fold
                results.append(multilabel_classifier(
                    X_new_train, Y_new_train, X_new_val, Y_new_val,
                    X[test_idx], Y[test_idx], nb_epoch=args.n_epoch,
                    batch_size=args.batch_size, seed=seed))
            else:
                results.append(multiclass_classifier(
                    X_new_train, Y_new_train, X_new_val, Y_new_val,
                    X[test_idx], Y[test_idx], nb_epoch=args.n_epoch,
                    batch_size=args.batch_size, seed=seed))
        if args.multilabel_clf:
            macro_f1, micro_f1 = zip(*results)
            macro_mean = np.mean(macro_f1)
            macro_std = np.std(macro_f1)
            micro_mean = np.mean(micro_f1)
            micro_std = np.std(micro_f1)
            print('f1 score on %s-fold cross validation: '
                  'macro_f1: %s (%s), micro_f1: %s (%s)' %
                  (int(args.cross_validation), macro_mean, macro_std,
                   micro_mean, micro_std))
        else:
            mean = np.mean(results)
            std = np.std(results)
            # fix: removed leftover `import pdb; pdb.set_trace()` debug trap
            print('acc on %s-fold cross validation: %s (%s)' % (
                int(args.cross_validation), mean, std))
# def main(): # parser = argparse.ArgumentParser() # parser.add_argument('train_doc_codes', type=str, help='path to the train doc codes file') # parser.add_argument('train_doc_labels', type=str, help='path to the train doc codes file') # parser.add_argument('val_doc_codes', type=str, help='path to the train doc codes file') # parser.add_argument('val_doc_labels', type=str, help='path to the train doc labels file') # parser.add_argument('test_doc_codes', type=str, help='path to the test doc codes file') # parser.add_argument('test_doc_labels', type=str, help='path to the test doc labels file') # parser.add_argument('-ne', '--n_epoch', type=int, default=100, help='num of epoches (default 100)') # parser.add_argument('-bs', '--batch_size', type=int, default=100, help='batch size (default 100)') # parser.add_argument('-mlc', '--multilabel_clf', action='store_true', help='multilabel classification flag') # # args = parser.parse_args() # autoencoder train_doc_codes = load_json( '/home/sgnbx/Downloads/projects/KATE-master/output/output.train') # train_doc_labels = load_json('/home/sgnbx/Downloads/projects/KATE-master/output/output.train') val_doc_codes = load_json( '/home/sgnbx/Downloads/projects/KATE-master/output/output.val') # val_doc_labels = load_json('/home/sgnbx/Downloads/projects/KATE-master/output/output.val') # test_doc_codes = load_json(args.test_doc_codes) # test_doc_labels = load_json(args.test_doc_labels) X_train = np.r_[train_doc_codes.values()] print X_train.shape train_labels = generate_20news_doc_labels( train_doc_codes.keys(), '/home/sgnbx/Downloads/projects/KATE-master/output/train.labels') Y_train = [train_labels[i] for i in train_doc_codes] print Y_train X_val = np.r_[val_doc_codes.values()]