def get_all_topk_qlist(corpus_dir):
    parallel_list = [
        "0-1000000", "1000000-2000000", "2000000-3000000", "3000000-4000000",
        "4000000-5000000", "5000000-6000000", "6000000-7000000", "7000000-8000000",
        "8000000-9000000", "9000000-10000000", "10000000-11000000", "11000000-12000000",
        "12000000-13000000", "13000000-14000000", "14000000-15000000", "15000000-16000000",
        "16000000-None"
    ]
    qlist = list()
    for p in parallel_list:
        target_corpus_fpath = corpus_dir + os.sep + "_2_corpus-without-Raretag-%s.pkl" % p
        for q in load_pickle(target_corpus_fpath):
            yy = int(q.creation_date.split('-')[0])
            if yy >= 2014:
                qlist.append(q)
    print("# all qlist = %s" % len(qlist), get_current_time())
    return qlist
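# NOTE: load_pickle, save_pickle, and get_current_time are project helpers used by
# these scripts but not shown here; the definitions below are a minimal sketch of
# what they are assumed to do, not necessarily the repository's exact implementations.
import pickle
from datetime import datetime


def load_pickle(fpath):
    """Deserialize a pickled object (e.g., a list of question objects) from disk."""
    with open(fpath, "rb") as f:
        return pickle.load(f)


def save_pickle(obj, fpath):
    """Serialize an object to disk with pickle."""
    with open(fpath, "wb") as f:
        pickle.dump(obj, f)


def get_current_time():
    """Human-readable timestamp used in progress prints."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")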
ts_dir = dataset_dir + os.sep + "ts%s" % ts
ts_parallel_dir = ts_dir + os.sep + "parallel"

sample_K = "test100000"
sample_K_dir = ts_dir + os.sep + "data-%s" % sample_K
sample_K_train_dir = sample_K_dir + os.sep + "train"

corpus_dir = os.path.join(sample_K_dir, "corpus")
if not os.path.exists(corpus_dir):
    os.mkdir(corpus_dir)

# use training data
qlist = list()
for f in os.listdir(sample_K_train_dir):
    fpath = os.path.join(sample_K_train_dir, f)
    qlist += load_pickle(fpath)
print("# qlist = %s" % len(qlist))

title_corpus_fpath = os.path.join(corpus_dir, "title_corpus.txt")
if not os.path.exists(title_corpus_fpath):
    build_corpus(qlist, ["title"], title_corpus_fpath)
else:
    print("title corpus already exists.")

desc_text_corpus_fpath = os.path.join(corpus_dir, "desc_text_corpus.txt")
if not os.path.exists(desc_text_corpus_fpath):
    build_corpus(qlist, ["desc_text"], desc_text_corpus_fpath)
else:
    print("desc_text corpus already exists.")

desc_code_corpus_fpath = os.path.join(corpus_dir, "desc_code_corpus.txt")
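# NOTE: build_corpus is assumed to dump the requested fields of every question in
# qlist to a plain-text corpus file, one question per line (e.g., as input for
# vocabulary or embedding training). A minimal sketch under that assumption; the
# getattr-based field access and whitespace join are hypothetical details.
def build_corpus(qlist, fields, corpus_fpath):
    """Write the given fields of each question to corpus_fpath, one line per question."""
    with open(corpus_fpath, "w", encoding="utf-8") as f:
        for q in qlist:
            # fields is e.g. ["title"] or ["desc_text"]
            line = " ".join(str(getattr(q, field)) for field in fields)
            f.write(line + "\n")
    print("# corpus lines = %s -> %s" % (len(qlist), corpus_fpath))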
ts_dir = dataset_dir + os.sep + "ts%s" % ts

# sample_K dir
sample_K = "test100000"
sample_K_dir = ts_dir + os.sep + "data-%s" % sample_K
vocab_dir = os.path.join(sample_K_dir, "vocab")

app_name = "tagrcnn"
app_dir = os.path.join(sample_K_dir, "approach", app_name)
snapshot_dir = os.path.join(app_dir, "snapshot")
if not os.path.exists(snapshot_dir):
    print("snapshot dir %s does not exist!" % snapshot_dir)
    exit()

# input files
text_vocab_fpath = os.path.join(vocab_dir, "title_desc_text_vocab.pkl")
text_vocab = load_pickle(text_vocab_fpath)
text_vocab = vocab_to_index_dict(vocab=text_vocab, ifpad=True)

tag_vocab_fpath = os.path.join(vocab_dir, "tag_vocab.pkl")
tag_vocab = load_pickle(tag_vocab_fpath)
tag_vocab = vocab_to_index_dict(vocab=tag_vocab, ifpad=False)

# basic path
test_dir = sample_K_dir + os.sep + "test"
print("Setting:\ntask : %s\ndataset : %s\nts : %s\n" % (task, dataset, ts))

snapshot_name = "04-01-19_14-41-13"
checkpoint_dir = os.path.join(app_dir, "snapshot", snapshot_name, "checkpoints")
if not os.path.exists(checkpoint_dir):
    print("checkpoint dir %s does not exist!" % checkpoint_dir)
    exit()
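# NOTE: the checkpoint directory resolved above is presumably restored later with
# the TensorFlow 1.x Saver API; a commented, illustrative sketch of that pattern
# (model/graph construction is omitted, so this is not the script's actual code):
# checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
# saver = tf.train.import_meta_graph("%s.meta" % checkpoint_file)
# with tf.Session() as sess:
#     saver.restore(sess, checkpoint_file)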
                    help='comma-separated kernel sizes to use for convolution')
############################################################################
args = parser.parse_args()

# initial
# len
# len_dict = load_pickle(len_dict_fpath)
len_dict = dict()
len_dict["max_title_len"] = 100
len_dict["max_desc_text_len"] = 1000
len_dict["max_desc_code_len"] = 1000
args.max_title_len = len_dict["max_title_len"]
args.max_desc_text_len = len_dict["max_desc_text_len"]
args.max_desc_code_len = len_dict["max_desc_code_len"]

# title vocab
title_vocab = load_pickle(title_vocab_fpath)
title_vocab = vocab_to_index_dict(vocab=title_vocab, ifpad=True)
args.title_embed_num = len(title_vocab)

# desc_text vocab
desc_text_vocab = load_pickle(desc_text_vocab_fpath)
desc_text_vocab = vocab_to_index_dict(vocab=desc_text_vocab, ifpad=True)
args.desc_text_embed_num = len(desc_text_vocab)

# desc_code_vocab
desc_code_vocab = load_pickle(desc_code_vocab_fpath)
desc_code_vocab = vocab_to_index_dict(vocab=desc_code_vocab, ifpad=True)
args.desc_code_embed_num = len(desc_code_vocab)

# tag vocab
tag_vocab = load_pickle(tag_vocab_fpath)
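# NOTE: vocab_to_index_dict is assumed to map every vocabulary entry to an integer
# index; with ifpad=True, index 0 is reserved for a padding symbol so that padded
# positions in title/desc sequences do not collide with real tokens. A sketch,
# assuming vocab is an iterable of tokens (the "<pad>" name is hypothetical):
def vocab_to_index_dict(vocab, ifpad):
    index_dict = dict()
    offset = 0
    if ifpad:
        index_dict["<pad>"] = 0
        offset = 1
    for i, token in enumerate(vocab):
        index_dict[token] = i + offset
    return index_dict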
# initial
len_dict_fpath = os.path.join(vocab_dir, "len.pkl")
title_vocab_fpath = os.path.join(vocab_dir, "title_vocab.pkl")
desc_text_vocab_fpath = os.path.join(vocab_dir, "desc_text_vocab.pkl")
desc_code_vocab_fpath = os.path.join(vocab_dir, "desc_code_vocab.pkl")
tag_vocab_fpath = os.path.join(vocab_dir, "tag_vocab.pkl")

# len
# len_dict = load_pickle(len_dict_fpath)
len_dict = dict()
len_dict["max_title_len"] = 100
len_dict["max_desc_text_len"] = 1000
len_dict["max_desc_code_len"] = 1000

# title vocab
title_vocab = load_pickle(title_vocab_fpath)
title_vocab = vocab_to_index_dict(vocab=title_vocab, ifpad=True)

# desc_text vocab
desc_text_vocab = load_pickle(desc_text_vocab_fpath)
desc_text_vocab = vocab_to_index_dict(vocab=desc_text_vocab, ifpad=True)

# desc_code_vocab
desc_code_vocab = load_pickle(desc_code_vocab_fpath)
desc_code_vocab = vocab_to_index_dict(vocab=desc_code_vocab, ifpad=True)

# tag vocab
tag_vocab = load_pickle(tag_vocab_fpath)
tag_vocab = vocab_to_index_dict(vocab=tag_vocab, ifpad=False)

# predict
# basic path
print("Setting:\ntask : %s\ndataset : %s\nts : %s\n" % (task, dataset, ts))

#################################################################################
# predict
test_dir = os.path.join(simple_K_dir, "test")

# get sample test data
sample_size = 20000
sample_cnt = 10
all_test_data = list()

sample_test_data_dir = os.path.join(simple_K_dir, "sample_test")
if not os.path.exists(sample_test_data_dir):
    os.mkdir(sample_test_data_dir)
elif len(os.listdir(sample_test_data_dir)) > 0:
    print("sample test data is not empty!")
    exit()

for f in sorted(os.listdir(test_dir)):
    test_data_fpath = os.path.join(test_dir, f)
    test_data = load_pickle(test_data_fpath)
    all_test_data += test_data

for i in range(sample_cnt):
    sample_test_data = random.sample(all_test_data, sample_size)
    sample_test_data_fpath = os.path.join(sample_test_data_dir,
                                          "%s_sampled_test_data_%s.pkl" % (i, sample_size))
    save_pickle(sample_test_data, sample_test_data_fpath)
    print("#sample test = %s" % len(sample_test_data))
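# NOTE: random.sample draws a different sample on every run; if the ten sampled
# test sets need to be reproducible across runs, seeding the generator before the
# sampling loop is one option (an assumption, not part of the original script):
# random.seed(0)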
# sample_K dir
sample_K = "test100000"
sample_K_dir = ts_dir + os.sep + "data-%s" % sample_K
vocab_dir = os.path.join(sample_K_dir, "vocab")

app_name = "tagrcnn"
app_dir = os.path.join(sample_K_dir, "approach", app_name)
if not os.path.exists(app_dir):
    os.mkdir(app_dir)
snapshot_dir = os.path.join(app_dir, "snapshot")
if not os.path.exists(snapshot_dir):
    os.mkdir(snapshot_dir)

# input files
text_vocab_fpath = os.path.join(vocab_dir, "title_desc_text_vocab.pkl")
text_vocab = load_pickle(text_vocab_fpath)
text_vocab = vocab_to_index_dict(vocab=text_vocab, ifpad=True)

tag_vocab_fpath = os.path.join(vocab_dir, "tag_vocab.pkl")
tag_vocab = load_pickle(tag_vocab_fpath)
tag_vocab = vocab_to_index_dict(vocab=tag_vocab, ifpad=False)

# basic path
train_dir = sample_K_dir + os.sep + "train"
print("Setting:\ntask : %s\ndataset : %s\nts : %s\n" % (task, dataset, ts))

#################################################################################
# Parameters
# ==================================================

# Data loading params
# NOTE: tf.flags is the TensorFlow 1.x flag-definition API (removed in TF 2.x),
# so this script assumes a TF 1.x environment.
tf.flags.DEFINE_float("dev_sample_percentage", .1,