def __init__(self, base_dir, dataset):
    # only the imdb dataset is supported for now
    self.supported_dataset = ["imdb"]
    if dataset not in self.supported_dataset:
        raise ValueError("Unsupported dataset: %s" % dataset)
    self.base_dir = base_dir
    self.dataset = dataset
    # fixed seed keeps shuffling/sampling reproducible across runs
    self.rand = np.random.RandomState(seed=8888)
    self.wordCounter = WordCounter()
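Only the constructor is shown above. A minimal usage sketch follows; the class name DataLoader and the path are hypothetical, since the excerpt does not show the enclosing class:

# hypothetical usage of the constructor above
loader = DataLoader(base_dir="data/imdb", dataset="imdb")   # accepted
loader = DataLoader(base_dir="data/imdb", dataset="yelp")   # raises ValueError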
import sys
sys.path.insert(0, ".")

from adversarial_net.models import LanguageModel
from adversarial_net import arguments as flags
from adversarial_net.preprocessing import WordCounter
from adversarial_net import osp

flags.add_argument(
    name="save_model_dir", argtype=str,
    default="E:/kaggle/avito/imdb_testset/adversarial_net/model/lm_model/lm_model.ckpt")

if __name__ == "__main__":
    # frequencies of the vocab_size most common words form the LM vocabulary
    vocab_freqs = WordCounter().load(
        osp.join(flags["lm_inputs"]["datapath"], "imdb_word_freqs.pickle")).most_common_freqs(
            flags["lm_sequence"]["vocab_size"])
    flags.add_variable(name="vocab_freqs", value=vocab_freqs)
    # build the language-model graph and train, checkpointing to save_model_dir
    lm_model = LanguageModel()
    lm_model.build()
    lm_model.fit(save_model_path=flags["save_model_dir"])
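Assuming this entry point is saved as, say, train_lm.py (a hypothetical filename) and that arguments.add_argument exposes the flag on the command line, a typical invocation might look like:

# hypothetical invocation; the flag name follows the add_argument call above
python train_lm.py --save_model_dir=model/lm_model/lm_model.ckpt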
# evaluate the classification branch of the trained summary model
model_save_suffix = model_save_suffixes["train_summary_cl_model"]
save_model_path = osp.join(flags.save_model_dir, model_save_suffix)
generator_model = AdversarialSummaryModel()
generator_model.build(eval_cl=True)
generator_model.eval(save_model_path=save_model_path)

# intersection count between classi word_freqs and summary word_freqs:
# {10000: 9652, 20000: 18673, 30000: 26590, 40000: 33259, 50000: 38737,
#  60000: 43262, 70000: 46964, 80000: 49788, 86934: 51515}
if __name__ == "__main__":
    if flags.step == "train_summary_model" or flags.step == "eval_summary_model":
        # merge the classification ("classi") vocabulary with the summary vocabulary,
        # recording how many words the two share at each vocabulary-size checkpoint
        intersect_count = []
        vocab_freqs = WordCounter().load_and_merge(
            osp.join(flags["lm_inputs"]["datapath"],
                     "%s_word_freqs.pickle" % flags["lm_inputs"]["dataset"]),
            osp.join(flags["lm_inputs"]["datapath"], "summary_word_freqs.pickle"),
            max_words=list(range(0, flags["inputs"]["vocab_size"], 10000))[1:]
                      + [flags["inputs"]["vocab_size"]],
            return_cache=intersect_count).most_common_freqs(
                flags["lm_sequence"]["vocab_size"])
        intersect_count = intersect_count[0]
        logger.info("intersection count between classi word_freqs and summary word_freqs: %s"
                    % intersect_count)
    else:
        vocab_freqs = WordCounter().load(
            osp.join(flags["lm_inputs"]["datapath"],
                     "%s_word_freqs.pickle" % flags["lm_inputs"]["dataset"])).most_common_freqs(
                flags["lm_sequence"]["vocab_size"])
    flags.add_variable(name="vocab_freqs", value=vocab_freqs)
    if flags.step == "train_lm_model":
@classmethod  # decorator implied by the cls parameter in the excerpt
def reload_word_counter(cls, vocab_abspath):
    # rebuild a WordCounter from a pickled words_list (the *_word_freqs.pickle files)
    wordCounter = WordCounter()
    with open(vocab_abspath, "rb") as f:
        wordCounter.words_list = pickle.load(f)
    return wordCounter
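This pairs with the vocabulary pickles written by the generators below. A minimal sketch, assuming the classmethod lives on WordCounter and with a hypothetical path:

# restore the vocabulary saved by generate_imdb() below
wordCounter = WordCounter.reload_word_counter("data/imdb_word_freqs.pickle")
# usage then mirrors the training scripts above
vocab_freqs = wordCounter.most_common_freqs(50000)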
def generate_summary():
    wordCount = WordCounter(lower_case=FLAGS.lower_case)
    rand = np.random.RandomState(seed=8888)
    if FLAGS.merged_summary_vocab_freqs_file is None:
        if FLAGS.summary_vocab_freqs_file is None:
            # count word frequencies over the raw summary training files
            logger.info("generating summary vocabulary...")
            wordCount.fit(glob.glob(osp.join(FLAGS.data_dir, "train/*.txt")),
                          doc_count_threshold=FLAGS.doc_count_threshold)
            logger.info("saving summary vocabulary...")
            with open(osp.join(FLAGS.output_dir, "summary_word_freqs.pickle"), "wb") as f:
                pickle.dump(wordCount.words_list, f)
        else:
            logger.info("loading summary vocabulary...")
            with open(FLAGS.summary_vocab_freqs_file, "rb") as f:
                wordCount.words_list = pickle.load(f)
        logger.info("summary vocabulary counts: %s; most frequent words: %s" %
                    (len(wordCount.words_list), str(wordCount.words_list[:5])))
        logger.info("loading classi vocabulary...")
        with open(FLAGS.vocab_freqs_file, "rb") as f:
            classi_vocabs = pickle.load(f)
        classiWordCount = WordCounter(lower_case=FLAGS.lower_case)
        classiWordCount.words_list = classi_vocabs
        logger.info("classi vocabulary counts: %s; most frequent words: %s" %
                    (len(classiWordCount.words_list), str(classiWordCount.words_list[:5])))
        # merge the summary vocabulary into the classification vocabulary so both
        # tasks share one word-id space
        logger.info("merging summary vocabs and classi vocabs...")
        intersect_count, range_intersect_count = classiWordCount.merge(
            wordCount, max_intersect_wordnum=FLAGS.max_words)
        logger.info("intersect_count: %s, range_intersect_count: %s" %
                    (intersect_count, range_intersect_count))
        wordCount = classiWordCount
    else:
        with open(FLAGS.merged_summary_vocab_freqs_file, "rb") as f:
            wordCount.words_list = pickle.load(f)
        logger.info("merged summary vocabulary counts: %s; most frequent words: %s" %
                    (len(wordCount.words_list), str(wordCount.words_list[:5])))
    if FLAGS.merged_summary_vocab_freqs_file is None:
        logger.info("saving merged summary vocabulary...")
        with open(osp.join(FLAGS.output_dir, "merged_summary_word_freqs.pickle"), "wb") as f:
            pickle.dump(wordCount.words_list, f)
        # empty marker file recording the merged vocabulary size in its name
        with open(osp.join(FLAGS.output_dir, "total_%s_words" % len(wordCount.words_list)), "w"):
            pass
    # transform words into id sequences
    logger.info("transforming words...")
    logger.info("transforming training article words...")
    training_article = _load_words("train/train.article.txt", wordCount, FLAGS.max_words)
    logger.info("transforming training title words...")
    training_title = _load_words("train/train.title.txt", wordCount, FLAGS.max_words)
    logger.info("transforming valid article words...")
    valid_article = _load_words("train/valid.article.filter.txt", wordCount, FLAGS.max_words)
    logger.info("transforming valid title words...")
    valid_title = _load_words("train/valid.title.filter.txt", wordCount, FLAGS.max_words)
    # fold the validation split into the training set
    training_article = training_article + valid_article
    training_title = training_title + valid_title
    # log a random article and a random title as a sanity check
    # (the two indices are drawn independently)
    article_pos_sample_index = rand.choice(len(training_article), 1)[0]
    title_pos_sample_index = rand.choice(len(training_title), 1)[0]
    logger.info("training_article sample: %s" % training_article[article_pos_sample_index])
    logger.info("training_title sample: %s" % training_title[title_pos_sample_index])
    # save
    logger.info("saving...")
    pickle_data = {
        "training_article": training_article,
        "training_title": training_title,
    }
    with open(osp.join(FLAGS.output_dir, "summary_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)
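A minimal sketch of consuming the artifact written above; the key names match the pickle_data dict, and the equal-length assertion is an assumption based on the parallel article/title files:

import pickle

# reload the (article, title) pairs produced by generate_summary();
# the path is relative to FLAGS.output_dir
with open("summary_dataset.pickle", "rb") as f:
    summary_dataset = pickle.load(f)
articles = summary_dataset["training_article"]
titles = summary_dataset["training_title"]
assert len(articles) == len(titles)  # assumed: one title per article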
def generate_imdb():
    wordCount = WordCounter(lower_case=FLAGS.lower_case)
    rand = np.random.RandomState(seed=8888)
    # vocab frequencies
    if FLAGS.vocab_freqs_file is None:
        logger.info("generating imdb vocabulary...")
        # fit over the train, test, and unsupervised reviews together
        wordCount.fit(glob.glob(osp.join(FLAGS.data_dir, "train_test_unsup/*.txt")),
                      doc_count_threshold=FLAGS.doc_count_threshold)
        logger.info("saving imdb vocabulary...")
        with open(osp.join(FLAGS.output_dir, "imdb_word_freqs.pickle"), "wb") as f:
            pickle.dump(wordCount.words_list, f)
    else:
        logger.info("loading imdb vocabulary...")
        with open(FLAGS.vocab_freqs_file, "rb") as f:
            wordCount.words_list = pickle.load(f)
    logger.info("vocabulary counts: %s; most frequent words: %s" %
                (len(wordCount.words_list), str(wordCount.words_list[:5])))

    # transform words into id sequences, one split at a time
    logger.info("transforming words...")

    def _transform(filename, name):
        # turn one raw-text split into id sequences and log its size statistics
        logger.info("transforming %s words..." % name)
        data = wordCount.transform([osp.join(FLAGS.data_dir, filename)],
                                   max_words=FLAGS.max_words,
                                   include_unk=FLAGS.include_unk)
        logger.info("total number of %s: %s; min_seqlen in %s_data: %s; max_seqlen in %s_data: %s" %
                    (name, len(data), name, min(map(len, data)), name, max(map(len, data))))
        return data

    training_pos_data = _transform("train_pos.txt", "training_pos")
    training_neg_data = _transform("train_neg.txt", "training_neg")
    testing_pos_data = _transform("test_pos.txt", "testing_pos")
    testing_neg_data = _transform("test_neg.txt", "testing_neg")
    unsup_data = _transform("train_unsup.txt", "unsup")
    # build (n, 1) label columns, i.e. [[0], [1], ...]
    training_pos_label = np.ones((len(training_pos_data), 1), dtype=np.int8)
    training_neg_label = np.zeros((len(training_neg_data), 1), dtype=np.int8)
    testing_pos_label = np.ones((len(testing_pos_data), 1), dtype=np.int8)
    testing_neg_label = np.zeros((len(testing_neg_data), 1), dtype=np.int8)
    # shuffle each split in place; labels are constant within a split, so they
    # stay aligned without shuffling
    logger.info("shuffling docs...")
    rand.shuffle(training_pos_data)
    rand.shuffle(training_neg_data)
    rand.shuffle(testing_pos_data)
    rand.shuffle(testing_neg_data)
    rand.shuffle(unsup_data)
    # log one random document from the positive splits as a sanity check
    training_pos_sample_index = rand.choice(len(training_pos_data), 1)[0]
    testing_pos_sample_index = rand.choice(len(testing_pos_data), 1)[0]
    logger.info("training_pos sample: %s" % training_pos_data[training_pos_sample_index])
    logger.info("testing_pos sample: %s" % testing_pos_data[testing_pos_sample_index])
    # save all splits and labels into one pickle
    logger.info("saving...")
    pickle_data = {
        "training_pos_data": training_pos_data,
        "training_neg_data": training_neg_data,
        "testing_pos_data": testing_pos_data,
        "testing_neg_data": testing_neg_data,
        "unsup_data": unsup_data,
        "training_pos_label": training_pos_label,
        "training_neg_label": training_neg_label,
        "testing_pos_label": testing_pos_label,
        "testing_neg_label": testing_neg_label,
    }
    with open(osp.join(FLAGS.output_dir, "imdb_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)
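For completeness, a minimal sketch of reading the dataset back; the key names are exactly those in pickle_data above, while the path and variable names are illustrative:

import pickle
import numpy as np

# path relative to FLAGS.output_dir
with open("imdb_dataset.pickle", "rb") as f:
    imdb = pickle.load(f)
# labeled training set: lists of id sequences plus (n, 1) int8 label columns
X_train = imdb["training_pos_data"] + imdb["training_neg_data"]
y_train = np.concatenate([imdb["training_pos_label"], imdb["training_neg_label"]])
assert len(X_train) == y_train.shape[0]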