def generate_summary():
    """Build the summarization dataset pickle from raw article/title files.

    Pipeline:
      1. Obtain a summary vocabulary (fit from train/*.txt or load a pickle),
         unless a pre-merged vocabulary file is supplied.
      2. Merge the summary vocabulary into the classification ("classi")
         vocabulary so both tasks share one word index, capped at
         FLAGS.max_words.
      3. Transform train/valid article and title files into word ids via
         `_load_words` and dump them as `summary_dataset.pickle`.

    Side effects: reads/writes files under FLAGS.data_dir / FLAGS.output_dir
    and logs progress. Returns None.
    """
    wordCount = WordCounter(lower_case=FLAGS.lower_case)
    # fixed seed so the logged samples are reproducible across runs
    rand = np.random.RandomState(seed=8888)
    if FLAGS.merged_summary_vocab_freqs_file is None:
        if FLAGS.summary_vocab_freqs_file is None:
            logger.info("generating summary vocabulary...")
            wordCount.fit(glob.glob(osp.join(FLAGS.data_dir, "train/*.txt")),
                          doc_count_threshold=FLAGS.doc_count_threshold)
            logger.info("saving summary vocabulary...")
            with open(osp.join(FLAGS.output_dir, "summary_word_freqs.pickle"),
                      "wb") as f:
                pickle.dump(wordCount.words_list, f)
        else:
            logger.info("loading summary vocabulary...")
            with open(FLAGS.summary_vocab_freqs_file, "rb") as f:
                wordCount.words_list = pickle.load(f)
        logger.info("summary vocabulary counts: %s; most frequent words: %s",
                    len(wordCount.words_list), str(wordCount.words_list[:5]))
        logger.info("loading classi vocabulary...")
        with open(FLAGS.vocab_freqs_file, "rb") as f:
            classi_vocabs = pickle.load(f)
        classiWordCount = WordCounter(lower_case=FLAGS.lower_case)
        classiWordCount.words_list = classi_vocabs
        logger.info("classi vocabulary counts: %s; most frequent words: %s",
                    len(classiWordCount.words_list),
                    str(classiWordCount.words_list[:5]))
        logger.info("merging summary vocabs and classi vocabs..")
        intersect_count, range_intersect_count = classiWordCount.merge(
            wordCount, max_intersect_wordnum=FLAGS.max_words)
        # was a bare print(); route through the logger like every other
        # status message in this file
        logger.info("intersect_count: %s, range_intersect_count: %s",
                    intersect_count, range_intersect_count)
        wordCount = classiWordCount
    else:
        with open(FLAGS.merged_summary_vocab_freqs_file, "rb") as f:
            wordCount.words_list = pickle.load(f)
    logger.info(
        "merged summary vocabulary counts: %s; most frequent words: %s",
        len(wordCount.words_list), str(wordCount.words_list[:5]))
    if FLAGS.merged_summary_vocab_freqs_file is None:
        logger.info("saving merged summary vocabulary...")
        with open(
                osp.join(FLAGS.output_dir,
                         "merged_summary_word_freqs.pickle"), "wb") as f:
            pickle.dump(wordCount.words_list, f)
        # empty marker file whose *name* records the final vocabulary size
        with open(
                osp.join(FLAGS.output_dir,
                         "total_%s_words" % len(wordCount.words_list)),
                "w"):
            pass
    # transform words
    logger.info("transforming words...")
    logger.info("transforming training article words...")
    training_article = _load_words("train/train.article.txt", wordCount,
                                   FLAGS.max_words)
    logger.info("transforming training title words...")
    training_title = _load_words("train/train.title.txt", wordCount,
                                 FLAGS.max_words)
    logger.info("transforming valid article words...")
    valid_article = _load_words("train/valid.article.filter.txt", wordCount,
                                FLAGS.max_words)
    logger.info("transforming valid title words...")
    valid_title = _load_words("train/valid.title.filter.txt", wordCount,
                              FLAGS.max_words)
    # validation split is folded into training — presumably evaluation is
    # done elsewhere; TODO(review): confirm this is intentional
    training_article = training_article + valid_article
    training_title = training_title + valid_title
    # sample: log one random document from each list as a sanity check
    article_pos_sample_index = rand.choice(len(training_article), 1)[0]
    title_pos_sample_index = rand.choice(len(training_title), 1)[0]
    logger.info("training_article sample: %s",
                training_article[article_pos_sample_index])
    logger.info("training_title sample: %s",
                training_title[title_pos_sample_index])
    # save
    logger.info("saving...")
    pickle_data = {
        "training_article": training_article,
        "training_title": training_title
    }
    with open(osp.join(FLAGS.output_dir, "summary_dataset.pickle"),
              "wb") as f:
        pickle.dump(pickle_data, f)
def _transform_split(wordCount, filename, split_name):
    """Transform one raw IMDB text file into word-id sequences.

    Args:
      wordCount: fitted WordCounter used for the transform.
      filename: file name relative to FLAGS.data_dir.
      split_name: label used in the stats log line (e.g. "training_pos").

    Returns:
      The list of transformed documents.

    Logs total document count and min/max sequence length for the split.
    Note: min/max raise ValueError on an empty split, i.e. it fails fast
    on missing data, matching the previous inline behavior.
    """
    data = wordCount.transform(
        [osp.join(FLAGS.data_dir, filename)],
        max_words=FLAGS.max_words,
        include_unk=FLAGS.include_unk)
    logger.info(
        "total number of %s: %s; min_seqlen in %s_data: %s; "
        "max_seqlen in %s_data: %s",
        split_name, len(data), split_name, min(map(len, data)),
        split_name, max(map(len, data)))
    return data


def generate_imdb():
    """Build the IMDB sentiment dataset pickle from raw review files.

    Pipeline:
      1. Obtain the vocabulary (fit from train_test_unsup/*.txt or load
         FLAGS.vocab_freqs_file).
      2. Transform the five splits (train/test pos/neg, unsupervised) into
         word-id sequences, capped at FLAGS.max_words.
      3. Attach labels (pos -> [1], neg -> [0], int8 column vectors),
         shuffle each split, and dump everything as `imdb_dataset.pickle`.

    Side effects: reads/writes files under FLAGS.data_dir / FLAGS.output_dir
    and logs progress. Returns None.
    """
    wordCount = WordCounter(lower_case=FLAGS.lower_case)
    # fixed seed so shuffles and the logged samples are reproducible
    rand = np.random.RandomState(seed=8888)
    # vocab frequencies
    if FLAGS.vocab_freqs_file is None:
        logger.info("generating imdb vocabulary...")
        wordCount.fit(
            glob.glob(osp.join(FLAGS.data_dir, "train_test_unsup/*.txt")),
            doc_count_threshold=FLAGS.doc_count_threshold)
        logger.info("saving imdb vocabulary...")
        with open(osp.join(FLAGS.output_dir, "imdb_word_freqs.pickle"),
                  "wb") as f:
            pickle.dump(wordCount.words_list, f)
    else:
        logger.info("loading imdb vocabulary...")
        with open(FLAGS.vocab_freqs_file, "rb") as f:
            wordCount.words_list = pickle.load(f)
    logger.info("vocabulary counts: %s; most frequent words: %s",
                len(wordCount.words_list), str(wordCount.words_list[:5]))
    # transform words: the five splits share one transform+stats routine
    logger.info("transforming words...")
    logger.info("transforming training-pos words...")
    training_pos_data = _transform_split(wordCount, "train_pos.txt",
                                         "training_pos")
    logger.info("transforming training-neg words...")
    training_neg_data = _transform_split(wordCount, "train_neg.txt",
                                         "training_neg")
    logger.info("transforming testing-pos words...")
    testing_pos_data = _transform_split(wordCount, "test_pos.txt",
                                        "testing_pos")
    logger.info("transforming testing-neg words...")
    testing_neg_data = _transform_split(wordCount, "test_neg.txt",
                                        "testing_neg")
    logger.info("transforming train_unsup words...")
    unsup_data = _transform_split(wordCount, "train_unsup.txt", "unsup")
    # labels shaped (n, 1): [[1], [1], ...] for pos, [[0], [0], ...] for neg
    training_pos_label = np.ones((len(training_pos_data), 1), dtype=np.int8)
    training_neg_label = np.zeros((len(training_neg_data), 1), dtype=np.int8)
    testing_pos_label = np.ones((len(testing_pos_data), 1), dtype=np.int8)
    testing_neg_label = np.zeros((len(testing_neg_data), 1), dtype=np.int8)
    # shuffle (in place, per split; labels are constant per split so they
    # need no matching permutation)
    logger.info("shuffling docs...")
    rand.shuffle(training_pos_data)
    rand.shuffle(training_neg_data)
    rand.shuffle(testing_pos_data)
    rand.shuffle(testing_neg_data)
    rand.shuffle(unsup_data)
    # sample: log one random document as a sanity check
    training_pos_sample_index = rand.choice(len(training_pos_data), 1)[0]
    testing_pos_sample_index = rand.choice(len(testing_pos_data), 1)[0]
    logger.info("training_pos sample: %s",
                training_pos_data[training_pos_sample_index])
    logger.info("testing_pos sample: %s",
                testing_pos_data[testing_pos_sample_index])
    # save
    logger.info("saving...")
    pickle_data = {
        "training_pos_data": training_pos_data,
        "training_neg_data": training_neg_data,
        "testing_pos_data": testing_pos_data,
        "testing_neg_data": testing_neg_data,
        "unsup_data": unsup_data,
        "training_pos_label": training_pos_label,
        "training_neg_label": training_neg_label,
        "testing_pos_label": testing_pos_label,
        "testing_neg_label": testing_neg_label
    }
    with open(osp.join(FLAGS.output_dir, "imdb_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)