Example #1
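# Constructor of a dataset-loader class: it validates that the requested
# dataset is supported (currently only "imdb"), stores the base directory,
# seeds a NumPy RandomState, and creates a WordCounter for vocabulary handling.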
    def __init__(self, base_dir, dataset):
        self.supported_dataset = ["imdb"]
        if dataset not in self.supported_dataset:
            raise Exception("Unsupported dataset: %s" % dataset)
        self.base_dir = base_dir
        self.dataset = dataset
        self.rand = np.random.RandomState(seed=8888)
        self.wordCounter = WordCounter()
Example #2
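# Training entry point for the LanguageModel: it loads the pickled IMDB word
# frequencies with WordCounter, keeps the vocab_size most frequent entries,
# registers them as the "vocab_freqs" flag variable, then builds and fits the
# model, saving checkpoints to save_model_dir.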
import sys
sys.path.insert(0, ".")
from adversarial_net.models import LanguageModel
from adversarial_net import arguments as flags
from adversarial_net.preprocessing import WordCounter
from adversarial_net import osp
flags.add_argument(
    name="save_model_dir",
    argtype=str,
    default=
    "E:/kaggle/avito/imdb_testset/adversarial_net/model/lm_model/lm_model.ckpt"
)

if __name__ == "__main__":
    vocab_freqs = WordCounter().load(
        osp.join(flags["lm_inputs"]["datapath"],
                 "imdb_word_freqs.pickle")).most_common_freqs(
                     flags["lm_sequence"]["vocab_size"])
    flags.add_variable(name="vocab_freqs", value=vocab_freqs)
    lm_model = LanguageModel()
    lm_model.build()
    lm_model.fit(save_model_path=flags["save_model_dir"])
Example #3
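# Excerpt from a training/evaluation driver. For the summary steps it merges
# the classification ("classi") and summary word-frequency pickles with
# load_and_merge() and logs how many words the two vocabularies share;
# otherwise it loads a single word-frequency pickle. Either way, the resulting
# frequencies are registered as the "vocab_freqs" flag variable.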
    model_save_suffix = model_save_suffixes["train_summary_cl_model"]
    save_model_path = osp.join(flags.save_model_dir, model_save_suffix)
    generator_model = AdversarialSummaryModel()
    generator_model.build(eval_cl=True)
    generator_model.eval(save_model_path=save_model_path)


# intersection count between classi word_freqs and summary word_freqs: {10000: 9652, 20000: 18673, 30000: 26590, 40000: 33259, 50000: 38737, 60000: 43262, 70000: 46964, 80000: 49788, 86934: 51515}
if __name__ == "__main__":
    if flags.step == "train_summary_model" or flags.step == "eval_summary_model":
        intersect_count = []
        vocab_freqs = WordCounter().load_and_merge(
            osp.join(flags["lm_inputs"]["datapath"],
                     "%s_word_freqs.pickle" % flags["lm_inputs"]["dataset"]),
            osp.join(flags["lm_inputs"]["datapath"],
                     "summary_word_freqs.pickle"),
            max_words=list(range(0, flags["inputs"]["vocab_size"],
                                 10000))[1:] + [flags["inputs"]["vocab_size"]],
            return_cache=intersect_count).most_common_freqs(
                flags["lm_sequence"]["vocab_size"])
        intersect_count = intersect_count[0]
        logger.info(
            "intersection count between classi word_freqs and summary word_freqs: %s"
            % intersect_count)
    else:
        vocab_freqs = WordCounter().load(
            osp.join(flags["lm_inputs"]["datapath"], "%s_word_freqs.pickle" %
                     flags["lm_inputs"]["dataset"])).most_common_freqs(
                         flags["lm_sequence"]["vocab_size"])
    flags.add_variable(name="vocab_freqs", value=vocab_freqs)
    if flags.step == "train_lm_model":
Example #4
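# Helper that rebuilds a WordCounter from a pickled words_list on disk
# (the cls parameter suggests it is used as a classmethod).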
    def reload_word_counter(cls, vocab_abspath):
        wordCounter = WordCounter()
        with open(vocab_abspath, "rb") as f:
            wordCounter.words_list = pickle.load(f)
        return wordCounter
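A minimal usage sketch for the helper above; it assumes reload_word_counter is
exposed as a classmethod on WordCounter, and the pickle path and vocabulary
size below are placeholders:

from adversarial_net.preprocessing import WordCounter

# reload a previously pickled vocabulary, then keep the most frequent entries
wordCounter = WordCounter.reload_word_counter("/path/to/imdb_word_freqs.pickle")
vocab_freqs = wordCounter.most_common_freqs(50000)  # same call as chained in Examples #2 and #3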
Example #5
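# Preprocessing for the summarization data: builds (or loads) the summary
# vocabulary, merges it with the classification ("classi") vocabulary,
# transforms the article/title files into word ids, logs one random sample,
# and pickles the merged vocabulary and the resulting dataset.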
def generate_summary():
    wordCount = WordCounter(lower_case=FLAGS.lower_case)
    rand = np.random.RandomState(seed=8888)
    if FLAGS.merged_summary_vocab_freqs_file is None:
        if FLAGS.summary_vocab_freqs_file is None:
            logger.info("generating summary vocabulary...")
            wordCount.fit(glob.glob(osp.join(FLAGS.data_dir, "train/*.txt")),
                          doc_count_threshold=FLAGS.doc_count_threshold)
            logger.info("saving summary vocabulary...")
            with open(osp.join(FLAGS.output_dir, "summary_word_freqs.pickle"),
                      "wb") as f:
                pickle.dump(wordCount.words_list, f)
        else:
            logger.info("loading summary vocabulary...")
            with open(FLAGS.summary_vocab_freqs_file, "rb") as f:
                wordCount.words_list = pickle.load(f)
        logger.info("summary vocabulary counts: %s; most frequent words: %s" %
                    (len(wordCount.words_list), str(wordCount.words_list[:5])))
        logger.info("loading classi vocabulary...")
        with open(FLAGS.vocab_freqs_file, "rb") as f:
            classi_vocabs = pickle.load(f)
            classiWordCount = WordCounter(lower_case=FLAGS.lower_case)
            classiWordCount.words_list = classi_vocabs
        logger.info("classi vocabulary counts: %s; most frequent words: %s" %
                    (len(classiWordCount.words_list),
                     str(classiWordCount.words_list[:5])))
        logger.info("merging summary vocabs and classi vocabs..")
        intersect_count, range_intersect_count = classiWordCount.merge(
            wordCount, max_intersect_wordnum=FLAGS.max_words)
        print("intersect_count: %s, range_intersect_count: %s" %
              (intersect_count, range_intersect_count))
        wordCount = classiWordCount
    else:
        with open(FLAGS.merged_summary_vocab_freqs_file, "rb") as f:
            wordCount.words_list = pickle.load(f)
    logger.info(
        "merged summary vocabulary counts: %s; most frequent words: %s" %
        (len(wordCount.words_list), str(wordCount.words_list[:5])))
    if FLAGS.merged_summary_vocab_freqs_file is None:
        logger.info("saving merged summary vocabulary...")
        with open(
                osp.join(FLAGS.output_dir, "merged_summary_word_freqs.pickle"),
                "wb") as f:
            pickle.dump(wordCount.words_list, f)
        with open(
                osp.join(FLAGS.output_dir,
                         "total_%s_words" % len(wordCount.words_list)), "w"):
            pass
    # transform words
    logger.info("transforming words...")
    logger.info("transforming training article words...")
    training_article = _load_words("train/train.article.txt", wordCount,
                                   FLAGS.max_words)
    logger.info("transforming training title words...")
    training_title = _load_words("train/train.title.txt", wordCount,
                                 FLAGS.max_words)
    logger.info("transforming valid article words...")
    valid_article = _load_words("train/valid.article.filter.txt", wordCount,
                                FLAGS.max_words)
    logger.info("transforming valid title words...")
    valid_title = _load_words("train/valid.title.filter.txt", wordCount,
                              FLAGS.max_words)

    training_article = training_article + valid_article
    training_title = training_title + valid_title
    # sample
    article_pos_sample_index = rand.choice(len(training_article), 1)[0]
    title_pos_sample_index = rand.choice(len(training_title), 1)[0]
    logger.info("training_article sample: %s" %
                training_article[article_pos_sample_index])
    logger.info("training_title sample: %s" %
                training_title[title_pos_sample_index])
    # save
    logger.info("saving...")
    pickle_data = {
        "training_article": training_article,
        "training_title": training_title
    }
    with open(osp.join(FLAGS.output_dir, "summary_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)
Example #6
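# Preprocessing for the IMDB data: builds (or loads) the vocabulary, transforms
# the train/test/unsup text files into word id sequences, attaches binary
# sentiment labels, shuffles each split, logs one random sample per class,
# and pickles everything as imdb_dataset.pickle.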
def generate_imdb():
    wordCount = WordCounter(lower_case=FLAGS.lower_case)
    rand = np.random.RandomState(seed=8888)
    # vocab frequencies
    if FLAGS.vocab_freqs_file is None:
        logger.info("generating imdb vocabulary...")
        wordCount.fit(glob.glob(
            osp.join(FLAGS.data_dir, "train_test_unsup/*.txt")),
                      doc_count_threshold=FLAGS.doc_count_threshold)
        logger.info("saving imdb vocabulary...")
        with open(osp.join(FLAGS.output_dir, "imdb_word_freqs.pickle"),
                  "wb") as f:
            pickle.dump(wordCount.words_list, f)
    else:
        logger.info("loading imdb vocabulary...")
        with open(FLAGS.vocab_freqs_file, "rb") as f:
            wordCount.words_list = pickle.load(f)
    logger.info("vocabulary counts: %s; most frequent words: %s" %
                (len(wordCount.words_list), str(wordCount.words_list[:5])))
    # transform words
    logger.info("transforming words...")
    logger.info("transforming training-pos words...")
    training_pos_data = wordCount.transform(
        [osp.join(FLAGS.data_dir, "train_pos.txt")],
        max_words=FLAGS.max_words,
        include_unk=FLAGS.include_unk)
    n_samples_training_pos = len(training_pos_data)
    min_seqlen_training_pos = min(map(len, training_pos_data))
    max_seqlen_training_pos = max(map(len, training_pos_data))
    logger.info(
        "total number of training_pos: %s; min_seqlen in training_pos_data: %s; max_seqlen in training_pos_data: %s"
        % (n_samples_training_pos, min_seqlen_training_pos,
           max_seqlen_training_pos))
    logger.info("transforming training-neg words...")
    training_neg_data = wordCount.transform(
        [osp.join(FLAGS.data_dir, "train_neg.txt")],
        max_words=FLAGS.max_words,
        include_unk=FLAGS.include_unk)
    n_samples_training_neg = len(training_neg_data)
    min_seqlen_training_neg = min(map(len, training_neg_data))
    max_seqlen_training_neg = max(map(len, training_neg_data))
    logger.info(
        "total number of training_neg: %s; min_seqlen in training_neg_data: %s; max_seqlen in training_neg_data: %s"
        % (n_samples_training_neg, min_seqlen_training_neg,
           max_seqlen_training_neg))
    logger.info("transforming testing-pos words...")
    testing_pos_data = wordCount.transform(
        [osp.join(FLAGS.data_dir, "test_pos.txt")],
        max_words=FLAGS.max_words,
        include_unk=FLAGS.include_unk)
    n_samples_testing_pos = len(testing_pos_data)
    min_seqlen_testing_pos = min(map(len, testing_pos_data))
    max_seqlen_testing_pos = max(map(len, testing_pos_data))
    logger.info(
        "total number of testing_pos: %s; min_seqlen in testing_pos_data: %s; max_seqlen in testing_pos_data: %s"
        % (n_samples_testing_pos, min_seqlen_testing_pos,
           max_seqlen_testing_pos))
    logger.info("transforming testing-neg words...")
    testing_neg_data = wordCount.transform(
        [osp.join(FLAGS.data_dir, "test_neg.txt")],
        max_words=FLAGS.max_words,
        include_unk=FLAGS.include_unk)
    n_samples_testing_neg = len(testing_neg_data)
    min_seqlen_testing_neg = min(map(len, testing_neg_data))
    max_seqlen_testing_neg = max(map(len, testing_neg_data))
    logger.info(
        "total number of testing_neg: %s; min_seqlen in testing_neg_data: %s; max_seqlen in testing_neg_data: %s"
        % (n_samples_testing_neg, min_seqlen_testing_neg,
           max_seqlen_testing_neg))
    logger.info("transforming train_unsup words...")
    unsup_data = wordCount.transform(
        [osp.join(FLAGS.data_dir, "train_unsup.txt")],
        max_words=FLAGS.max_words,
        include_unk=FLAGS.include_unk)
    n_samples_unsup = len(unsup_data)
    min_seqlen_unsup = min(map(len, unsup_data))
    max_seqlen_unsup = max(map(len, unsup_data))
    logger.info(
        "total number of unsup: %s; min_seqlen in unsup_data: %s; max_seqlen in unsup_data: %s"
        % (n_samples_unsup, min_seqlen_unsup, max_seqlen_unsup))
    # [[0], [1], ...]
    training_pos_label = np.ones((len(training_pos_data), 1), dtype=np.int8)
    training_neg_label = np.zeros((len(training_neg_data), 1), dtype=np.int8)
    testing_pos_label = np.ones((len(testing_pos_data), 1), dtype=np.int8)
    testing_neg_label = np.zeros((len(testing_neg_data), 1), dtype=np.int8)
    # shuffle
    logger.info("shuffling docs...")
    rand.shuffle(training_pos_data)
    rand.shuffle(training_neg_data)
    rand.shuffle(testing_pos_data)
    rand.shuffle(testing_neg_data)
    rand.shuffle(unsup_data)
    # sample
    training_pos_sample_index = rand.choice(n_samples_training_pos, 1)[0]
    testing_pos_sample_index = rand.choice(n_samples_testing_pos, 1)[0]
    logger.info("training_pos sample: %s" %
                training_pos_data[training_pos_sample_index])
    logger.info("testing_pos sample: %s" %
                testing_pos_data[testing_pos_sample_index])
    # save
    logger.info("saving...")
    pickle_data = {
        "training_pos_data": training_pos_data,
        "training_neg_data": training_neg_data,
        "testing_pos_data": testing_pos_data,
        "testing_neg_data": testing_neg_data,
        "unsup_data": unsup_data,
        "training_pos_label": training_pos_label,
        "training_neg_label": training_neg_label,
        "testing_pos_label": testing_pos_label,
        "testing_neg_label": testing_neg_label
    }
    with open(osp.join(FLAGS.output_dir, "imdb_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)