Example #1
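Both examples assume the same module-level setup. A minimal sketch of the
assumed imports and helpers (flag definitions, WordCounter, and _load_words
are defined elsewhere in the module, so the names below are inferred from
usage, not confirmed):

import glob
import os.path as osp
import pickle

import numpy as np

# Assumed to exist at module level:
#   FLAGS       - parsed command-line flags (e.g. absl.flags or tf.app.flags)
#   logger      - a configured logging.Logger
#   WordCounter - vocabulary builder exposing fit / transform / merge
#   _load_words - helper that reads a text file and maps words to ids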
def generate_summary():
    """Build or load the summary vocabulary, merge it with the
    classification ("classi") vocabulary, transform the article/title
    files into word ids, and pickle the result."""
    wordCount = WordCounter(lower_case=FLAGS.lower_case)
    # Fixed seed so the logged samples are reproducible across runs.
    rand = np.random.RandomState(seed=8888)
    if FLAGS.merged_summary_vocab_freqs_file is None:
        if FLAGS.summary_vocab_freqs_file is None:
            logger.info("generating summary vocabulary...")
            wordCount.fit(glob.glob(osp.join(FLAGS.data_dir, "train/*.txt")),
                          doc_count_threshold=FLAGS.doc_count_threshold)
            logger.info("saving summary vocabulary...")
            with open(osp.join(FLAGS.output_dir, "summary_word_freqs.pickle"),
                      "wb") as f:
                pickle.dump(wordCount.words_list, f)
        else:
            logger.info("loading summary vocabulary...")
            with open(FLAGS.summary_vocab_freqs_file, "rb") as f:
                wordCount.words_list = pickle.load(f)
        logger.info("summary vocabulary counts: %s; most frequent words: %s" %
                    (len(wordCount.words_list), str(wordCount.words_list[:5])))
        logger.info("loading classi vocabulary...")
        with open(FLAGS.vocab_freqs_file, "rb") as f:
            classi_vocabs = pickle.load(f)
            classiWordCount = WordCounter(lower_case=FLAGS.lower_case)
            classiWordCount.words_list = classi_vocabs
        logger.info("classi vocabulary counts: %s; most frequent words: %s" %
                    (len(classiWordCount.words_list),
                     str(classiWordCount.words_list[:5])))
        logger.info("merging summary vocabs and classi vocabs..")
        intersect_count, range_intersect_count = classiWordCount.merge(
            wordCount, max_intersect_wordnum=FLAGS.max_words)
        print("intersect_count: %s, range_intersect_count: %s" %
              (intersect_count, range_intersect_count))
        wordCount = classiWordCount
    else:
        with open(FLAGS.merged_summary_vocab_freqs_file, "rb") as f:
            wordCount.words_list = pickle.load(f)
    logger.info(
        "merged summary vocabulary counts: %s; most frequent words: %s" %
        (len(wordCount.words_list), str(wordCount.words_list[:5])))
    if FLAGS.merged_summary_vocab_freqs_file is None:
        logger.info("saving merged summary vocabulary...")
        with open(
                osp.join(FLAGS.output_dir, "merged_summary_word_freqs.pickle"),
                "wb") as f:
            pickle.dump(wordCount.words_list, f)
        # Touch an empty marker file whose name records the final vocab size.
        with open(
                osp.join(FLAGS.output_dir,
                         "total_%s_words" % len(wordCount.words_list)), "w"):
            pass
    # transform words
    logger.info("transforming words...")
    logger.info("transforming training article words...")
    training_article = _load_words("train/train.article.txt", wordCount,
                                   FLAGS.max_words)
    logger.info("transforming training title words...")
    training_title = _load_words("train/train.title.txt", wordCount,
                                 FLAGS.max_words)
    logger.info("transforming valid article words...")
    valid_article = _load_words("train/valid.article.filter.txt", wordCount,
                                FLAGS.max_words)
    logger.info("transforming valid title words...")
    valid_title = _load_words("train/valid.title.filter.txt", wordCount,
                              FLAGS.max_words)

    # Fold the filtered validation split into the training data.
    training_article = training_article + valid_article
    training_title = training_title + valid_title
    # sample
    article_pos_sample_index = rand.choice(len(training_article), 1)[0]
    title_pos_sample_index = rand.choice(len(training_title), 1)[0]
    logger.info("training_article sample: %s" %
                training_article[article_pos_sample_index])
    logger.info("training_title sample: %s" %
                training_title[title_pos_sample_index])
    # save
    logger.info("saving...")
    pickle_data = {
        "training_article": training_article,
        "training_title": training_title
    }
    with open(osp.join(FLAGS.output_dir, "summary_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)
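A minimal sketch of reading the saved dataset back (assuming the same
FLAGS.output_dir):

with open(osp.join(FLAGS.output_dir, "summary_dataset.pickle"), "rb") as f:
    dataset = pickle.load(f)
articles = dataset["training_article"]  # transformed article word sequences
titles = dataset["training_title"]      # transformed title word sequences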
Example #2
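This example assumes the same module-level setup sketched above Example #1.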
def generate_imdb():
    """Build or load the IMDB vocabulary, transform each data split into
    word ids, and pickle the labeled and unlabeled data."""
    wordCount = WordCounter(lower_case=FLAGS.lower_case)
    # Fixed seed so shuffling and the logged samples are reproducible.
    rand = np.random.RandomState(seed=8888)
    # vocab frequencies
    if FLAGS.vocab_freqs_file is None:
        logger.info("generating imdb vocabulary...")
        wordCount.fit(glob.glob(
            osp.join(FLAGS.data_dir, "train_test_unsup/*.txt")),
                      doc_count_threshold=FLAGS.doc_count_threshold)
        logger.info("saving imdb vocabulary...")
        with open(osp.join(FLAGS.output_dir, "imdb_word_freqs.pickle"),
                  "wb") as f:
            pickle.dump(wordCount.words_list, f)
    else:
        logger.info("loading imdb vocabulary...")
        with open(FLAGS.vocab_freqs_file, "rb") as f:
            wordCount.words_list = pickle.load(f)
    logger.info("vocabulary counts: %s; most frequent words: %s" %
                (len(wordCount.words_list), str(wordCount.words_list[:5])))
    # transform words: each split becomes a list of word-id sequences
    logger.info("transforming words...")

    def _transform_split(name, filename):
        """Transform one split and log its size and sequence-length range."""
        data = wordCount.transform([osp.join(FLAGS.data_dir, filename)],
                                   max_words=FLAGS.max_words,
                                   include_unk=FLAGS.include_unk)
        seqlens = [len(doc) for doc in data]
        logger.info("total number of %s: %s; min_seqlen: %s; max_seqlen: %s" %
                    (name, len(data), min(seqlens), max(seqlens)))
        return data

    logger.info("transforming training-pos words...")
    training_pos_data = _transform_split("training_pos", "train_pos.txt")
    logger.info("transforming training-neg words...")
    training_neg_data = _transform_split("training_neg", "train_neg.txt")
    logger.info("transforming testing-pos words...")
    testing_pos_data = _transform_split("testing_pos", "test_pos.txt")
    logger.info("transforming testing-neg words...")
    testing_neg_data = _transform_split("testing_neg", "test_neg.txt")
    logger.info("transforming train_unsup words...")
    unsup_data = _transform_split("unsup", "train_unsup.txt")
    # Labels as column vectors: 1 for positive reviews, 0 for negative.
    training_pos_label = np.ones((len(training_pos_data), 1), dtype=np.int8)
    training_neg_label = np.zeros((len(training_neg_data), 1), dtype=np.int8)
    testing_pos_label = np.ones((len(testing_pos_data), 1), dtype=np.int8)
    testing_neg_label = np.zeros((len(testing_neg_data), 1), dtype=np.int8)
    # shuffle
    logger.info("shuffling docs...")
    rand.shuffle(training_pos_data)
    rand.shuffle(training_neg_data)
    rand.shuffle(testing_pos_data)
    rand.shuffle(testing_neg_data)
    rand.shuffle(unsup_data)
    # sample
    training_pos_sample_index = rand.choice(len(training_pos_data), 1)[0]
    testing_pos_sample_index = rand.choice(len(testing_pos_data), 1)[0]
    logger.info("training_pos sample: %s" %
                training_pos_data[training_pos_sample_index])
    logger.info("testing_pos sample: %s" %
                testing_pos_data[testing_pos_sample_index])
    # save
    logger.info("saving...")
    pickle_data = {
        "training_pos_data": training_pos_data,
        "training_neg_data": training_neg_data,
        "testing_pos_data": testing_pos_data,
        "testing_neg_data": testing_neg_data,
        "unsup_data": unsup_data,
        "training_pos_label": training_pos_label,
        "training_neg_label": training_neg_label,
        "testing_pos_label": testing_pos_label,
        "testing_neg_label": testing_neg_label
    }
    with open(osp.join(FLAGS.output_dir, "imdb_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)
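As above, a minimal sketch of loading the pickled IMDB data and assembling
a labeled training set (the concatenation step is an assumption about
downstream use, not part of the original script):

with open(osp.join(FLAGS.output_dir, "imdb_dataset.pickle"), "rb") as f:
    imdb = pickle.load(f)
train_docs = imdb["training_pos_data"] + imdb["training_neg_data"]
train_labels = np.concatenate(
    [imdb["training_pos_label"], imdb["training_neg_label"]])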