Example #1
    def is_valid_numpy():
        """
        """
        # filename, file_words, confidence and to_word are provided by the
        # enclosing scope.
        docs_ids = VectorManager.read_vector(filename)
        original = VectorManager.parse_into_4D(
            VectorManager.read_vector(file_words))
        file_list = []
        comparison = []
        unknowns = 0
        for d in range(len(docs_ids)):
            doc_list = []
            for p in range(len(docs_ids[d])):
                par_list = []
                for s in range(len(docs_ids[d][p])):
                    sent_list = []
                    for w in range(len(docs_ids[d][p][s])):
                        try:
                            translated = to_word(docs_ids[d][p][s][w])
                            if translated == '<unk>':
                                unknowns += 1
                            comparison.append(
                                translated == original[d][p][s][w])
                            sent_list.append(translated)
                        except Exception as e:
                            print("[%s] Indices %s %s %s %s: %s" %
                                  (filename, d, p, s, w, e))
                    par_list.append(sent_list)
                doc_list.append(par_list)
            file_list.append(doc_list)

        valid = False
        try:
            ratio = float(comparison.count(True)) / len(comparison)
            u_ratio = round(float(unknowns) / len(comparison), 2)
            if ratio < confidence:
                print(
                    "[WARN] File %s equality ratio is %s with %s unknown ratio"
                    % (filename, round(ratio, 2), u_ratio))
            else:
                print(
                    "[OK] File %s equality ratio is %s with %s unknown ratio" %
                    (filename, round(ratio, 2), u_ratio))
                valid = True
        except ZeroDivisionError:
            # An empty comparison list means no word could be compared.
            print("[ERROR] File %s yielded no comparable words" % filename)

        return valid
Example #2
    def __init__(self, dictionary_path, word2id, embeddings, lda=None, lsi=None):
        self.dictionary = self.load_dict(dictionary_path)
        self.word2id = VectorManager.read_vector(word2id)
        # self.word2id = self.word2id_to_id2word(word2id)
        self.embeddings = embeddings
        self.lda = lda
        self.lsi = lsi
Example #3
    def __iter__(self):
        """
        Defines how to iterate the MySentences class in order to feed it directly into Word2Vec method. Yields a
        sentence (as a list of words) for every iteration.
        """
        # for root, dirs, files in os.walk(self.dirname):
        for file_path in self.file_paths:
            file_data = VectorManager.read_vector(file_path)
            file_sentences = VectorManager.parse_into_sentences(file_data)

            for sentence in file_sentences:
                yield sentence
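
The iterable above can be fed straight to gensim's Word2Vec. A usage sketch, with a hypothetical MySentences constructor and illustrative hyperparameters (vector_size is the gensim 4.x keyword; 3.x called it size):

    from gensim.models import Word2Vec

    # Hypothetical construction; the real MySentences __init__ is not shown.
    sentences = MySentences(file_paths=["wiki_00_clean", "wiki_01_clean"])
    model = Word2Vec(sentences=sentences, vector_size=200, window=5,
                     min_count=5, workers=4)
    model.wv.save_word2vec_format("embeddings.txt", binary=False)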
Example #4
def transform_numpy():
    """
    Transforms a 4D list of words into a 4D numpy array of integers and
    writes it into file_out.
    """
    # filename, file_out and toId come from the enclosing scope.
    docs = VectorManager.parse_into_4D(VectorManager.read_vector(filename))
    file_list = []
    for doc in docs:
        doc_list = []
        for paragraph in doc:
            par_list = []
            for sentence in paragraph:
                s_id = [toId(word) for word in sentence if word]
                if s_id:
                    par_list.append(s_id)
            doc_list.append(par_list)
        file_list.append(doc_list)
    # The nested lists are ragged, so store them as an object array.
    np.save(file_out, np.array(file_list, dtype=object))
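
For reference, the saved file can be read back as follows. The path is hypothetical; because the nested lists are ragged, np.save pickles an object array, so allow_pickle=True is required:

    import numpy as np

    docs_ids = np.load("file_out.npy", allow_pickle=True)  # np.save appends .npy
    first_id = docs_ids[0][0][0][0]  # doc 0, paragraph 0, sentence 0, word 0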
Example #5
def get_vocab_size():
    word_to_id = VectorManager.read_vector(FLAGS.word_to_id_path)
    size = len(word_to_id)
    print("Vocabulary size: %s" % size)
    return size
Example #6
def main(_):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to wiki data directory list")

    vocab_size = 126930  # fixed vocabulary size, shared by all configs

    config = get_config()
    config.vocab_size = vocab_size

    valid_config = get_config()
    valid_config.vocab_size = vocab_size

    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    eval_config.vocab_size = vocab_size

    embeddings = VectorManager.read_vector(
        "%s%s.pklz" % (FLAGS.embeddings, config.embedding_size))
    files = open(FLAGS.data_path).read().split()

    # 80/10/10 split into training, validation and test file lists.
    training_list = files[:int(0.8 * len(files))]
    validation_list = files[int(0.8 * len(files)):int(0.9 * len(files))]
    testing_list = files[int(0.9 * len(files)):]

    config.epoch_size = get_epoch_size(training_list, config)
    valid_config.epoch_size = get_epoch_size(validation_list, valid_config)
    eval_config.epoch_size = get_epoch_size(testing_list, eval_config)

    gen_train = generate_arrays_from_list("Train",
                                          training_list,
                                          embeddings,
                                          batch_size=config.batch_size,
                                          embedding_size=config.embedding_size,
                                          num_steps=config.num_steps)
    gen_valid = generate_arrays_from_list(
        "Validation",
        validation_list,
        embeddings,
        batch_size=valid_config.batch_size,
        embedding_size=valid_config.embedding_size,
        num_steps=valid_config.num_steps)
    gen_test = generate_arrays_from_list(
        "Test",
        testing_list,
        embeddings,
        batch_size=eval_config.batch_size,
        embedding_size=eval_config.embedding_size,
        num_steps=eval_config.num_steps)

    print("Epoch sizes\n * Training: %s\n * Validation: %s\n * Testing: %s" %
          (config.epoch_size, valid_config.epoch_size, eval_config.epoch_size))
    sys.stdout.flush()
    with tf.Graph().as_default():
        # Args: [minval, maxval]
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            with tf.variable_scope("Model",
                                   reuse=None,
                                   initializer=initializer):
                m = WPModel(is_training=True, config=config)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                mvalid = WPModel(is_training=False, config=valid_config)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        with tf.name_scope("Test"):
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                mtest = WPModel(is_training=False, config=eval_config)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f" %
                      (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session,
                                             generator=gen_train,
                                             model=m,
                                             eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f" %
                      (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session,
                                             generator=gen_valid,
                                             model=mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" %
                      (i + 1, valid_perplexity))

            test_perplexity = run_epoch(session,
                                        generator=gen_test,
                                        model=mtest)
            print("Test Perplexity: %.3f" % test_perplexity)

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session,
                              FLAGS.save_path,
                              global_step=sv.global_step)
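
The 80/10/10 split near the top of main can be factored into a small helper; a sketch, not part of the original script:

    def split_files(files, train=0.8, valid=0.1):
        """Split a file list into training, validation and test slices."""
        i = int(train * len(files))
        j = int((train + valid) * len(files))
        return files[:i], files[i:j], files[j:]

    training_list, validation_list, testing_list = split_files(files)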
Example #7
                        '--id_word_vec',
                        type=str,
                        help="Path of id <-> word <-> embedding vector",
                        required=True)
    parser.add_argument('-w',
                        '--word_vectors',
                        type=str,
                        help="Path of LM to perform the tests upon",
                        required=True)

    args = parser.parse_args()

    # Argument parsing
    wv_path = args.word_vectors
    path = args.id_word_vec

    print("Loading model...")
    wv = KeyedVectors.load_word2vec_format(wv_path, binary=False)

    print("Loading id-word-vec...")
    id_word_vec = VectorManager.read_vector(path)

    print("Finding subset to plot")
    initial_word = 'jupiter'
    max_elements = 500
    sb = subset(initial_word, id_word_vec, wv, max_elements)

    print("Plotting subset of words...")
    # Plot t-SNE
    plot_tsne(sb)
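
Neither subset nor plot_tsne is shown in this example. Assuming id_word_vec holds (id, word, vector) triples, plot_tsne could be sketched with scikit-learn's t-SNE; the names and figure styling are assumptions:

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.manifold import TSNE

    def plot_tsne(sb):
        # sb is assumed to be a list of (id, word, vector) triples.
        words = [word for _, word, _ in sb]
        vectors = np.array([vec for _, _, vec in sb])
        points = TSNE(n_components=2, random_state=0).fit_transform(vectors)
        plt.figure(figsize=(12, 12))
        plt.scatter(points[:, 0], points[:, 1], s=4)
        for (x, y), word in zip(points, words):
            plt.annotate(word, xy=(x, y), fontsize=6)
        plt.show()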
Example #8
    sys.stdout.flush()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-d',
        '--data',
        type=str,
        help="Path of the data to be translated with word2id vector."
        " and clean up.",
        required=True)
    parser.add_argument(
        '-w',
        '--word_vector',
        type=str,
        help="Word2ID vector to be used for doc reverse translation.",
        required=True)

    args = parser.parse_args()
    data_path = args.data
    word2id_file = args.word_vector

    begin = time()

    w2Id = VectorManager.read_vector(word2id_file)
    check_translated_files(data_path, w2Id)

    end = time()
    print("Total processing time: %d seconds" % (end - begin))
Example #9
                        help="Id2Word vector path ['wiki_en_wordids.txt'].",
                        required=True,
                        default=None)

    args = parser.parse_args()

    model_path = args.model
    id2word_path = args.id_word
    word2id_path = args.word2id_path
    emb_path = args.embeddings

    begin = time()

    dictionary = load_dict(id2word_path)
    id2word = word2id_to_id2word(word2id_path)
    w2Id = VectorManager.read_vector(word2id_path)
    embeddings = VectorManager.read_vector(emb_path)

    demo1 = "the roman consul is normally a notable person from the senate elected " \
            "by direct voting of the italic tribes"

    data = open("../data/small/AA/wiki_01_clean_simple").read().split("<eop>")
    s1 = data[0].split("<eos>")[0]
    data = open("../data/small/AA/wiki_00_clean_simple").read().split("<eop>")
    s2 = data[0].split("<eos>")[0]
    data = open("../data/small/AB/wiki_00_clean_simple").read().split("<eop>")
    s3 = data[0].split("<eos>")[0]
    data = open("../data/small/AB/wiki_01_clean_simple").read().split("<eop>")
    s4 = data[0].split("<eos>")[0]

    if "lda" in model_path:
def get_file_as_list(filename):
    # stop_words comes from the module scope (a collection of words to drop).
    words_list = VectorManager.parse_into_list(
        VectorManager.read_vector(filename))
    words_list = [w for w in words_list if w not in stop_words]
    return words_list
Example #11
def read_file(filename):
    return VectorManager.read_vector(filename)
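
VectorManager itself is not included in these examples. Given the .pklz extension in Example #6, read_vector is most likely a gzip-compressed pickle loader; the following is a sketch under that assumption, not the project's actual implementation:

    import gzip
    import pickle

    class VectorManager:
        @staticmethod
        def read_vector(path):
            # Load a gzip-compressed pickle (word2id dicts, embeddings, ...).
            with gzip.open(path, "rb") as f:
                return pickle.load(f)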