Example No. 1
    def __iter__(self):
        """
        Defines how to iterate the MySentences class in order to feed it directly into Word2Vec method. Yields a
        sentence (as a list of words) for every iteration.
        """
        # for root, dirs, files in os.walk(self.dirname):
        for file_path in self.file_paths:
            file_data = VectorManager.read_vector(file_path)
            file_sentences = VectorManager.parse_into_sentences(file_data)

            for sentence in file_sentences:
                yield sentence
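A class exposing __iter__ like this can be handed straight to gensim's Word2Vec constructor, which iterates the corpus once to build the vocabulary and again for each training pass. A minimal usage sketch, assuming gensim 3.x (where the dimensionality parameter is called size; gensim 4.x renamed it to vector_size) and a hypothetical MySentences constructor that takes the list of corpus file paths:

from gensim.models import Word2Vec

sentences = MySentences(file_paths)   # hypothetical constructor taking the corpus file paths
model = Word2Vec(sentences,           # any re-iterable sequence of token lists works here
                 size=200,            # embedding dimensionality (vector_size in gensim 4.x)
                 window=5,
                 min_count=5,
                 workers=4)
model.wv.save_word2vec_format("word2vec_org_200", binary=False)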
Example No. 2
    def is_valid_numpy():
        """
        Translates the numeric ids in `filename` back to words and compares them against the
        original words from `file_words`, counting '<unk>' hits along the way. Returns True
        when the equality ratio reaches `confidence`.
        """
        docs_ids = VectorManager.read_vector(filename)
        original = VectorManager.parse_into_4D(
            VectorManager.read_vector(file_words))
        file_list = []
        comparison = []
        unknowns = 0
        for d in range(0, len(docs_ids)):
            doc_list = []
            for p in range(0, len(docs_ids[d])):
                par_list = []
                for s in range(0, len(docs_ids[d][p])):
                    sent_list = []
                    for w in range(0, len(docs_ids[d][p][s])):
                        try:
                            translated = to_word(docs_ids[d][p][s][w])
                            if translated == '<unk>':
                                unknowns += 1
                            comparison.append(
                                translated == original[d][p][s][w])
                            sent_list.append(translated)
                        except Exception as e:
                            print("[%s] Indices %s %s %s %s: %s" %
                                  (filename, d, p, s, w, e))
                    par_list.append(sent_list)
                doc_list.append(par_list)
            file_list.append(doc_list)

        valid = False
        u_ratio = 0.0  # defined up-front so the error message below cannot hit a NameError
        try:
            ratio = float(comparison.count(True)) / len(comparison)
            u_ratio = round(float(unknowns) / len(comparison), 2)
            if ratio < confidence:
                print(
                    "[WARN] File %s equality ratio is %s with %s unknown ratio"
                    % (filename, round(ratio, 2), u_ratio))
            else:
                print(
                    "[OK] File %s equality ratio is %s with %s unknown ratio" %
                    (filename, round(ratio, 2), u_ratio))
                valid = True
        except (KeyError, ZeroDivisionError) as e:
            print(
                "[ERROR] File %s is completely different (%s) with %s unknown ratio"
                % (filename, e, u_ratio))

        return valid
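The snippet above calls a to_word helper that is not shown. A minimal stand-in consistent with how it is used (one argument, '<unk>' for ids it cannot resolve), assuming an id -> word dict named id2word is available in the enclosing scope:

def to_word(word_id):
    # Hypothetical helper: map a numeric id back to its word, '<unk>' when the id is unknown.
    return id2word.get(int(word_id), '<unk>')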
Example No. 3
    def __init__(self, dictionary_path, word2id, embeddings, lda=None, lsi=None):
        self.dictionary = self.load_dict(dictionary_path)
        self.word2id = VectorManager.read_vector(word2id)
        # self.word2id = self.word2id_to_id2word(word2id)
        self.embeddings = embeddings
        self.lda = lda
        self.lsi = lsi
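load_dict is defined elsewhere in the project. Given that a later example defaults the dictionary argument to 'wiki_en_wordids.txt' (the word-ids file produced by gensim's wiki corpus tooling), a plausible sketch is:

from gensim.corpora import Dictionary

def load_dict(dictionary_path):
    # Assumption: the dictionary is a gensim word-ids text file, which load_from_text parses.
    return Dictionary.load_from_text(dictionary_path)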
Example No. 4
    def transform_numpy():
        """
        Transforms a 4D list of words into a 4D numpy array of integers and writes it into file_out
        """
        docs = VectorManager.parse_into_4D(VectorManager.read_vector(filename))
        file_list = []
        for doc in docs:
            doc_list = []
            for paragraph in doc:
                par_list = []
                for sentence in paragraph:
                    s_id = [toId(word) for word in sentence if word]
                    if s_id:
                        par_list.append(s_id)
                doc_list.append(par_list)
            file_list.append(doc_list)
        np.save(file_out, np.array(file_list))
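toId is the inverse of the to_word helper sketched under Example No. 2. A hedged stand-in, assuming a word -> id dict named w2Id is in scope and that out-of-vocabulary words map to the id of '<unk>':

def toId(word):
    # Hypothetical helper: word -> integer id, falling back to the id of '<unk>'.
    return w2Id.get(word, w2Id['<unk>'])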
Example No. 5
def _transform_file(file_path, w2id, split_par=False, debug=False):
    """
    Transforms a file containing articles into a 4D list of words divided into sentences,
    paragraphs and docs. Write the result to disk with the name filename_clean.pklz
    :param file_path: file to transform
    """
    if debug:
        print("Cleaning %s" % file_path)
    with open(file_path) as f:
        data = f.read().decode("latin-1")
        docs = data.split("</doc>")
        del data
    if not split_par:
        file_out = "%s_clean_simple" % file_path
    else:
        file_out = "%s_clean_paragraph" % file_path
    file_string = ""
    for doc in [d.strip() for d in docs if d.strip()]:
        paragraphs = [
            tokenize(par)
            for par in remove_title(cleanhtml(doc)).strip().split("\n\n")
            if par
        ]
        doc_a = False
        for p in paragraphs:
            par_a = False
            for sent in p:
                line = [
                    word for word in sent.lower().split()
                    if word.isalpha() or is_number(word)
                ]

                line = " ".join([known(word, w2id) for word in line])
                if line:
                    file_string += line + " <eos> "
                    par_a = True

            if par_a and split_par:
                file_string += " <eop> "

    VectorManager.write_string(file_out, file_string.encode("latin-1"))
    del file_string
    if debug:
        print("Done with %s" % file_path)
Example No. 6
def _transform_file(file_path, debug=False):
    """
    Transforms a file containing articles into a 4D list of words divided into sentences,
    paragraphs and docs. Write the result to disk with the name filename_wl (words list)
    :param file_path: file to transform
    """
    if debug:
        print("Cleaning %s" % file_path)
    with open(file_path) as f:
        raw = f.read().decode("latin-1")
        data = cleanhtml(raw)
        docs = data.split("</doc>")
        del data
    file_out = "%s_wl" % file_path
    file_string = ""
    for doc in [d.strip() for d in docs if d.strip()]:
        paragraphs = [
            tokenize(par)
            for par in remove_title(cleanhtml(doc)).strip().split("\n\n")
            if par
        ]
        doc_a = False
        for p in paragraphs:
            par_a = False
            for sent in p:
                line = " ".join([
                    word for word in sent.lower().split()
                    if word.isalpha() or is_number(word)
                ])
                if line:
                    file_string += line + "\n"
                    par_a = True
                    doc_a = True

            if par_a:
                file_string += "\n"
        if doc_a:
            file_string += "\n"

    VectorManager.write_string(file_out, file_string.encode("latin-1"))
    del file_string
    if debug:
        print("Done with %s" % file_path)
Example No. 7
def generate_arrays_from_list(name,
                              files,
                              embeddings,
                              num_steps=35,
                              batch_size=20,
                              embedding_size=200):

    debug = False
    while 1:
        for file_name in files:
            print("Generating from file %s for %s" % (file_name, name))
            raw_list = VectorManager.parse_into_list(open(file_name).read())

            n_words = len(raw_list)
            batch_len = n_words // batch_size
            data = np.reshape(raw_list[0:batch_size * batch_len],
                              [batch_size, batch_len])

            for i in range(0, n_words - num_steps, 1):

                x = data[0:batch_size, i * num_steps:(i + 1) * num_steps]
                x = [[embeddings[int(elem)][2] for elem in l] for l in x]
                y = data[0:batch_size,
                         i * num_steps + 1:(i + 1) * num_steps + 1]

                if len(x[0]) < num_steps or len(y[0]) < num_steps:
                    break
                if debug:
                    print("Batch size %s\nNum steps %s\nEmbedding size %s" %
                          (batch_size, num_steps, embedding_size))
                    print("Len(x): %s\n Len(x[0] %s\n Len(x[0][0] %s" %
                          (len(x), len(x[0]), len(x[0][0])))
                    print("Len(y): %s\n Len(y[0] %s" % (len(y), len(y[0])))
                x = np.reshape(x,
                               newshape=(batch_size, num_steps,
                                         embedding_size))

                y = np.reshape(y, newshape=(batch_size, num_steps))

                yield x, y
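The generator yields mini-batches shaped for an embedding-fed LSTM: x is (batch_size, num_steps, embedding_size) and y is (batch_size, num_steps) of target word ids. A quick sanity check of one batch (the file name is purely illustrative):

gen = generate_arrays_from_list("Train", ["wiki_00_num"], embeddings,
                                num_steps=35, batch_size=20, embedding_size=200)
x, y = next(gen)
print(x.shape, y.shape)  # expected: (20, 35, 200) (20, 35)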
Example No. 8
def main(_):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to wiki data directory list")

    vocab_size = 126930

    config = get_config()
    config.vocab_size = vocab_size

    valid_config = get_config()
    valid_config.vocab_size = vocab_size

    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    eval_config.vocab_size = vocab_size

    embeddings = VectorManager.read_vector(
        "%s%s.pklz" % (FLAGS.embeddings, config.embedding_size))
    files = open(FLAGS.data_path).read().split()

    training_list = files[0:int(0.8 * len(files))]
    validation_list = files[int(0.8 * len(files)):int(0.9 * len(files))]
    testing_list = files[int(0.9 * len(files)):len(files)]

    config.epoch_size = get_epoch_size(training_list, config)
    valid_config.epoch_size = get_epoch_size(validation_list, valid_config)
    eval_config.epoch_size = get_epoch_size(testing_list, eval_config)

    gen_train = generate_arrays_from_list("Train",
                                          training_list,
                                          embeddings,
                                          batch_size=config.batch_size,
                                          embedding_size=config.embedding_size,
                                          num_steps=config.num_steps)
    gen_valid = generate_arrays_from_list(
        "Validation",
        validation_list,
        embeddings,
        batch_size=valid_config.batch_size,
        embedding_size=valid_config.embedding_size,
        num_steps=valid_config.num_steps)
    gen_test = generate_arrays_from_list(
        "Test",
        testing_list,
        embeddings,
        batch_size=eval_config.batch_size,
        embedding_size=eval_config.embedding_size,
        num_steps=eval_config.num_steps)

    print("Epoch sizes\n * Training: %s\n * Validation: %s\n * Testing: %s" %
          (config.epoch_size, valid_config.epoch_size, eval_config.epoch_size))
    sys.stdout.flush()
    with tf.Graph().as_default():
        # Args: [minval, maxval]
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            with tf.variable_scope("Model",
                                   reuse=None,
                                   initializer=initializer):
                m = WPModel(is_training=True, config=config)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                mvalid = WPModel(is_training=False, config=valid_config)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        with tf.name_scope("Test"):
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                mtest = WPModel(is_training=False, config=eval_config)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f" %
                      (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session,
                                             generator=gen_train,
                                             model=m,
                                             eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f" %
                      (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session,
                                             generator=gen_valid,
                                             model=mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" %
                      (i + 1, valid_perplexity))

            test_perplexity = run_epoch(session,
                                        generator=gen_test,
                                        model=mtest)
            print("Test Perplexity: %.3f" % test_perplexity)

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session,
                              FLAGS.save_path,
                              global_step=sv.global_step)
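The file list is split positionally into 80/10/10 portions. With, say, 100 files listed in --data_path, the slices above work out as follows (illustrative numbers only):

files = ["wiki_%02d" % i for i in range(100)]  # stand-in list of 100 file names
print(len(files[0:int(0.8 * len(files))]))                       # 80 training files
print(len(files[int(0.8 * len(files)):int(0.9 * len(files))]))   # 10 validation files
print(len(files[int(0.9 * len(files)):len(files)]))              # 10 testing files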
Example No. 9
                        '--id_word_vec',
                        type=str,
                        help="Path of id <-> word <-> embedding vector",
                        required=True)
    parser.add_argument('-w',
                        '--word_vectors',
                        type=str,
                        help="Path of LM to perform the tests upon",
                        required=True)

    args = parser.parse_args()

    # Arguments parsing
    wv_path = args.word_vectors
    path = args.id_word_vec

    print("Loading model...")
    wv = KeyedVectors.load_word2vec_format(wv_path, binary=False)

    print("Loading id-word-vec...")
    id_word_vec = VectorManager.read_vector(path)

    print("Finding subset to plot")
    initial_word = 'jupiter'
    max_elements = 500
    sb = subset(initial_word, id_word_vec, wv, max_elements)

    print("Plotting subset of words...")
    # Plot t-SNE
    plot_tsne(sb)
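subset and plot_tsne are project-specific; for reference, the usual way to project a small set of embeddings to 2-D and scatter-plot it is scikit-learn's TSNE. A sketch, assuming sb is a list of (id, word, vector) triples in the same format as id_word_vec:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def plot_tsne(sb):
    words = [w for _, w, _ in sb]
    vectors = np.array([vec for _, _, vec in sb])
    coords = TSNE(n_components=2).fit_transform(vectors)  # shape (len(sb), 2)
    plt.scatter(coords[:, 0], coords[:, 1], s=5)
    for (x, y), word in zip(coords, words):
        plt.annotate(word, (x, y), fontsize=6)
    plt.show()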
Example No. 10
    sys.stdout.flush()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-d',
        '--data',
        type=str,
        help="Path of the data to be translated with the word2id vector and cleaned up.",
        required=True)
    parser.add_argument(
        '-w',
        '--word_vector',
        type=str,
        help="Word2ID vector to be used for doc reverse translation.",
        required=True)

    args = parser.parse_args()
    data_path = args.data
    word2id_file = args.word_vector

    begin = time()

    w2Id = VectorManager.read_vector(word2id_file)
    check_translated_files(data_path, w2Id)

    end = time()
    print("Total processing time: %d seconds" % (end - begin))
Example No. 11
    print("Saving embeddings model...")
    model.save("../models/word2vec_gensim_%s" % emb_size)
    model.wv.save_word2vec_format("../models/word2vec_org_%s" % emb_size,
                                  "../models/vocabulary_%s" % emb_size,
                                  binary=False)

    # Get only:
    #  * word2id vector (for transforming data to numerical ids)
    #  * id_word_vec (contains the word embeddings and the associated id <-> word mapping)
    t3 = time()
    word2id, id_word_vec = transform_gensim(model.wv)
    t4 = time()
    print("Time transforming gensim to word2ID and idWordVec vectors: %s" %
          (t4 - t3))

    # Save model for checkpointing
    VectorManager.write_pickled("../models/word2id_%s" % emb_size, word2id)
    VectorManager.write_pickled("../models/idWordVec_%s" % emb_size,
                                id_word_vec)

    t5 = time()
    translate_files(data_path, word2id)
    t6 = time()
    print("Time translating words to numbers: %s" % (t6 - t5))

    t7 = time()
    check_translated_files(data_path, word2id)
    t8 = time()
    print("Time translating words to numbers: %s" % (t8 - t7))
Example No. 12
                        help="Id2Word vector path ['wiki_en_wordids.txt'].",
                        required=True,
                        default=None)

    args = parser.parse_args()

    model_path = args.model
    id2word_path = args.id_word
    word2id_path = args.word2id_path
    emb_path = args.embeddings

    begin = time()

    dictionary = load_dict(id2word_path)
    id2word = word2id_to_id2word(word2id_path)
    w2Id = VectorManager.read_vector(word2id_path)
    embeddings = VectorManager.read_vector(emb_path)

    demo1 = "the roman consul is normally a notable person from the senate elected " \
            "by direct voting of the italic tribes"

    data = open("../data/small/AA/wiki_01_clean_simple").read().split("<eop>")
    s1 = data[0].split("<eos>")[0]
    data = open("../data/small/AA/wiki_00_clean_simple").read().split("<eop>")
    s2 = data[0].split("<eos>")[0]
    data = open("../data/small/AB/wiki_00_clean_simple").read().split("<eop>")
    s3 = data[0].split("<eos>")[0]
    data = open("../data/small/AB/wiki_01_clean_simple").read().split("<eop>")
    s4 = data[0].split("<eos>")[0]

    if "lda" in model_path:
    word2id = dict([(w, id) for id, w, _ in id_word_vec])

    return word2id, id_word_vec


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-k',
        '--kv',
        type=str,
        help="Path of the keyed vectors to translate [word2vec_org_XXX]",
        required=True)

    args = parser.parse_args()
    data_path = args.kv

    print("Loading keyed vectors")
    wv = KeyedVectors.load_word2vec_format(data_path, binary=False)

    emb_size = len(wv.syn0[0])
    word2id, id_word_vec = transform_gensim(wv)

    w2id_filepath = "../models/word2id_%s" % emb_size
    idWordVec_filepath = "../models/idWordVec_%s" % emb_size

    print("Writing files:\n\t * word2id: %s\n\t * idWordVec: %s" %
          (w2id_filepath, idWordVec_filepath))
    VectorManager.write_pickled(w2id_filepath, word2id)
    VectorManager.write_pickled(idWordVec_filepath, id_word_vec)
Example No. 14
def generate_arrays_from_list(name, topic_creator, files, embeddings, num_steps=35, batch_size=20, embedding_size=200):
    eos_mark = [id for id, w, vec in embeddings if w == "<eos>"][0]
    eop_mark = [id for id, w, vec in embeddings if w == "<eop>"][0]
    unknown_embedding = [vec for id, w, vec in embeddings if w == "<unk>"][0]
    debug = False
    # print("EOS mark: %s, EOP mark: %s" % (eos_mark, eop_mark))
    while 1:
        for file_name in files:
            raw_list = VectorManager.parse_into_list(open(file_name).read())

            n_words = len(raw_list)
            batch_len = n_words // batch_size
            data = np.reshape(raw_list[0:batch_size*batch_len], [batch_size, batch_len])
            sentSegments = [list() for _ in range(batch_size)]
            parSegments = [list() for _ in range(batch_size)]


            for i in range(0, n_words - num_steps, 1):

                x = data[0:batch_size, i * num_steps:(i + 1) * num_steps]
                y = data[0:batch_size, i * num_steps + 1:(i + 1) * num_steps + 1]

                if len(x[0]) < num_steps or len(y[0]) < num_steps:
                    break


                emb_x = [[embeddings[int(elem)][2] for elem in l] for l in x]
                emb_x = np.reshape(emb_x, newshape=(batch_size, num_steps, embedding_size))

                final_x = np.zeros(shape=(batch_size, num_steps, len(embeddings[0][2])*3))
                for batch in range(0, batch_size):
                    for step in range(0, num_steps):
                        if debug:
                            print("%s == %s ? %s [eos]\n%s == %s ? %s[eop]" % (int(x[batch][step]), eos_mark,
                                                                           int(x[batch][step]) == eos_mark,
                                                                           int(x[batch][step]), eop_mark,
                                                                           int(x[batch][step]) == eop_mark))
                        if int(x[batch][step]) == eos_mark:
                            sentSegments[batch] = []
                        else:
                            sentSegments[batch].append(x[batch][step])
                        if int(x[batch][step]) == eop_mark:
                            parSegments[batch] = []
                        else:
                            parSegments[batch].append(x[batch][step])

                        sentTopic = unknown_embedding
                        parTopic = unknown_embedding
                        # Check the per-batch segment lists; an empty segment keeps the
                        # '<unk>' embedding as its topic.
                        if sentSegments[batch]:
                            sentTopic = get_context(topic_creator, sentSegments[batch])

                        if parSegments[batch]:
                            if sentSegments[batch] == parSegments[batch]:
                                parTopic = sentTopic
                            else:
                                parTopic = get_context(topic_creator, parSegments[batch])

                        final_x[batch][step] = np.hstack((emb_x[batch][step], sentTopic, parTopic))



                if debug:
                    print("Batch size %s\nNum steps %s\nEmbedding size %s" % (batch_size, num_steps, embedding_size
                                                                              ))
                    print("Len(x): %s\n Len(x[0] %s\n Len(x[0][0] %s" % (len(x), len(x[0]), len(x[0][0])))
                    print("Len(y): %s\n Len(y[0] %s" % (len(y), len(y[0])))



                y = np.reshape(y, newshape=(batch_size, num_steps))

                yield final_x, y
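Each timestep's input is the word embedding concatenated with a sentence-topic vector and a paragraph-topic vector, which is why final_x is allocated with width len(embeddings[0][2]) * 3. With 200-dimensional embeddings, for example:

import numpy as np
emb = np.zeros(200)         # word embedding for the current step
sent_topic = np.zeros(200)  # topic vector for the sentence so far
par_topic = np.zeros(200)   # topic vector for the paragraph so far
print(np.hstack((emb, sent_topic, par_topic)).shape)  # (600,), i.e. 3 * embedding_size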
Example No. 15
def get_vocab_size():
    word_to_id = VectorManager.read_vector(FLAGS.word_to_id_path)
    size = len(word_to_id)
    print("Vocabulary size: %s" % size)
    return size
Example No. 16
def get_file_as_list(filename):
    words_list = VectorManager.parse_into_list(
        VectorManager.read_vector(filename))
    words_list = [w for w in words_list if w not in stop_words]
    return words_list
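stop_words is assumed to be a module-level set of stop words; one common way to build such a set (an assumption, not confirmed by these examples) is from NLTK:

from nltk.corpus import stopwords  # requires a one-off nltk.download('stopwords')
stop_words = set(stopwords.words('english'))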
Example No. 17
def read_file(filename):
    return VectorManager.read_vector(filename)
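VectorManager itself does not appear in these examples. Given the .pklz extension used throughout and the read_vector / write_pickled / write_string names, a minimal stand-in could look like the following; the gzip-pickle format is an assumption, not the project's confirmed implementation:

import gzip
import pickle

class VectorManager(object):
    @staticmethod
    def read_vector(path):
        # Assumed on-disk format: gzip-compressed pickle (.pklz).
        with gzip.open(path, 'rb') as f:
            return pickle.load(f)

    @staticmethod
    def write_pickled(path, data):
        # Callers pass paths without an extension, so append .pklz here (assumption).
        with gzip.open(path + '.pklz', 'wb') as f:
            pickle.dump(data, f)

    @staticmethod
    def write_string(path, text):
        # write_string receives already-encoded bytes in the cleaning examples above.
        with open(path, 'wb') as f:
            f.write(text)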