Example #1
import numpy as np
from gensim.models import KeyedVectors


def restrict(model: KeyedVectors):
    """Keep only single-word English ConceptNet entries ('/c/en/...'),
    stripping the '/c/en/' prefix from the retained keys. Assumes
    model.init_sims() has been called so model.vectors_norm is populated."""
    new_vectors = []
    new_vocab = {}
    new_index2entity = []
    new_vectors_norm = []

    for i in range(len(model.vocab)):
        if i % 10000 == 0:
            print(i)
        word = model.index2entity[i]
        vec = model.vectors[i]
        vocab = model.vocab[word]
        vec_norm = model.vectors_norm[i]
        if word.startswith('/c/en/') and '_' not in word:
            word = word[6:]
            vocab.index = len(new_index2entity)
            new_index2entity.append(word)
            new_vocab[word] = vocab
            new_vectors.append(vec)
            new_vectors_norm.append(vec_norm)

    model.vocab = new_vocab
    model.vectors = np.array(new_vectors)
    model.index2entity = np.array(new_index2entity)
    model.index2word = np.array(new_index2entity)
    model.vectors_norm = np.array(new_vectors_norm)
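
A minimal usage sketch for this helper, assuming gensim 3.x and a multilingual ConceptNet Numberbatch file in word2vec text format (the path below is illustrative):

model = KeyedVectors.load_word2vec_format('numberbatch-17.06.txt',
                                          binary=False)
model.init_sims()   # fills model.vectors_norm, which restrict() reads
restrict(model)     # keep only single-word '/c/en/...' keys, prefix stripped
print(model.most_similar('cat', topn=5))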
Example #2
    def __init_model_out(self):
        """Create KeyVectors for w_OUT"""

        model_out = KeyedVectors(self.vector_size)
        model_out.vocab = self.model_in.wv.vocab
        model_out.index2word = self.model_in.wv.index2word
        model_out.vectors = self.model_in.trainables.syn1neg
        return model_out
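
A hedged sketch of the same IN/OUT split outside the class, assuming gensim 3.x and a Word2Vec model trained with negative sampling (so trainables.syn1neg exists); the corpus is gensim's bundled common_texts toy data:

from gensim.models import KeyedVectors, Word2Vec
from gensim.test.utils import common_texts

model_in = Word2Vec(common_texts, size=50, min_count=1, negative=5)
model_out = KeyedVectors(model_in.wv.vector_size)
model_out.vocab = model_in.wv.vocab
model_out.index2word = model_in.wv.index2word
model_out.vectors = model_in.trainables.syn1neg
# Query the OUT (context) space with an IN vector:
print(model_out.most_similar(positive=[model_in.wv['human']], topn=3))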
Example #3
from gensim.models import KeyedVectors


def create_keyed_vector(old_keyed_vector, new_matrix):
    vector_size = new_matrix.shape[1]
    keyed_vector = KeyedVectors(vector_size)
    keyed_vector.vector_size = vector_size
    keyed_vector.vocab = old_keyed_vector.vocab
    keyed_vector.index2word = old_keyed_vector.index2word
    keyed_vector.vectors = new_matrix
    assert (len(old_keyed_vector.vocab),
            vector_size) == keyed_vector.vectors.shape
    return keyed_vector
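
For instance, the helper can wrap a transformed copy of an existing model's matrix (here simply length-normalised); a sketch assuming gensim 3.x and its bundled common_texts:

import numpy as np
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

old_kv = Word2Vec(common_texts, size=50, min_count=1).wv
unit_matrix = old_kv.vectors / np.linalg.norm(old_kv.vectors,
                                              axis=1, keepdims=True)
new_kv = create_keyed_vector(old_kv, unit_matrix)
print(new_kv.most_similar('human', topn=3))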
Example #4
import numpy as np
from gensim.models import KeyedVectors


def apply_w2v_regression(model, regression):
    """Given a word2vec model and a linear regression, apply that regression
    to all the vectors in the model.

    :param model: A gensim `KeyedVectors` or `Word2Vec` instance
    :param regression: A `sklearn.linear_model.LinearRegression` instance
    :returns: A gensim `KeyedVectors` instance
    """
    aligned_model = KeyedVectors()  # Word2Vec(); newer gensim requires KeyedVectors(vector_size)
    aligned_model.vocab = model.vocab.copy()
    aligned_model.vector_size = model.vector_size
    aligned_model.index2word = model.index2word
    # aligned_model.reset_weights()
    aligned_model.syn0 = regression.predict(model.syn0).astype(np.float32)
    return aligned_model
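
This function targets the older gensim API in which KeyedVectors() takes no constructor arguments and stores the matrix in syn0. A hedged sketch of fitting the regression on the shared vocabulary of two hypothetical pre-trained models (model_a, model_b) and applying it:

import numpy as np
from sklearn.linear_model import LinearRegression

# model_a and model_b are assumed to be pre-trained gensim models with
# overlapping vocabularies (both hypothetical).
shared = [w for w in model_a.vocab if w in model_b.vocab]
X = np.array([model_a[w] for w in shared])
Y = np.array([model_b[w] for w in shared])
regression = LinearRegression().fit(X, Y)  # maps model_a's space onto model_b's
aligned_a = apply_w2v_regression(model_a, regression)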
Example #5
from operator import itemgetter

from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Vocab


def __create_keyed_vector(matrix, orig_vocab):
    vocab = dict()
    index_to_word = []
    for word, word_id in sorted(orig_vocab.token2id.items(),
                                key=itemgetter(1)):
        index_to_word.append(word)
        vocab[word] = Vocab(index=word_id, count=orig_vocab.word_freq[word_id])
    vector_size = matrix.shape[1]

    keyed_vector = KeyedVectors(vector_size)
    keyed_vector.vector_size = vector_size
    keyed_vector.vocab = vocab
    keyed_vector.index2word = index_to_word
    keyed_vector.vectors = matrix
    assert (len(vocab), vector_size) == keyed_vector.vectors.shape
    return keyed_vector
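
Here orig_vocab is assumed to expose token2id (word to row index in matrix) and word_freq (row index to count); a hedged sketch with a tiny stand-in vocabulary (all names below are hypothetical):

import numpy as np
from collections import namedtuple

# Minimal stand-in for the original vocabulary object (hypothetical).
ToyVocab = namedtuple('ToyVocab', ['token2id', 'word_freq'])
toy_vocab = ToyVocab(token2id={'cat': 0, 'dog': 1, 'fish': 2},
                     word_freq={0: 12, 1: 9, 2: 3})
toy_matrix = np.random.rand(3, 50).astype(np.float32)

kv = __create_keyed_vector(toy_matrix, toy_vocab)
print(kv.most_similar('cat', topn=2))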
Example #6
import argparse

import torch
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Vocab


def main():
    """Entry point."""
    parser = argparse.ArgumentParser("AWD-LSTM Embeddings to Word Vectors")
    parser.add_argument("--model", required=True)
    parser.add_argument("--dictionary", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    dictionary = torch.load(args.dictionary)
    model = torch.load(args.model, map_location='cpu')
    embeddings = model[0].encoder.weight.data.cpu().numpy()

    kv = KeyedVectors(embeddings.shape[1])
    kv.syn0 = embeddings
    kv.vocab = {
        w: Vocab(index=i)
        for i, w in enumerate(dictionary.dictionary.idx2word)
    }
    kv.index2word = dictionary.dictionary.idx2word

    kv.save(args.output)
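
Once written, the file reloads as an ordinary gensim KeyedVectors object; a short sketch (the path is hypothetical and stands for whatever --output was given):

from gensim.models import KeyedVectors

kv = KeyedVectors.load('awd_lstm.wordvectors')  # hypothetical --output path
print(kv.most_similar('the', topn=5))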
Example #7
def convert(input_file_path,
            output_file_path=None,
            precision=DEFAULT_PRECISION,
            subword=False,
            subword_start=DEFAULT_NGRAM_BEG,
            subword_end=DEFAULT_NGRAM_END,
            approx=False,
            approx_trees=None,
            vocab_path=None):
    files_to_remove = []
    subword = int(subword)
    approx = int(approx)

    # If no output_file_path specified, create it in a tempdir
    if output_file_path is None:
        output_file_path = os.path.join(
            tempfile.mkdtemp(),
            fast_md5_file(input_file_path) + '.magnitude')
        if os.path.isfile(output_file_path):
            try:
                conn = sqlite3.connect(output_file_path)
                db = conn.cursor()
                db.execute(
                    "SELECT value FROM magnitude_format WHERE key='size'") \
                    .fetchall()[0][0]
                conn.close()
                # File already exists and is functioning
                return output_file_path
            except BaseException:
                pass

    # Check args
    meta_1_path = None
    meta_2_path = None
    input_is_text = input_file_path.endswith('.txt') or \
                    input_file_path.endswith('.vec')
    input_is_binary = input_file_path.endswith('.bin')
    input_is_hdf5 = input_file_path.endswith('.hdf5')
    input_is_hdf5_weights = input_file_path.endswith('_weights.hdf5')
    if not input_is_text and not input_is_binary and not input_is_hdf5:
        exit("The input file path must be `.txt`, `.bin`, `.vec`, or `.hdf5`")
    if not output_file_path.endswith('.magnitude'):
        exit("The output file path file path must be `.magnitude`")
    if vocab_path and not vocab_path.endswith(".magnitude"):
        exit("The vocab file path file path must be `.magnitude`")

    # Detect ELMo and ELMo options file
    input_is_elmo = False
    elmo_options_path = None
    if input_is_hdf5:
        if input_is_hdf5_weights:
            elmo_options_path = input_file_path[0:-13] + '_options.json'
        else:
            elmo_options_path = input_file_path[0:-5] + '.json'
        if not os.path.isfile(elmo_options_path):
            exit("Expected `" + elmo_options_path +
                 "` to exist. ELMo models require a JSON options file.")
        input_is_elmo = True
        meta_1_path = input_file_path
        meta_2_path = elmo_options_path

    # Detect GloVe format and convert to word2vec if detected
    detected_glove = False
    if input_is_text:
        with io.open(input_file_path,
                     mode="r",
                     encoding="utf-8",
                     errors="ignore") as ifp:
            line1 = None
            line2 = None
            while line1 is None or line2 is None:
                line = ifp.readline().strip()
                if len(line) > 0:
                    if line1 is None:
                        line1 = line
                    elif line2 is None:
                        line2 = line
            line1 = line1.replace('\t', ' ')
            line2 = line2.replace('\t', ' ')
            line1 = line1.split()
            line2 = line2.split()
            if len(line1) == len(line2):  # No header line present
                detected_glove = True
    if detected_glove:
        eprint("Detected GloVe format! Converting to word2vec format first..."
               "(this may take some time)")
        temp_file_path = os.path.join(
            tempfile.mkdtemp(),
            os.path.basename(input_file_path) + '.txt')
        try:
            import gensim
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
                (`pip install gensim`) to convert GloVe files.")
        gensim.scripts.glove2word2vec.glove2word2vec(input_file_path,
                                                     temp_file_path)
        input_file_path = temp_file_path
        files_to_remove.append(temp_file_path)

    # Open and load vector file
    eprint("Loading vectors... (this may take some time)")
    number_of_keys = None
    dimensions = None
    if input_is_binary:
        try:
            from gensim.models import FastText
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
                (`pip install gensim`) to convert binary files.")
        # Work with the underlying KeyedVectors so the attribute names match
        # the streaming (text) and ELMo branches below.
        keyed_vectors = FastText.load_fasttext_format(input_file_path).wv
        number_of_keys = len(keyed_vectors.vectors)
        dimensions = len(keyed_vectors.vectors[0])
    elif input_is_text:
        # Read it manually instead of with gensim so we can stream large models
        class KeyedVectors:
            pass

        def keyed_vectors_generator():
            number_of_keys, dimensions = (None, None)
            f = io.open(input_file_path,
                        mode="r",
                        encoding="utf-8",
                        errors="ignore")
            first_line = True
            for line in f:
                line_split = line.strip().replace('\t', ' ').split()
                if len(line_split) == 0:
                    continue
                if first_line:
                    first_line = False
                    number_of_keys = int(line_split[0])
                    dimensions = int(line_split[1])
                    yield (number_of_keys, dimensions)
                else:
                    empty_key = len(line_split) == dimensions
                    vec_floats = line_split if empty_key else line_split[1:]
                    key = "" if empty_key else line_split[0]
                    if len(vec_floats) > dimensions:
                        key = " ".join([key] + vec_floats[0:len(vec_floats) -
                                                          dimensions])
                        vec_floats = vec_floats[len(vec_floats) - dimensions:]
                    vector = np.asarray([float(elem) for elem in vec_floats])
                    yield (key, vector)

        keyed_vectors = KeyedVectors()
        kv_gen = keyed_vectors_generator()
        number_of_keys, dimensions = next(kv_gen)
        kv_gen_1, kv_gen_2 = tee(kv_gen)
        keyed_vectors.vectors = imap(lambda kv: kv[1], kv_gen_1)
        keyed_vectors.index2word = imap(lambda kv: kv[0], kv_gen_2)
    else:

        class KeyedVectors:
            pass

        keyed_vectors = KeyedVectors()
        number_of_keys = 0
        dimensions = 0
        keyed_vectors.vectors = []
        keyed_vectors.index2word = []

    eprint("Found %d key(s)" % number_of_keys)
    eprint("Each vector has %d dimension(s)" % dimensions)

    # Delete files if they exist
    try_deleting(output_file_path)
    try_deleting(output_file_path + "-shm")
    try_deleting(output_file_path + "-wal")

    # Temporarily re-direct the output to a tmp file
    output_file_path_tmp = output_file_path + '.tmp'
    output_file_path_orig = output_file_path
    output_file_path = output_file_path_tmp

    # Delete files if they exist
    try_deleting(output_file_path)
    try_deleting(output_file_path + "-shm")
    try_deleting(output_file_path + "-wal")

    # Connect to magnitude datastore
    conn = sqlite3.connect(output_file_path)
    db = conn.cursor()

    # Make the database fast
    conn.isolation_level = None
    db.execute("PRAGMA synchronous = OFF;")
    db.execute("PRAGMA default_synchronous = OFF;")
    db.execute("PRAGMA journal_mode = WAL;")
    db.execute("PRAGMA count_changes = OFF;")

    # Create table structure
    eprint("Creating magnitude format...")
    db.execute("DROP TABLE IF EXISTS `magnitude`;")
    db.execute("""
        CREATE TABLE `magnitude` (
            key TEXT COLLATE NOCASE,
            """ + ",\n".join([("dim_%d INTEGER" % i)
                              for i in range(dimensions)]) +
               ",\nmagnitude REAL" + """
        );
    """)
    db.execute("""
        CREATE TABLE `magnitude_format` (
            key TEXT COLLATE NOCASE,
            value INTEGER
        );
    """)
    if subword:
        db.execute("""
            CREATE VIRTUAL TABLE `magnitude_subword`
            USING fts3(
                char_ngrams,
                num_ngrams
            );
        """)
    if approx:
        db.execute("""
            CREATE TABLE `magnitude_approx` (
                trees INTEGER,
                index_file BLOB
            );
        """)

    metas = [('meta_1', meta_1_path), ('meta_2', meta_2_path)]
    for meta_name, meta_path in metas:
        if meta_path:
            db.execute("""
                CREATE TABLE `magnitude_""" + meta_name + """` (
                    meta_file BLOB
                );
            """)

    # Create annoy index
    approx_index = None
    if approx:
        approx_index = AnnoyIndex(dimensions)

    # Write vectors
    eprint("Writing vectors... (this may take some time)")
    insert_query = """
        INSERT INTO `magnitude`(
            key,
            """ + \
                   ",\n".join([("dim_%d" % i) for i in range(dimensions)]) + \
                   ",\nmagnitude" \
                   + """)
        VALUES (
            """ + \
                   (",\n".join(["?"] * (dimensions + 2))) \
                   + """
        );
    """
    insert_subword_query = """
        INSERT INTO `magnitude_subword`(
            char_ngrams,
            num_ngrams
        )
        VALUES (
            ?, ?
        );
    """
    counters = [Counter() for i in range(dimensions)]
    key_vectors_iterable = izip(keyed_vectors.index2word,
                                keyed_vectors.vectors)
    progress = -1
    db.execute("BEGIN;")
    for i, (key, vector) in enumerate(key_vectors_iterable):
        current_progress = int((float(i) / float(number_of_keys)) * 100)
        if current_progress > progress:
            progress = current_progress
            eprint("%d%% completed" % progress)
        if i % 100000 == 0:
            db.execute("COMMIT;")
            db.execute("BEGIN;")
        magnitude = np.linalg.norm(vector)
        vector = vector / magnitude
        epsilon = np.random.choice(
            [-1.0 / (10**precision), 1.0 / (10**precision)], dimensions)
        vector = epsilon if np.isnan(vector).any() else vector
        for d, v in enumerate(vector):
            counters[d][int(v * 100)] += 1
        db.execute(insert_query, (key, ) +
                   tuple(int(round(v * (10**precision)))
                         for v in vector) + (float(magnitude), ))  # noqa
        if subword:
            ngrams = set(
                (n.lower()
                 for n in char_ngrams(BOW + key +
                                      EOW, subword_start, subword_end)))
            num_ngrams = len(ngrams) * 4
            ngrams = set(
                (n for n in ngrams
                 if not any([c in SQLITE_TOKEN_SPLITTERS for c in n])))
            db.execute(insert_subword_query, (" ".join(ngrams), num_ngrams))
        if approx:
            approx_index.add_item(i, vector)
    eprint("Committing written vectors... (this may take some time)")
    db.execute("COMMIT;")

    # Figure out which dimensions have the most entropy
    entropies = [(d, entropy(counter)) for d, counter in enumerate(counters)]
    entropies.sort(key=lambda e: e[1], reverse=True)
    for e in entropies:
        eprint("Entropy of dimension %d is %f" % (e[0], e[1]))
    highest_entropy_dimensions = [e[0] for e in entropies]

    # Writing metadata
    insert_format_query = """
        INSERT INTO `magnitude_format`(
            key,
            value
        )
        VALUES (
            ?, ?
        );
    """

    db.execute(insert_format_query, ('version', CONVERTER_VERSION))
    db.execute(insert_format_query, ('elmo', input_is_elmo))
    db.execute(insert_format_query, ('size', number_of_keys))
    db.execute(insert_format_query, ('dim', dimensions))
    db.execute(insert_format_query, ('precision', precision))
    if subword:
        db.execute(insert_format_query, ('subword', subword))
        db.execute(insert_format_query, ('subword_start', subword_start))
        db.execute(insert_format_query, ('subword_end', subword_end))
    if approx:
        if approx_trees is None:
            approx_trees = max(50, int((number_of_keys / 3000000.0) * 50.0))
        db.execute(insert_format_query, ('approx', approx))
        db.execute(insert_format_query, ('approx_trees', approx_trees))
    for d in highest_entropy_dimensions:
        db.execute(insert_format_query, ('entropy', d))

    # Create indices
    eprint("Creating search index... (this may take some time)")
    db.execute("CREATE INDEX `magnitude_key_idx` ON `magnitude` (key);")
    for i in highest_entropy_dimensions[0:1]:
        eprint("Creating spatial search index for dimension %d "
               "(it has high entropy)... (this may take some time)" % i)
        db.execute("""
            CREATE INDEX `magnitude_dim_%d_idx` ON `magnitude` (dim_%d);
        """ % (i, i))

    # Write approximate index to the database
    if approx:
        eprint("Creating approximate nearest neighbors index... \
(this may take some time)")
        approx_index.build(approx_trees)
        approx_index_file_path = os.path.join(
            tempfile.mkdtemp(),
            fast_md5_file(input_file_path) + '.ann')
        eprint("Dumping approximate nearest neighbors index... \
(this may take some time)")
        approx_index.save(approx_index_file_path)
        eprint("Compressing approximate nearest neighbors index... \
(this may take some time)")
        chunk_size = 104857600
        full_size = os.path.getsize(approx_index_file_path)
        insert_approx_query = """
            INSERT INTO magnitude_approx(trees, index_file) VALUES (?, ?);
        """
        with open(approx_index_file_path, 'rb') as ifh, \
                lz4.frame.LZ4FrameCompressor() as compressor:
            for i, chunk in enumerate(iter(partial(ifh.read, chunk_size),
                                           b'')):
                if i == 0:
                    chunk = compressor.begin() + compressor.compress(chunk)
                else:
                    chunk = compressor.compress(chunk)
                eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%")
                if len(chunk) > 0:
                    db.execute(insert_approx_query,
                               (approx_trees, sqlite3.Binary(chunk)))
            chunk = compressor.flush()
            if len(chunk) > 0:
                db.execute(insert_approx_query,
                           (approx_trees, sqlite3.Binary(chunk)))
        files_to_remove.append(approx_index_file_path)

    for meta_name, meta_path in metas:
        if not meta_path:
            continue
        eprint("Compressing meta file... \
(this may take some time)")
        chunk_size = 104857600
        full_size = os.path.getsize(meta_path)
        insert_meta_query = """
            INSERT INTO magnitude_""" + meta_name + """(meta_file)
            VALUES (?);
        """
        with open(meta_path, 'rb') as ifh, \
                lz4.frame.LZ4FrameCompressor() as compressor:
            for i, chunk in enumerate(iter(partial(ifh.read, chunk_size),
                                           b'')):
                if i == 0:
                    chunk = compressor.begin() + compressor.compress(chunk)
                else:
                    chunk = compressor.compress(chunk)
                eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%")
                if len(chunk) > 0:
                    db.execute(insert_meta_query, (sqlite3.Binary(chunk), ))
            chunk = compressor.flush()
            if len(chunk) > 0:
                db.execute(insert_meta_query, (sqlite3.Binary(chunk), ))

    # Clean up
    if len(files_to_remove) > 0:
        eprint("Cleaning up temporary files...")
        for file_to_remove in files_to_remove:
            try_deleting(file_to_remove)

    # Calculate max duplicate keys
    eprint("Finding duplicate keys... (this may take some time)")
    duplicate_keys_query = db.execute("""
        SELECT MAX(key_count)
        FROM (
            SELECT COUNT(key)
            AS key_count
            FROM magnitude
            GROUP BY key
        );
    """).fetchall()
    max_duplicate_keys = (duplicate_keys_query[0][0]
                          if duplicate_keys_query[0][0] is not None else 1
                          )  # noqa
    eprint("Found %d as the maximum number of duplicate key(s)" %
           max_duplicate_keys)
    db.execute(insert_format_query, ('max_duplicate_keys', max_duplicate_keys))

    # VACUUM
    eprint("Vacuuming to save space... (this may take some time)")
    db.execute("VACUUM;")

    # Restore safe database settings
    db.execute("PRAGMA synchronous = FULL;")
    db.execute("PRAGMA default_synchronous = FULL;")
    db.execute("PRAGMA journal_mode = DELETE;")
    db.execute("PRAGMA count_changes = ON;")

    # Clean up connection
    conn.commit()
    conn.close()
    files_to_remove.append(output_file_path + "-shm")
    files_to_remove.append(output_file_path + "-wal")

    # Clean up
    if len(files_to_remove) > 0:
        eprint("Cleaning up temporary files...")
        for file_to_remove in files_to_remove:
            try_deleting(file_to_remove)

    # Rename the temporary output file to the real output
    os.rename(output_file_path, output_file_path_orig)
    output_file_path = output_file_path_orig

    # Print success
    eprint("Successfully converted '%s' to '%s'!" %
           (input_file_path, output_file_path))

    return output_file_path
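
A minimal invocation sketch for the converter above (the function is adapted from pymagnitude's converter and relies on the rest of that module's helpers and constants); paths are hypothetical:

# Convert a word2vec/GloVe text file into a .magnitude file, with a
# subword index but no approximate-nearest-neighbour index.
out_path = convert('vectors.txt',
                   output_file_path='vectors.magnitude',
                   subword=True,
                   approx=False)
print("Wrote", out_path)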
Example #8
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--pretrain", action="store_true")
    parser.add_argument("-t", "--center_trainable", action="store_true")
    parser.add_argument("-w", "--weight", type=float, default="0.5")
    parser.add_argument("-r", "--restrict", type=float, default="1")
    parser.add_argument("-c", "--cross_validate", type=int, default=0)
    parser.add_argument("-g", "--GPU", type=str, default="1")
    parser.add_argument("-e", "--num_epoch", type=int, default=20)
    parser.add_argument("-l",
                        "--relational_embedding_size",
                        type=int,
                        default=10)
    args = parser.parse_args()
    print(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.GPU
    statsPath = "data/2018_corpus_stats.pkl"

    num_cross = str(args.cross_validate)

    train_data = DataLoader()
    train_data.read_stats(statsPath)
    train_data.load_pairs_counting("data/train_count.pkl")
    train_data.load_argument_sample_table("data/argument_sample_table.p")

    test_data = DataLoader()
    test_data.read_stats(statsPath)
    test_data.load_pairs_counting("data/test_count.pkl")
    test_data.load_argument_sample_table("data/argument_sample_table.p")

    modelPath = "data/stage_one.model"
    word2vecModel = Word2Vec.load(modelPath)

    context_wv = KeyedVectors(vector_size=300)
    context_wv.vocab = word2vecModel.wv.vocab
    context_wv.index2word = word2vecModel.wv.index2word
    context_wv.syn0 = word2vecModel.syn1neg

    pretrain_center_emb = list()
    pretrain_context_emb = list()

    counter = 0

    for i in range(len(train_data.id2word)):
        tmp_w = train_data.id2word[i]
        if tmp_w in context_wv.vocab:
            pretrain_center_emb.append(word2vecModel[tmp_w])
            pretrain_context_emb.append(context_wv[tmp_w])
        else:
            pretrain_center_emb.append(np.zeros(300))
            pretrain_context_emb.append(np.zeros(300))
            counter += 1

    print("empty count", counter)

    pretrain_center_emb = np.asarray(pretrain_center_emb)

    wordsim_dir = "./Word-similarity-dataset/Simlex/"

    with open(wordsim_dir + "verb.json", "r") as f:
        verb_list = json.load(f)

    with open(wordsim_dir + "noun.json", "r") as f:
        noun_list = json.load(f)

    with open(wordsim_dir + "adjective.json", "r") as f:
        adjective_list = json.load(f)

    with open(wordsim_dir + "all.json", "r") as f:
        all_list = json.load(f)

    simlex_corpora = [verb_list, noun_list, adjective_list, all_list]
    simlex_names = ["verb_list", "noun_list", "adjective_list", "all_list"]

    m = Model(train_data, args)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        if args.pretrain:

            # Initial assign
            sess.run(m.emb_init,
                     feed_dict={m.emb_placeholder: pretrain_center_emb})
            sess.run(
                m.emb_init_context,
                feed_dict={m.emb_placeholder_context: pretrain_context_emb})

        ws_test(sess, m, test_data, simlex_corpora, num_cross,
                args.relational_embedding_size, args.restrict)
        sp10k_test_overall(sess, m, train_data, args.relational_embedding_size,
                           args.restrict)
        test_keller_overall(sess, m, train_data)

        num_epoch = args.num_epoch
        num_batch = 256

        batch_size = 1024
        for epoch in range(num_epoch):

            print(" epoch:", str(epoch + 1), "/", num_epoch)

            process_bar = tqdm(range(num_batch))
            for i in process_bar:
                batch = train_data.get_sd_train_batch(batch_size)

                feed_dict = {
                    m.predicate_amod_ids: batch["amod"][:, 0],
                    m.argument_amod_ids: batch["amod"][:, 1],
                    m.argument_prime_amod_ids: batch["amod"][:, 2],
                    m.predicate_nsubj_ids: batch["nsubj"][:, 0],
                    m.argument_nsubj_ids: batch["nsubj"][:, 1],
                    m.argument_prime_nsubj_ids: batch["nsubj"][:, 2],
                    m.predicate_dobj_ids: batch["dobj"][:, 0],
                    m.argument_dobj_ids: batch["dobj"][:, 1],
                    m.argument_prime_dobj_ids: batch["dobj"][:, 2],
                }

                loss, _ = sess.run([m.loss, m.optimize], feed_dict=feed_dict)

                process_bar.set_description("Loss: %0.4f" % loss)

            # test
            ws_test(sess, m, test_data, simlex_corpora, num_cross,
                    args.relational_embedding_size, args.restrict)
            sp10k_test_overall(sess, m, train_data,
                               args.relational_embedding_size, args.restrict)
            test_keller_overall(sess, m, train_data)
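
The pretraining block above boils down to building an embedding matrix aligned with the task's id2word list, with zero rows for out-of-vocabulary words. A self-contained sketch of that pattern, assuming gensim 3.x and a toy vocabulary (names hypothetical):

import numpy as np
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

w2v = Word2Vec(common_texts, size=300, min_count=1, negative=5)
id2word = ['human', 'computer', 'unseen_word']   # hypothetical task vocabulary
emb = np.zeros((len(id2word), w2v.wv.vector_size), dtype=np.float32)
oov = 0
for i, word in enumerate(id2word):
    if word in w2v.wv.vocab:
        emb[i] = w2v.wv[word]   # rows for OOV words stay all-zero
    else:
        oov += 1
print("empty count", oov)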
Example #9
print("text analysis")

# text += print_info_length(corpus_labels, lines_corpus_splitted, "corpus docs" + conf, "words", True)
text += print_info_length(queries_labels, lines_queries_splitted,
                          "queries" + conf, "words", True)

text += '\n' + str(corpus_model)

print("done.")

w1 = "night"

outv = KeyedVectors(300)
outv.vocab = corpus_model.wv.vocab  # same
outv.index2word = corpus_model.wv.index2word  # same
outv.syn0 = corpus_model.syn1neg  # different

text += '\nIN EMBEDDINGS COMPARISON:\n' + str(
    corpus_model.wv.most_similar(positive=[corpus_model[w1]], topn=6))
print("IN-IN done.")
text += '\nOUT EMBEDDINGS COMPARISON:\n' + str(
    outv.most_similar(positive=[outv[w1]], topn=6))
print("OUT-OUT done.")
text += '\nIN-OUT EMBEDDINGS COMPARISON:\n' + str(
    corpus_model.wv.most_similar(positive=[outv[w1]], topn=6))
print("IN-OUT done.")
text += '\nOUT-IN EMBEDDINGS COMPARISON:\n' + str(
    outv.most_similar(positive=[corpus_model[w1]], topn=6))
print("OUT-IN done.")
Example #10
for query_id, query in tqdm(queries_obj.items()):
    encoded_queries[query_id] = encode(query.title, word_dict)
    print(query.title, encoded_queries[query_id])
    encoded_queries_oov[query_id] = encode_oov(query.title, word_dict)

print(encoded_queries_oov)

idf_filename = "preprocessing/pre_data/idfs/idfs" + conf
idfs = load_from_pickle_file(idf_filename)

idfs = encode_idf(idfs, word_dict)

if not glv:
    outv = KeyedVectors(300)
    outv.vocab = model.wv.vocab  # same
    outv.index2word = model.wv.index2word  # same
    outv.syn0 = model.syn1neg  # different
    we_out = encode_we(outv, word_dict, glv)

we = encode_we(model, word_dict, glv)

max_query_len = max([len(q.title.split()) for q in queries_obj.values()])

padded_query_idfs = {}
padded_query_embs = {}

print("Encoding padded queries idf and embeddings")

# padding queries idfs and queries embeddings
for query_id, query in tqdm(encoded_queries.items()):
    padded_query_idfs[query_id] = []