def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)
    parser.add_argument('-k', '--num_clusters', type=int, required=True)
    parser.add_argument('-m', '--metric', type=str, default='cosine')
    parser.add_argument('-o', '--output', type=str)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    args = parser.parse_args()
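    # Illustrative invocation (script and file names are hypothetical): cluster a
    # single embedding with -i, or every embedding file in a directory with -d:
    #   python3 cluster_embeddings.py -i my_embedding.vec -k 100 -m cosine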

    if args.embedding is not None:
        output = (''.join(args.embedding.split('/')[-1].split('.')[:-1])
                  + '_' + str(args.metric) + '_' + str(args.num_clusters) + '.clusters'
                  if args.output is None else args.output)

        kmeans4embedding(args.embedding, output, args.num_clusters, args.metric, args.batch_size)

    else:
        files = [os.path.join(args.directory, f) for f in os.listdir(args.directory) if os.path.isfile(os.path.join(args.directory, f))]
        for i_file, file in enumerate(files):
            printTrace('==> Doing clustering for embedding ' + str(i_file + 1) + ' of ' + str(len(files)) + ' : ' + str(file))

            if args.output is None:
                if not os.path.exists('Clustering'):
                    os.makedirs('Clustering')

            output = (('Clustering/' if args.output is None else args.output)
                      + ''.join(file.split('/')[-1].split('.')[:-1])
                      + '_' + str(args.metric) + '_' + str(args.num_clusters) + '.clusters')

            kmeans4embedding(file, output, args.num_clusters, args.metric, args.batch_size)
Example #2
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)
    parser.add_argument('-o', '--output', required=True)

    parser.add_argument('-v', '--vocab', default=None)
    parser.add_argument('-nl', '--length_normalize', action='store_true')
    parser.add_argument('-nd',
                        '--normalize_dimensionwise',
                        action='store_true')
    parser.add_argument('-l', '--lower', action='store_true')

    outputtype = parser.add_mutually_exclusive_group(required=True)
    outputtype.add_argument('-w2v', '--word2vec', action='store_true')
    outputtype.add_argument('-glv', '--glove', action='store_true')
    args = parser.parse_args()
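    # Illustrative invocation (script and file names are hypothetical): convert a
    # single embedding (-i) or every file in a directory (-d); with -d, --output is
    # used as the output directory:
    #   python3 convert_embedding.py -i my_embedding.bin -o my_embedding.vec -w2v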

    if args.embedding:
        emb_converter(args.embedding, args.output, args)
    else:
        files = [
            os.path.join(args.directory, f) for f in os.listdir(args.directory)
            if os.path.isfile(os.path.join(args.directory, f))
        ]

        for i_file, file in enumerate(files):
            printTrace('Converting Embedding ' + str(i_file + 1) + ' of ' +
                       str(len(files)) + ' : ' + str(file))
            emb_converter(file, args.output + '/' + file.split('/')[-1], args)
def kmeans4embedding(embedding_path, output_path, k, metric, batch_size):
    printTrace('Loading embedding ' + str(embedding_path))

    emb = load_embedding(embedding_path, lower=False, length_normalize=False, delete_duplicates=True)

    printTrace('Clustering for embedding ' + str(embedding_path))
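    # doKmeans assigns a cluster id (0..k-1) to every row of emb.vectors, i.e. one
    # label per vocabulary word.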

    labels = doKmeans(emb.vectors, k, metric, batch_size)

    printTrace('Printing clusters for embedding ' + str(embedding_path))

    with open(output_path, 'w') as file:
        for i_label, label in enumerate(labels):
            print(emb.vocabulary.index_to_word(i_label) + ' ' + str(label), file=file)

    printTrace('Sorting clusters for embedding ' + str(embedding_path))
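    # Sort the word->cluster file numerically by its second column (the cluster id)
    # with the external `sort` command, then replace the unsorted file.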

    excec_com = 'sort -k2 -n ' + str(output_path) + ' > ' + str(output_path) + '_sorted'
    print(excec_com)
    os.system(excec_com)
    excec_com = 'rm ' + str(output_path)
    print(excec_com)
    os.system(excec_com)
    excec_com = 'mv ' + str(output_path) + '_sorted ' + str(output_path)
    print(excec_com)
    os.system(excec_com)

    printTrace('Done, clusters saved in ' + str(output_path))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory', type=str)
    parser.add_argument('-o', '--output_directory', type=str)
    parser.add_argument('-m',
                        '--method',
                        choices=['PCA', 'tSVD', 'DRA'],
                        default='DRA')
    parser.add_argument('-n', '--reduce_to', type=int, default=300)
    parser.add_argument('-b', '--do_in_batches', action='store_true')

    args = parser.parse_args()
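    # Illustrative invocation (script and directory names are hypothetical); note the
    # trailing slash on -o, since the output path is built by plain concatenation:
    #   python3 reduce_embeddings.py -d embeddings/ -o reduced/ -m DRA -n 300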

    files = [
        os.path.join(args.directory, f) for f in os.listdir(args.directory)
        if os.path.isfile(os.path.join(args.directory, f))
    ]

    for i_file, file in enumerate(files):
        printTrace('Dimensionality reduction: Embedding ' + str(i_file + 1) +
                   ' of ' + str(len(files)) + ' : ' + str(file))

        excec_com = 'python3 dimensionality_reduction.py -i ' + str(file) + ' -m ' + str(args.method) + ' -o ' +\
                    args.output_directory + file.split('/')[-1] + '_' + str(args.method) +'.vec -n ' +\
                    str(args.reduce_to) + (' -b ' if args.do_in_batches else '')
        print(excec_com)
        os.system(excec_com)
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)

    parser.add_argument('-b', '--batch_size', type=int, default=512)
    parser.add_argument('-dic',
                        '--dictionary_path',
                        type=str,
                        default='DictionaryInductionDataset/es-en.test')
    parser.add_argument('-p', '--add_lang_prefix', action='store_true')

    args = parser.parse_args()
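    # Evaluate a single embedding (-i) or every embedding in a directory (-d) on
    # dictionary induction; one CSV row per embedding is appended to
    # Results/dictionary_induction.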

    emb_list = []
    if args.embedding is not None:
        emb_list.append(args.embedding)
    else:
        emb_list = [
            os.path.join(args.directory, f) for f in os.listdir(args.directory)
            if os.path.isfile(os.path.join(args.directory, f))
        ]

    if not os.path.exists('Results'):
        os.makedirs('Results')

    for emb_i, emb_path in enumerate(emb_list):

        printTrace('Evaluating Embedding ' + str(emb_i + 1) + ' of ' +
                   str(len(emb_list)) + ' : ' + str(emb_path))
        emb = load_embedding(emb_path,
                             lower=False,
                             length_normalize=True,
                             delete_duplicates=True)

        top1, top2, top3, top5, top10, coverage = evaluate_dictionary_induction(
            emb, args.dictionary_path, args.batch_size, emb_path,
            args.add_lang_prefix)

        with open('Results/dictionary_induction', 'a+') as file:
            print(','.join([
                str(emb_path),
                str(top1),
                str(top2),
                str(top3),
                str(top5),
                str(top10),
                str(coverage)
            ]),
                  file=file)

    print('Results have been exported in csv format to the Results folder')
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')
    format = 'bin' if path_input.split('/')[-1].split('.')[-1] == 'bin' else 'text'

    emb = load_embedding(path_input, format=format,
                             vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
                             length_normalize=args.length_normalize,
                             normalize_dimensionwise=args.normalize_dimensionwise, to_unicode=True,
                             lower=args.lower, path2='', delete_duplicates=True, method_vgg="delete")

    printTrace('Saving result to ' + str(path_output) + '...')

    num_words = 0
    with open(path_output, 'w+') as file:
        for i_word, word in enumerate(emb.words):

            if i_word % 5000 == 0:
                string = "<" + str(datetime.datetime.now()) + ">  " + 'Converting : ' + str(
                    int(100 * i_word / len(emb.words))) + '%'
                print(string, end="\r")
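            # If languages were requested, keep only words whose language tag (any of
            # the args.delimiter-separated parts) matches one of them, and write the
            # word without its prefix followed by its vector.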
            if args.language is None or any(l in word.split(args.delimiter) for l in args.language):
                print(word.split(args.delimiter)[-1] + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]), file=file)
                num_words += 1

    print()

    if args.word2vec:
        excec_com = 'sed -i \'1s/^/' + str(num_words) + ' ' + str(emb.dims) + '\\n/\' ' + str(path_output)
        print(excec_com)
        os.system(excec_com)

    printTrace('Done.')
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embeddings', nargs='+', required=True)
    parser.add_argument('-o', '--output', type=str, required=True)
    args = parser.parse_args()

    printTrace('Loading vocabulary from embeddings...')
    vocab_embeddings = [vocab_from_path(x) for x in args.embeddings]
    union_vocab = (set.union(*vocab_embeddings))
    printTrace('The union of the vocabulary has ' + str(len(union_vocab)) +
               ' words.')
    printTrace('Printing vocabulary in ' + args.output + '...')
    with open(args.output, 'w+') as file:
        for word in union_vocab:
            print(word, file=file)
Example #8
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')
    format = 'bin' if path_input.split('/')[-1].split(
        '.')[-1] == 'bin' else 'text'

    emb = load_embedding(
        path_input,
        format=format,
        vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
        length_normalize=args.length_normalize,
        normalize_dimensionwise=args.normalize_dimensionwise,
        to_unicode=True,
        lower=args.lower,
        path2='',
        delete_duplicates=True,
        method_vgg="delete")

    printTrace('Saving result to ' + str(path_output) + '...')
    emb.export(path=path_output, printHeader=args.word2vec)

    printTrace('Done.')
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--embedding", type=str, required=True)
    parser.add_argument("-c", "--emb_4_generation", type=str, required=True)
    parser.add_argument("-d", "--dataset", type=str, required=True)
    parser.add_argument("-b", "--batch_size", type=int, default=1024)
    parser.add_argument("-k", "--num_nearest_neighbor", type=int, default=10)

    args = parser.parse_args()
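    # Rebuild the vectors of the dataset words from their k nearest neighbours in
    # the emb_4_generation space, then compare the generated vectors against the
    # original ones on the word-similarity dataset.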

    dims = get_dimensions(args.embedding)

    if dims != get_dimensions(args.emb_4_generation):
        raise ValueError(
            "All the embeddings must have the same number of dimensions and the embeddings must be in the word2vec format"
        )

    printTrace("Reading vocab...")

    vocab_emb = vocab_from_path(args.embedding)
    vocab_cross = vocab_from_path(args.emb_4_generation)
    dataset = get_dataset(args.dataset)
    vocab_to_generate = list(
        set(np.append((dataset.X[:, 0]), (dataset.X[:, 1]))))
    vocab_to_generate_set = set(vocab_to_generate)
    vocab_emb_delete = [x for x in vocab_emb if x not in vocab_to_generate_set]

    total_vocab = set.union(set(vocab_emb_delete), set(vocab_cross))
    interset_vocab = list(
        set.intersection(set(vocab_emb_delete), set(vocab_cross)))

    print("Final embedding will have " + str(len(total_vocab)) + " words")
    print("We will generate " + str(len(vocab_to_generate)) + " words")

    emb = load_embedding(
        args.emb_4_generation,
        vocabulary=None,
        lower=False,
        length_normalize=True,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    m = emb.words_to_matrix(vocab_to_generate)
    M = emb.words_to_matrix(interset_vocab)

    nn = []
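    # For every word to generate, find its k nearest neighbours (by cosine) among
    # the words shared by both embeddings, in the emb_4_generation space.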

    for i_batch, mb in enumerate(batch(m, args.batch_size)):

        string = ("<" + str(datetime.datetime.now()) + ">  " +
                  "Using Embedding " + str(args.emb_4_generation) +
                  " to generate vocab for Embedding " + str(args.embedding) +
                  ":  " + str(int(100 *
                                  (i_batch * args.batch_size) / len(m))) + "%")
        print(string, end="\r")

        # print(np.asarray(mb).shape)
        # print(np.asarray(M).shape)

        result = cosine_knn(mb, M, args.num_nearest_neighbor)

        for i_result, indexes in enumerate(result):
            nn.append([interset_vocab[i] for i in indexes])

    del emb

    printTrace("===> Generating new_vocab <===")

    emb = load_embedding(
        args.embedding,
        vocabulary=vocab_emb_delete,
        lower=False,
        length_normalize=False,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    new_vectors = []
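    # Each generated vector is the average of its neighbours' vectors taken from
    # args.embedding.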
    for i_word, word in enumerate(vocab_to_generate):
        if i_word % 1000 == 0:
            string = ("<" + str(datetime.datetime.now()) + ">  " +
                      "Generating vocab " + ": " +
                      str(int(100 * i_word / len(vocab_to_generate))) + "%")
            print(string, end="\r")

        try:
            lw = nn[i_word]
            v = np.zeros([dims], dtype=float)
            for word_nn in lw:
                v += emb.word_to_vector(word_nn)

        except KeyError as r:
            raise ValueError(
                "Something went wrong in the word generation process")

        new_vectors.append(v / args.num_nearest_neighbor)

    print()

    del emb

    printTrace("===> Loading embeddings to compare <===")
    emb_generated = Embedding(vocabulary=Vocabulary(vocab_to_generate),
                              vectors=new_vectors)
    emb_original = load_embedding(
        args.embedding,
        vocabulary=vocab_to_generate,
        lower=False,
        length_normalize=False,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    printTrace("===> Evaluate <===")

    print("Original Embedding: ", end="")
    print(
        similarity_emd(
            emb_original,
            dataset.X,
            dataset.y,
            backoff_vector=None,
            lower=False,
            lang1prefix=None,
            lang2prefix=None,
        ))
    print("Generated Embedding: ", end="")
    print(
        similarity_emd(
            emb_generated,
            dataset.X,
            dataset.y,
            backoff_vector=None,
            lower=False,
            lang1prefix=None,
            lang2prefix=None,
        ))
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--embeddings", nargs="+", required=True)
    parser.add_argument("-t", "--rotate_to", required=True)
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-v", "--vocabulary", default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=256)
    parser.add_argument("-k", "--num_nearest_neighbor", type=int, default=10)
    parser.add_argument("-r", "--retrofitting", default=None)
    parser.add_argument("-rn", "--retrofitting_n_iters", type=int, default=10)
    # parser.add_argument('-n', '--do_not_normalize_embs', default=False)
    parser.add_argument("-ir", "--do_not_retrofit_rotate_to", default=False)
    parser.add_argument("-nc", "--do_not_clean_files", default=False)
    parser.add_argument("-oov", "--generate_oov_words", action="store_false")

    args = parser.parse_args()
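    # Pipeline, as implemented below: optionally retrofit every input embedding,
    # build seed dictionaries from shared vocabulary, normalize the embeddings with
    # VecMap (unit length + mean centering), map each one onto the rotate_to space
    # with an orthogonal transformation, and finally average the mapped embeddings
    # into the meta embedding with embeddings_mean.py.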

    is_rot_in_input = None

    for emb_i, emb in enumerate(args.embeddings):
        if emb == args.rotate_to:
            is_rot_in_input = emb_i

    if not os.path.exists("tmp"):
        os.makedirs("tmp")

    print(
        "tmp folder created, it will be deleted at the end of the execution (unless you have run the program with the -nc True option)"
    )

    if args.retrofitting is not None:
        printTrace("==> Retrofitting <==")
        for emb_i, emb in enumerate(args.embeddings):
            string = (str(emb_i + 1) + " of " +
                      str(len(args.embeddings)
                          if is_rot_in_input is not None or args.do_not_retrofit_rotate_to
                          else len(args.embeddings) + 1))
            print(string)
            excec_com = ("python3 Retrofitting/retrofit.py -i " + str(emb) +
                         " -l " + str(args.retrofitting) + " -n " +
                         str(args.retrofitting_n_iters) + " -o " + "tmp/" +
                         str(emb_i) + ".retro -d " + str(get_dimensions(emb)))
            print(excec_com)
            os.system(excec_com)

        if is_rot_in_input is not None and not args.do_not_retrofit_rotate_to:
            string = (str(len(args.embeddings) + 1) + " of " +
                      str(len(args.embeddings)) if is_rot_in_input is not None
                      or args.do_not_retrofit_rotate_to else
                      str(len(args.embeddings) + 1))
            print(string)
            excec_com = ("python3 Retrofitting/retrofit.py -i " +
                         str(args.rotate_to) + " -l " +
                         str(args.retrofitting) + " -n " +
                         str(args.retrofitting_n_iters) + " -o " + "tmp/" +
                         "out.retro  -d " + str(get_dimensions(emb)))
            print(excec_com)
            os.system(excec_com)

        print()

    printTrace("==> Generating dictionaries for the mapping <==")

    for emb_i, emb in enumerate(args.embeddings):
        string = str(emb_i + 1) + " of " + str(len(args.embeddings))
        print(string)
        print_dictionary_for_vecmap(
            "tmp/" + str(emb_i) + ".dict",
            generate_dictionary_for_vecmap(path1=emb, path2=args.rotate_to),
        )

    print()

    printTrace("==> Normalizing Embeddings <==")

    for emb_i, emb in enumerate(args.embeddings):
        string = (str(emb_i + 1) + " of " + str(
            len(args.embeddings) if is_rot_in_input is not None else str(
                len(args.embeddings) + 1)))
        print(string)
        excec_com = ("python3 VecMap/normalize_embeddings.py unit center -i " +
                     (emb if args.retrofitting is None else "tmp/" +
                      str(emb_i) + ".retro") + " -o tmp/" + str(emb_i) +
                     ".norm")
        print(excec_com)
        os.system(excec_com)

    if is_rot_in_input is None:
        string = str(len(args.embeddings) +
                     1) + " of " + str(len(args.embeddings) + 1)
        print(string)
        excec_com = ("python3 VecMap/normalize_embeddings.py unit center -i " +
                     (args.rotate_to if args.retrofitting is None
                      or args.do_not_retrofit_rotate_to else "tmp/out.retro") +
                     " -o tmp/out.norm")
        print(excec_com)
        os.system(excec_com)

    print()

    printTrace("==> Mapping Embeddings <==")

    for emb_i, emb in enumerate(args.embeddings):

        if is_rot_in_input is None or (is_rot_in_input is not None
                                       and is_rot_in_input != emb_i):

            string = (str(emb_i + 1) + " of " +
                      str(len(args.embeddings) - 1) if is_rot_in_input
                      is not None else str(len(args.embeddings) + 1))
            print(string)

            source_input = "tmp/" + str(emb_i) + ".norm"
            target_input = ("tmp/out.norm" if is_rot_in_input is None else
                            "tmp/" + str(is_rot_in_input) + ".norm")
            source_output = "tmp/" + str(emb_i) + ".vecmap"
            target_output = "tmp/out.vecmap"
            dictionary = "tmp/" + str(emb_i) + ".dict"

            excec_com = ("python3 VecMap/map_embeddings.py --orthogonal " +
                         source_input + " " + target_input + " " +
                         source_output + " " + target_output + " -d " +
                         dictionary)
            print(excec_com)
            os.system(excec_com)

    print()

    printTrace("==> Generating Meta Embedding <==")

    embs = ""
    for emb_i, emb in enumerate(args.embeddings):
        if is_rot_in_input is None or (is_rot_in_input is not None
                                       and is_rot_in_input != emb_i):
            embs = embs + "tmp/" + str(emb_i) + ".vecmap "

    if is_rot_in_input is not None:
        embs = embs + "tmp/out.vecmap "

    excec_com = ("python3 embeddings_mean.py -i " + embs + "-o " +
                 args.output + " -b " + str(args.batch_size) + " -k " +
                 str(args.num_nearest_neighbor))

    if not args.generate_oov_words:
        excec_com = excec_com + " -oov"
    if args.vocabulary is not None:
        excec_com = excec_com + " -v " + args.vocabulary
    print(excec_com)
    os.system(excec_com)

    print()
    print("Done! Meta embedding generated in " + args.output)

    if not args.do_not_clean_files:
        print("Cleaning files...")
        try:
            os.system("rm -rf tmp")
        except:
            print("Could not delete the tmp folder, do it manually")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-c', '--cross_embedding', required=True)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    parser.add_argument('-k', '--num_nearest_neighbor', type=int, default=10)


    args = parser.parse_args()
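    # Generate the words that appear in cross_embedding but are missing from the
    # input embedding: each missing word gets the average of its neighbours' vectors
    # taken from the input embedding, where the neighbours are found in the
    # cross_embedding space over the shared vocabulary.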


    dims = get_dimensions(args.embedding)

    if dims != get_dimensions(args.cross_embedding):
        raise ValueError('All the embeddings must have the same number of dimensions and the embeddings must be in the word2vec format')

    printTrace('Reading vocab...')

    vocab_emb = vocab_from_path(args.embedding)
    vocab_cross = vocab_from_path(args.cross_embedding)

    total_vocab = set.union(set(vocab_emb), set(vocab_cross))
    interset_vocab = list(set.intersection(set(vocab_emb), set(vocab_cross)))
    vocab_to_generate = set(vocab_cross) - set(vocab_emb)

    print('Final embedding will have ' + str(len(total_vocab)) + ' words')
    print('We will generate ' + str(len(vocab_to_generate)) + ' words')

    emb = load_embedding(args.cross_embedding, vocabulary=None, lower=False, length_normalize=True, normalize_dimensionwise=False,
                         delete_duplicates=True)

    m = emb.words_to_matrix(vocab_to_generate)

    M = emb.words_to_matrix(interset_vocab)

    nn=[]

    for i_batch, mb in enumerate(batch(m, args.batch_size)):

        string = "<" + str(datetime.datetime.now()) + ">  " + 'Using Embedding ' + str(
            args.cross_embedding) + ' to generate vocab for Embedding ' + str(args.embedding) + ':  ' + str(
            int(100 * (i_batch * args.batch_size) / len(m))) + '%'
        print(string, end="\r")

        # print(np.asarray(mb).shape)
        # print(np.asarray(M).shape)


        result = cosine_knn(mb, M, args.num_nearest_neighbor)

        for i_result, indexes in enumerate(result):
            nn.append([interset_vocab[i] for i in indexes])

    del emb


    printTrace('===> Generating new_vocab <===')

    emb = load_embedding(args.embedding, vocabulary=None, lower=False, length_normalize=False, normalize_dimensionwise=False,
                         delete_duplicates=True)



    new_vectors = []
    for i_word, word in enumerate(vocab_to_generate):
        if i_word%1000 == 0:
            string = "<" + str(datetime.datetime.now()) + ">  " + 'Generating vocab ' + args.output + ': ' + str(
                int(100 * i_word / len(vocab_to_generate))) + '%'
            print(string, end="\r")

        try:
            lw = nn[i_word]
            v = np.zeros([dims], dtype=float)
            for word_nn in lw:
                v += emb.word_to_vector(word_nn)

        except KeyError as r:
            raise ValueError('Something went wrong in the word generation process')

        new_vectors.append(v/args.num_nearest_neighbor)

    print()


    printTrace('===> Printing to file <===')

    with open(args.output,'w') as file:

        print(str(len(emb.words)+len(vocab_to_generate)) + ' ' + str(dims),file=file)

        for w in emb.words:
            print(w + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(w)]), file=file)

        for w_i, w in enumerate(vocab_to_generate):
            print(w + ' ' + ' '.join(['%.6g' % x for x in new_vectors[w_i]]), file=file)
Example #12
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)

    #parser.add_argument('-n', '--name_of_embedding', default=None)
    parser.add_argument('-l', '--lowercase_dataset', action='store_true')
    parser.add_argument('-lg', '--language', nargs='+', default=['en'])

    parser.add_argument('-p', '--add_lang_prefix', action='store_true')

    parser.add_argument('-v', '--vocab', type=str, default=None)

    args = parser.parse_args()
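    # Evaluate each embedding on the word-similarity datasets of every requested
    # language, once dropping OOV pairs and once backing off to the mean of all word
    # vectors; results are exported as CSV files under Results_<lang>/.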

    emb_list = []

    if args.embedding is not None:
        emb_list.append(args.embedding)
    else:
        emb_list = [
            os.path.join(args.directory, f) for f in os.listdir(args.directory)
            if os.path.isfile(os.path.join(args.directory, f))
        ]

    for emb_i, emb_path in enumerate(emb_list):

        printTrace('Evaluating Embedding ' + str(emb_i + 1) + ' of ' +
                   str(len(emb_list)) + ' : ' + str(emb_path))

        emb = load_embedding(emb_path,
                             vocabulary=(None if args.vocab is None else
                                         vocab_from_path(args.vocab)),
                             lower=False,
                             length_normalize=False,
                             delete_duplicates=True)

        for lang in args.language:

            lang1prefix = None
            lang2prefix = None

            if args.add_lang_prefix:
                if lang == 'en':
                    lang1prefix = 'en'
                    lang2prefix = 'en'
                elif lang == 'es':
                    lang1prefix = 'es'
                    lang2prefix = 'es'
                elif lang == 'enes':
                    lang1prefix = 'en'
                    lang2prefix = 'es'
                else:
                    logging.warning(
                        'Language not supported, could not add prefix')

            if not os.path.exists('Results_' + lang):
                os.makedirs('Results_' + lang)

            print('>>> Results deleting oov <<< ')

            a, b = results_to_csv(evaluate_on_all(
                emb,
                backoff_vector=None,
                lowercase_dataset=args.lowercase_dataset,
                lang=lang,
                lang1prefix=lang1prefix,
                lang2prefix=lang2prefix),
                                  printRes=False,
                                  returnRes=True)
            export_to_csv(
                txtResults=a,
                txtCov=b,
                name=emb_path,
                filenameResults='Results_' + lang + '/Sim_Results_delete.csv',
                filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

            print('>>> Result using mean of all word vectors as OOV <<<')

            a, b = results_to_csv(evaluate_on_all(
                emb,
                backoff_vector=np.mean(emb.vectors, axis=0),
                lowercase_dataset=args.lowercase_dataset,
                lang=lang,
                lang1prefix=lang1prefix,
                lang2prefix=lang2prefix),
                                  printRes=False,
                                  returnRes=True)
            export_to_csv(
                txtResults=a,
                txtCov=b,
                name=emb_path,
                filenameResults='Results_' + lang + '/Sim_Results_mean.csv',
                filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

    print('Results have been exported in csv format to the Results folder')
def concatenate_embeddings_generate(embeddings_path,
                                    out_path,
                                    vocab=None,
                                    batch_size=1024,
                                    k=10):
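    # Build a concatenation meta embedding: every word of the final vocabulary gets
    # one block of dimensions per input embedding, pasted together column-wise. A
    # word missing from some embedding is filled in with the normalized average of
    # the vectors of its k nearest neighbours, the neighbours being chosen in
    # another embedding that does contain the word.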
    printTrace("Reading vocab...")

    # [[vocab_emb1], [vocab_emb_2], ...]
    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    word_id = set()

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        for w in word_id - union:
            print("Word " + str(w) + " not found in any embedding")
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")

    for i_voc, voc in enumerate(vocab_embeddings):
        print("Embedding " + str(i_voc) + " has " + str(len(voc)) + " words.")
        print("We will generate " + str(len(set(word_id) - voc)) +
              " words for the embedding " + str(i_voc))

    print()

    printTrace("Building matrix for word generation...")
    generation_vocab_matrix = [[x for x in range(len(embeddings_path))]
                               for x in range(len(embeddings_path))]
    nn_vocab = [defaultdict() for x in range(len(embeddings_path))]
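    # generation_vocab_matrix[y][x] will hold the words missing from embedding x
    # whose nearest neighbours are computed in embedding y's space; each missing
    # word is assigned to the first embedding (in input order) that contains it.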

    for x, emb1 in enumerate(vocab_embeddings):
        vocab_to_generate = set(word_id) - emb1
        for y, emb2 in enumerate(vocab_embeddings):
            generation_vocab_matrix[y][x] = list(
                vocab_to_generate.intersection(emb2))
            vocab_to_generate = vocab_to_generate - emb2

    printTrace("===> Calculating nearest neighbors <===")

    for i_emb_path, emb_path in enumerate(embeddings_path):

        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        for i_g, g in enumerate(generation_vocab_matrix[i_emb_path]):
            if len(g) > 0:
                # print('G: ' + str(g))
                m = emb.words_to_matrix(
                    g)  # generation_vocab_matrix[i_emb_path][i_g])

                # print(len(m))
                # print(generation_vocab_matrix[x][gi])

                interset_vocab = list(
                    set.intersection(vocab_embeddings[i_emb_path],
                                     vocab_embeddings[i_g]))

                M = emb.words_to_matrix(interset_vocab)

                total_words = len(m)

                for i_batch, mb in enumerate(batch(m, batch_size)):

                    string = (
                        "<" + str(datetime.datetime.now()) + ">  " +
                        "Using Embedding " + str(i_emb_path) +
                        " to generate vocab for Embedding " + str(i_g) +
                        ":  " +
                        str(int(100 *
                                (i_batch * batch_size) / total_words)) + "%")
                    print(string, end="\r")

                    result = cosine_knn(mb, M, k)
                    for i_result, indexes in enumerate(result):
                        nn_vocab[i_g][g[i_result + (batch_size * i_batch)]] = [
                            interset_vocab[i] for i in indexes
                        ]

                print()

    printTrace("===> Calculating meta embedding <===")

    total_words = len(word_id)
    first_emb = True

    if not os.path.exists("tmp"):
        os.makedirs("tmp")

    total_dims = 0

    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))

        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        total_dims += emb.dims

        string = "<" + str(
            datetime.datetime.now()) + ">  " + "Embedding " + str(x)
        print(string, end="\r")

        actual_matrix = []

        for wi, w in enumerate(word_id):
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError as r:
                try:
                    lw = nn_vocab[x][w]
                    v = np.zeros([emb.dims], dtype=float)
                    for word in lw:
                        v += emb.word_to_vector(word)

                except KeyError as r:
                    raise ValueError(
                        "Something went wrong in the word generation process")

                m = normalize_vector(v / k)

            matrix.append(m)

            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " +
                          "Calculating meta embeddind for embedding " +
                          str(x) + ": " + str(int(100 * wi / total_words)) +
                          "%")
                print(string, end="\r")
        print()

        with open("tmp/" + str(x), "w") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)

                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> " +
                              "Saving embedding " + str(x) + " to file : " +
                              str(int(100 * wi / total_words)) + "%")
                    print(string, end="\r")

            print()

        first_emb = False

    printTrace("Concatenation...")

    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) +
                 "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        os.system("rm -rf tmp")
    except:
        print("Could not delete the tmp folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))
def concatenate_embeddings(
    embeddings_path,
    out_path,
    vocab,
):
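    # Same concatenation as concatenate_embeddings_generate, but without OOV
    # generation: a word missing from some embedding simply contributes a zero
    # vector for that embedding's block.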
    printTrace("===> Calculating meta embedding (No OOV) <===")

    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        for w in word_id - union:
            print("Word " + str(w) + " not found in any embedding")
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")

    first_emb = True

    if not os.path.exists("tmp_conc"):
        os.makedirs("tmp_conc")

    total_dims = 0

    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))

        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        total_dims += emb.dims

        string = "<" + str(
            datetime.datetime.now()) + ">  " + "Embedding " + str(x)
        print(string, end="\r")

        for wi, w in enumerate(word_id):
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError as r:
                pass

            matrix.append(m)

            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " +
                          "Calculating meta embeddind for embedding " +
                          str(x) + ": " + str(int(100 * wi / len(word_id))) +
                          "%")
                print(string, end="\r")
        print()

        with open("tmp_conc/" + str(x), "w+", encoding="utf-8") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)

                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> " +
                              "Saving embedding " + str(x) + " to file : " +
                              str(int(100 * wi / len(word_id))) + "%")
                    print(string, end="\r")

            print()

        first_emb = False

    printTrace("Concatenation...")

    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp_conc/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) +
                 "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        shutil.rmtree("tmp_conc")
    except OSError:
        print("Could not delete the tmp_conc folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))