Code example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-m', '--method', choices=['PCA', 'tSVD', 'DRA'], default='DRA')
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-n', '--reduce_to', type=int, default=300)
    parser.add_argument('-b', '--do_in_batches', action='store_true')
    parser.add_argument('-nb', '--batch_size', type=int, default=1024)
    args = parser.parse_args()

    emb = load_embedding(args.embedding, lower=False, length_normalize=False, delete_duplicates=True)

    if args.method == 'PCA':
        if args.do_in_batches:
            emb.vectors = PPA_batches(emb.vectors, args.reduce_to, args.batch_size)
        else:
            emb.vectors = PCA(emb.vectors, args.reduce_to)

    elif args.method == 'tSVD':
        emb.vectors = T_SVD(emb.vectors, args.reduce_to)

    elif args.method == 'DRA':
        if args.do_in_batches:
            emb.vectors = DRA_batches(emb.vectors, args.reduce_to, args.batch_size)
        else:
            emb.vectors = DRA(emb.vectors, args.reduce_to)

    else:
        raise ValueError(str(args.method) + ' reduction method not supported. Supported reduction methods: PCA, tSVD, DRA')

    emb.export(args.output)
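PCA, T_SVD, DRA and their *_batches variants are project helpers that are not shown in this snippet. Purely as an illustration, a minimal sketch of what PCA and T_SVD might look like with scikit-learn (names, signatures and behaviour here are assumptions, not the project's actual code):

import numpy as np
from sklearn.decomposition import PCA as SkPCA, TruncatedSVD

def PCA(vectors, reduce_to):
    # Keep the first `reduce_to` principal components of the (centered) vectors.
    return SkPCA(n_components=reduce_to).fit_transform(np.asarray(vectors))

def T_SVD(vectors, reduce_to):
    # Truncated SVD reduces dimensionality without centering the matrix.
    return TruncatedSVD(n_components=reduce_to).fit_transform(np.asarray(vectors))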
Code example #2
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')
    format = 'bin' if path_input.split('/')[-1].split('.')[-1] == 'bin' else 'text'

    emb = load_embedding(path_input, format=format,
                             vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
                             length_normalize=args.length_normalize,
                             normalize_dimensionwise=args.normalize_dimensionwise, to_unicode=True,
                             lower=args.lower, path2='', delete_duplicates=True, method_vgg="delete")

    printTrace('Saving result to ' + str(path_output) + '...')

    num_words = 0
    with open(path_output, 'w+') as file:
        for i_word, word in enumerate(emb.words):

            if i_word % 5000 == 0:
                string = "<" + str(datetime.datetime.now()) + ">  " + 'Converting : ' + str(
                    int(100 * i_word / len(emb.words))) + '%'
                print(string, end="\r")
            if args.language is None or any(l in word.split(args.delimiter) for l in args.language):
                print(word.split(args.delimiter)[-1] + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]), file=file)
                num_words += 1

    print()

    if args.word2vec:
        excec_com = 'sed -i \'1s/^/' + str(num_words) + ' ' + str(emb.dims) + '\\n/\' ' + str(path_output)
        print(excec_com)
        os.system(excec_com)

    printTrace('Done.')
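printTrace is used throughout these examples but never defined in them. Judging from the timestamped progress strings built elsewhere in the snippets, it is presumably something along the lines of the following sketch (an assumption, not the project's actual implementation):

import datetime

def printTrace(message):
    # Assumed helper: print a message prefixed with the current timestamp.
    print('<' + str(datetime.datetime.now()) + '>  ' + str(message))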
Code example #3
def kmeans4embedding(embedding_path, output_path, k, metric, batch_size):
    printTrace('Loading embedding ' + str(embedding_path))

    emb = load_embedding(embedding_path, lower=False, length_normalize=False, delete_duplicates=True)

    printTrace('Clustering for embedding ' + str(embedding_path))

    labels = doKmeans(emb.vectors, k, metric, batch_size)

    printTrace('Printing clusters for embedding ' + str(embedding_path))

    with open(output_path, 'w') as file:
        for i_label, label in enumerate(labels):
            print(emb.vocabulary.index_to_word(i_label) + ' ' + str(label), file=file)

    printTrace('Sorting clusters for embedding ' + str(embedding_path))

    excec_com = 'sort -k2 -n ' + str(output_path) + ' > ' + str(output_path) + '_sorted'
    print(excec_com)
    os.system(excec_com)
    excec_com = 'rm ' + str(output_path)
    print(excec_com)
    os.system(excec_com)
    excec_com = 'mv ' + str(output_path) + '_sorted ' + str(output_path)
    print(excec_com)
    os.system(excec_com)

    printTrace('Done, clusters saved in ' + str(output_path))
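doKmeans is another project helper that the snippet does not show. A minimal sketch with scikit-learn's MiniBatchKMeans, assuming the 'cosine' metric is approximated by length-normalizing the vectors before clustering:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

def doKmeans(vectors, k, metric, batch_size):
    X = np.asarray(vectors, dtype=np.float32)
    if metric == 'cosine':
        # Euclidean k-means on unit-length vectors is a common stand-in for cosine distance.
        X = X / np.linalg.norm(X, axis=1, keepdims=True)
    return MiniBatchKMeans(n_clusters=k, batch_size=batch_size).fit_predict(X)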
Code example #4
    def __init__(
        self,
        embedding_path='/home/iker/Documents/QuestionCluster/TechEmbeddings/embeddings_lower.vec'
    ):
        self.questions = []
        self.questions_normalized = []
        self.questions_vectors = []
        self.keywords = collections.defaultdict()
        self.embedding = load_embedding(embedding_path)
Code example #5
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)

    parser.add_argument('-b', '--batch_size', type=int, default=512)
    parser.add_argument('-dic',
                        '--dictionary_path',
                        type=str,
                        default='DictionaryInductionDataset/es-en.test')
    parser.add_argument('-p', '--add_lang_prefix', action='store_true')

    args = parser.parse_args()

    emb_list = []
    if args.embedding is not None:
        emb_list.append(args.embedding)
    else:
        emb_list = [
            os.path.join(args.directory, f) for f in os.listdir(args.directory)
            if os.path.isfile(os.path.join(args.directory, f))
        ]

    if not os.path.exists('Results'):
        os.makedirs('Results')

    for emb_i, emb_path in enumerate(emb_list):

        printTrace('Evaluating Embedding ' + str(emb_i + 1) + ' of ' +
                   str(len(emb_list)) + ' : ' + str(emb_path))
        emb = load_embedding(emb_path,
                             lower=False,
                             length_normalize=True,
                             delete_duplicates=True)

        top1, top2, top3, top5, top10, coverage = evaluate_dictionary_induction(
            emb, args.dictionary_path, args.batch_size, emb_path,
            args.add_lang_prefix)

        with open('Results/dictionary_induction', 'a+') as file:
            print(','.join([
                str(emb_path),
                str(top1),
                str(top2),
                str(top3),
                str(top5),
                str(top10),
                str(coverage)
            ]),
                  file=file)

    print('Results have been exported in csv format to the Results folder')
Code example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-n', '--name_of_embedding', default=None)
    parser.add_argument('-l', '--lowercase_dataset', action='store_true')
    parser.add_argument('-d',
                        '--dataset_path',
                        default='AnalogyDataset/questions-words.txt')
    parser.add_argument('-b', '--batch_size', type=int, default=512)

    args = parser.parse_args()

    print(' >>> loading embedding <<< ')
    emb = load_embedding(args.embedding,
                         lower=False,
                         length_normalize=True,
                         delete_duplicates=True)
    name = args.embedding if args.name_of_embedding is None else args.name_of_embedding

    if not os.path.exists('Results'):
        os.makedirs('Results')

    print('>>> Results deleting oov <<< ')

    results = evaluate_analogy(emb.vocabulary.word_id,
                               emb.vectors,
                               dataset_path=args.dataset_path,
                               lowercase=args.lowercase_dataset,
                               BATCH_SIZE=args.batch_size)
    print_to_csv_analogy(results,
                         name=name,
                         filenameResults='Results/Analogy_Results_delete.csv')

    print()
    print()
    print('>>> Result using mean of all word vectors as OOV <<<')

    #emb.vocabulary.word_id['<<UKN>>'] = len(emb.words)
    emb.vectors = np.append(emb.vectors, [np.mean(emb.vectors, axis=0)],
                            axis=0)

    results = evaluate_analogy(emb.vocabulary.word_id,
                               emb.vectors,
                               dataset_path=args.dataset_path,
                               lowercase=args.lowercase_dataset,
                               BATCH_SIZE=args.batch_size,
                               backoff_vector=len(emb.vectors) - 1)
    print_to_csv_analogy(results,
                         name=name,
                         filenameResults='Results/Analogy_Results_mean.csv')

    print('Results have been exported in csv format to the Results folder')
Code example #7
def benchmark():
    global device

    get_files()
    print("Running Benchmark..")
    time = datetime.datetime.now()

    emb = load_embedding('RWSGwn.emb',
                         length_normalize=False,
                         delete_duplicates=True)
    time = print_time('Loading embedding from Disk to RAM step', time)

    emb.length_normalize()
    time = print_time(
        'Embedding length normalization step (' + CPUcolor + 'CPU' +
        RESETcolor + ')', time)

    vocab_to_search = emb.words
    for i in range(100):
        for word in vocab_to_search:
            v = emb.word_to_vector(word)
    time = print_time(
        'Searching for vocabulary step (' + CPUcolor + 'CPU' + RESETcolor +
        ')', time)

    m = emb.vectors
    M = emb.vectors

    for i_batch, mb in enumerate(batch(m, batch_size)):
        _ = matrix_dot(mb, M)

    time = print_time(
        'Matrix dot product step ' +
        ('(' + CPUcolor + 'CPU' + RESETcolor + ')' if device == 'CPU' else
         '(' + GPUcolor + 'GPU' + RESETcolor + ')'), time)

    for i_batch, mb in enumerate(batch(m, batch_size)):
        _ = cosine_knn(mb, M, 10)

    time = print_time(
        'Searching for nearest neighbors step ' +
        ('(' + CPUcolor + 'CPU' + RESETcolor + ')' if device == 'CPU' else
         '(' + GPUcolor + 'GPU' + RESETcolor + ')'), time)

    emb.export('temp.emb')
    time = print_time('Exporting embedding from RAM to Disk step', time)

    os.remove("temp.emb")
    print()
    print("Benchmark is over.")
Code example #8
    def __init__(self, dictname, wordvectdim):

        print('Loading ' + dictname + '...(This might take one or two minutes.)')
        self.wordtoindex   = dict()
        self.indextovector = []
        self.indextovector.append(np.zeros(wordvectdim))

        emb = load_embedding(dictname, length_normalize=False, to_unicode=False, lower=False, delete_duplicates=True)

        for i_word, word in enumerate(emb.words):
            self.wordtoindex.update([(word, i_word+1)])
            self.indextovector.append(emb.word_to_vector(word))

        del emb
        #print(self.indextovector.shape)
        self.indextovector = np.array(self.indextovector, dtype='float32')
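Index 0 is reserved for an all-zero vector, so unknown words fall back to it. A hypothetical use of the resulting lookup tables (the instance name and input are illustrative only):

# lookup = SomeVocabClass('embeddings.vec', 300)   # hypothetical instantiation
tokens = 'the quick brown fox'.split()
indices = [lookup.wordtoindex.get(tok, 0) for tok in tokens]   # 0 = padding / unknown
vectors = lookup.indextovector[indices]                        # shape: (len(tokens), wordvectdim)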
Code example #9
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-l', '--search_words', required=True)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    parser.add_argument('-k', '--num_nearest_neighbor', type=int, default=10)

    args = parser.parse_args()

    emb = load_embedding(args.embedding,
                         vocabulary=None,
                         lower=False,
                         length_normalize=True,
                         normalize_dimensionwise=False,
                         delete_duplicates=True)

    words_2_search = vocab_from_path(args.search_words)

    m = emb.words_to_matrix(words_2_search)
    M = emb.words_to_matrix(emb.words)

    nn = []

    for i_batch, mb in enumerate(batch(m, args.batch_size)):

        string = "<" + str(
            datetime.datetime.now()) + ">  " + 'Calculating nn words  ' + str(
                int(100 * (i_batch * args.batch_size) / len(m))) + '%'
        print(string, end="\r")

        result = cosine_knn(mb, M, args.num_nearest_neighbor)

        for i_result, indexes in enumerate(result):
            nn.append(["\"" + emb.words[i] + "\"" for i in indexes])

    with open(args.output, 'w+', encoding='utf-8') as file:
        for word, nns in zip(words_2_search, nn):
            print(word + ': ' + ' '.join(nns), file=file)
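batch and cosine_knn come from the project's utilities and appear in several of these examples. Since the embedding is length-normalized on load, cosine similarity reduces to a dot product, so a rough sketch might be (an assumption, not the actual implementation):

import numpy as np

def batch(iterable, n):
    # Yield successive chunks of size n.
    for i in range(0, len(iterable), n):
        yield iterable[i:i + n]

def cosine_knn(mb, M, k):
    # Rows of mb and M are unit length, so the dot product equals the cosine similarity.
    sims = np.dot(np.asarray(mb), np.asarray(M).T)
    # Indices of the k most similar rows of M for each query row, best first.
    return np.argsort(-sims, axis=1)[:, :k]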
Code example #10
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')
    format = 'bin' if path_input.split('/')[-1].split(
        '.')[-1] == 'bin' else 'text'

    emb = load_embedding(
        path_input,
        format=format,
        vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
        length_normalize=args.length_normalize,
        normalize_dimensionwise=args.normalize_dimensionwise,
        to_unicode=True,
        lower=args.lower,
        path2='',
        delete_duplicates=True,
        method_vgg="delete")

    printTrace('Saving result to ' + str(path_output) + '...')
    emb.export(path=path_output, printHeader=args.word2vec)

    printTrace('Done.')
Code example #11
    print('Train: %d | Valid: %d | Test: %d' %
          (len(train), len(valid), len(test)))
    train_engine = DataLoader(data.DataEngine(train, vocabulary, pad_lens),
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=use_cuda)
    valid_engine = DataLoader(data.DataEngine(valid, vocabulary, pad_lens),
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=use_cuda)
    test_engine = data.DataEngine(test, vocabulary, pad_lens)

    if args.init_embedding:
        w2v = load_embedding(args.embedding_source, vocabulary.to_idx, 300)
    else:
        w2v = None
    fusion_net = FusionNet(vocab_size=len(vocabulary),
                           word_dim=300,
                           hidden_size=125,
                           rnn_layer=args.rnn_layer,
                           dropout=args.dropout,
                           pretrained_embedding=w2v)
    if use_cuda:
        fusion_net = fusion_net.cuda()

    criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adamax(fusion_net.parameters())
Code example #12
Joint_path = '../../Embeddings/'

print("====ENGLISH-SPANISH===")

words_eng = []
words_eng.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENES.emb'))
words_eng.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENIT.emb'))
english_words = list(set.intersection(*words_eng))

words_es = []
words_es.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENES.emb'))
words_es.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ESIT.emb'))
spanish_words = list(set.intersection(*words_es))

emb = load_embedding(Joint_path + 'JOINTC-HYB-ENES.emb',
                     length_normalize=False,
                     delete_duplicates=True)

with open('../../Embeddings/separated/JointENES.vec', 'w') as file:

    print(str(len(spanish_words) + len(english_words)) + ' 300', file=file)

    for word in english_words:
        print('en/' + word + ' ' +
              ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]),
              file=file)
    for word in spanish_words:
        print('es/' + word + ' ' +
              ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]),
              file=file)
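vocab_from_path is used here (and in several other examples) to read just the vocabulary of a word2vec-style text file, and its return value is treated as a set. A minimal sketch under that assumption:

def vocab_from_path(path):
    # Assumed helper: collect the first token of every non-empty line.
    # A real implementation would also skip the "num_words dims" header of word2vec files.
    with open(path, 'r', encoding='utf-8') as f:
        return set(line.split(' ', 1)[0] for line in f if line.strip())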
Code example #13
File: sentinet.py  Project: JayChzp/SentiNet
    # run the test for SemEval2016
    if args.semeval2016 or args.all:
        lr = 1e-3
        weight_decay = 0
        embed_lr = 1e-2
        embed_weight_decay = 1e-4

        data = dataset.load.SemEval(year="2016",
                                    leave_dev=True,
                                    fraction=0.1,
                                    seed=args.seed)
        print("number of train data:{}".format(len(data['train'][0])))
        print("number of dev data:{}".format(len(data['dev'][0])))
        print("number of test data:{}".format(len(data['test'][0])))
        word_list = dataset.load.get_word_list(data['train'][0])
        embedding_matrix, token_to_idx, idx_to_token = embedding.load_embedding(
            args.embedding_path, word_list, isBinary=args.isBinary)

        obj_value, result = run_exp(data,
                                    embedding_matrix,
                                    token_to_idx,
                                    seed=args.seed,
                                    weight_decay=weight_decay,
                                    lr=lr,
                                    max_len=args.max_len,
                                    batch_size=50,
                                    obj='loss',
                                    embed_lr=embed_lr,
                                    epoches=100,
                                    embed_weight_decay=embed_weight_decay)

        print("{}\t{}\t{}".format(obj_value, result['macro_f1'],
Code example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--embedding", type=str, required=True)
    parser.add_argument("-c", "--emb_4_generation", type=str, required=True)
    parser.add_argument("-d", "--dataset", type=str, required=True)
    parser.add_argument("-b", "--batch_size", type=int, default=1024)
    parser.add_argument("-k", "--num_nearest_neighbor", type=int, default=10)

    args = parser.parse_args()

    dims = get_dimensions(args.embedding)

    if dims != get_dimensions(args.emb_4_generation):
        raise ValueError(
            "All the embeddings must have the same number of dimensions and the embeddings must be in the word2vec format"
        )

    printTrace("Reading vocab...")

    vocab_emb = vocab_from_path(args.embedding)
    vocab_cross = vocab_from_path(args.emb_4_generation)
    dataset = get_dataset(args.dataset)
    vocab_to_generate = list(
        set(np.append((dataset.X[:, 0]), (dataset.X[:, 1]))))
    vocab_to_generate_set = set(vocab_to_generate)
    vocab_emb_delete = [x for x in vocab_emb if x not in vocab_to_generate_set]

    total_vocab = set.union(set(vocab_emb_delete), set(vocab_cross))
    interset_vocab = list(
        set.intersection(set(vocab_emb_delete), set(vocab_cross)))

    print("Final embedding will have " + str(len(total_vocab)) + " words")
    print("We will generate " + str(len(vocab_to_generate)) + " words")

    emb = load_embedding(
        args.emb_4_generation,
        vocabulary=None,
        lower=False,
        length_normalize=True,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    m = emb.words_to_matrix(vocab_to_generate)
    M = emb.words_to_matrix(interset_vocab)

    nn = []

    for i_batch, mb in enumerate(batch(m, args.batch_size)):

        string = ("<" + str(datetime.datetime.now()) + ">  " +
                  "Using Embedding " + str(args.emb_4_generation) +
                  " to generate vocab for Embedding " + str(args.embedding) +
                  ":  " + str(int(100 *
                                  (i_batch * args.batch_size) / len(m))) + "%")
        print(string, end="\r")

        # print(np.asarray(mb).shape)
        # print(np.asarray(M).shape)

        result = cosine_knn(mb, M, args.num_nearest_neighbor)

        for i_result, indexes in enumerate(result):
            nn.append([interset_vocab[i] for i in indexes])

    del emb

    printTrace("===> Generating new_vocab <===")

    emb = load_embedding(
        args.embedding,
        vocabulary=vocab_emb_delete,
        lower=False,
        length_normalize=False,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    new_vectors = []
    for i_word, word in enumerate(vocab_to_generate):
        if i_word % 1000 == 0:
            string = ("<" + str(datetime.datetime.now()) + ">  " +
                      "Generating vocab " + ": " +
                      str(int(100 * i_word / len(vocab_to_generate))) + "%")
            print(string, end="\r")

        try:
            lw = nn[i_word]
            v = np.zeros([dims], dtype=float)
            for word_nn in lw:
                v += emb.word_to_vector(word_nn)

        except KeyError as r:
            raise ValueError(
                "Something went wrong in the word generation process")

        new_vectors.append(v / args.num_nearest_neighbor)

    print()

    del emb

    printTrace("===> Loading embeddings to compare <===")
    emb_generated = Embedding(vocabulary=Vocabulary(vocab_to_generate),
                              vectors=new_vectors)
    emb_original = load_embedding(
        args.embedding,
        vocabulary=vocab_to_generate,
        lower=False,
        length_normalize=False,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    printTrace("===> Evaluate <===")

    print("Original Embedding: ", end="")
    print(
        similarity_emd(
            emb_original,
            dataset.X,
            dataset.y,
            backoff_vector=None,
            lower=False,
            lang1prefix=None,
            lang2prefix=None,
        ))
    print("Generated Embedding: ", end="")
    print(
        similarity_emd(
            emb_generated,
            dataset.X,
            dataset.y,
            backoff_vector=None,
            lower=False,
            lang1prefix=None,
            lang2prefix=None,
        ))
Code example #15
File: main.py  Project: lihebi/scratch
def main():
    """Steps:
    1. preprocess data:
      - tokenization (sentence tokenizer)
      - separate article and reference summary
      - chunk into train and test

    2. data generation: for each reference summary, do the following
    mutation operations: deletion, insertion, mutation. According to
    how much are changed, assign a score.
    
    3. sentence embedding: embed article and summary into sentence
    vectors. This is the first layer, the embedding layer. Then, do a
    padding to get the vector to the same and fixed dimension
    (e.g. summary 20, article 100). FIXME what to do for very long
    article? Then, fully connected layer directly to the final result.

    """

    # data v1
    (x_train, y_train), (x_val, y_val) = prepare_data()
    # data v2
    articles, summaries, scores = load_text_data()
    # this is pretty time consuming, so save it
    tokenizer = prepare_tokenizer(articles + summaries)
    # alternatively, save and load. Note that you must ensure to fit
    # on the same text.
    save_tokenizer(tokenizer)
    tokenizer = load_tokenizer()
    # this is also slow
    (x_train, y_train), (x_val,
                         y_val) = prepare_data_v2(articles, summaries, scores,
                                                  tokenizer)
    # save and load the data
    # save_data(x_train, y_train, x_val, y_val)
    # (x_train, y_train), (x_val, y_val) = load_data()

    x_train.shape
    y_train.shape
    x_val.shape
    y_val.shape
    # model v1
    model = build_model()
    # model v2
    embedding_layer = load_embedding(tokenizer)
    model = build_glove_model(embedding_layer)

    # training op
    optimizer = tf.train.RMSPropOptimizer(0.001)
    # optimizer=tf.train.AdamOptimizer(0.01)
    model.compile(
        optimizer=optimizer,
        # loss='binary_crossentropy',
        loss='mse',
        # metrics=['accuracy']
        metrics=['mae'])
    model.fit(x_train,
              y_train,
              epochs=40,
              batch_size=128,
              validation_data=(x_val, y_val),
              verbose=1)
    model.summary()
Code example #16
def main():
    '''
    Main function that coordinates the entire process. Parses arguments that specify the exercise and the
    experiment that should be run. Initializes the model and the checkpoint managers.
    '''

    parser = argparse.ArgumentParser(
        description='Define configuration of experiments')
    parser.add_argument('--mode',
                        type=str,
                        nargs='+',
                        choices=['train', 'evaluate', 'generate'],
                        required=True)
    parser.add_argument('--experiment',
                        type=str,
                        choices=['a', 'b', 'c'],
                        required=True)
    parser.add_argument('--id', type=str, required=False)
    parser.add_argument('--epochs', type=int, default=EPOCHS, required=False)

    args = parser.parse_args()

    # Setting Experiment Id
    if args.id is None:
        exp_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        print(f"No Experiment Id Set, Creating New: {exp_id}")
    else:
        exp_id = args.id
        print(f"Using Experiment Id: {exp_id}")

    # Setting Directories
    base_dir = f"{OUTPUT_DIR}/exp_{args.experiment}/{exp_id}"
    log_dir = f"{base_dir}/logs"
    submission_dir = f"{base_dir}/submissions"
    if not os.path.exists(submission_dir):
        os.makedirs(submission_dir)
    ckpt_dir = f"{base_dir}/ckpts"

    print(f"Experiment Directory: {base_dir}")

    print(f"Using Tensorflow Version: {tf.__version__}")
    print("Building Vocabulary...")
    build_vocab(input_file=PATH_TRAIN,
                output_file=PATH_VOCAB,
                top_k=VOCAB_SIZE,
                special=SPECIAL)
    word2id, id2word = build_vocab_lookup(PATH_VOCAB, "<unk>")

    # Setting Experiment Specific Configurations
    if args.experiment == 'a':
        lstm_hidden_state_size = 512
        word_embeddings = None

    elif args.experiment == 'b':
        lstm_hidden_state_size = 512
        word_embeddings = load_embedding(dim_embedding=EMBEDDING_SIZE,
                                         vocab_size=VOCAB_SIZE)

    elif args.experiment == 'c':
        lstm_hidden_state_size = 1024
        word_embeddings = load_embedding(dim_embedding=EMBEDDING_SIZE,
                                         vocab_size=VOCAB_SIZE)
    else:
        raise ValueError(f"Unknown Experiment {args.experiment}")

    print(f'Initializing Model...')
    model = LanguageModel(vocab_size=VOCAB_SIZE,
                          sentence_length=SENTENCE_LENGTH,
                          embedding_size=EMBEDDING_SIZE,
                          hidden_state_size=lstm_hidden_state_size,
                          output_size=LSTM_OUTPUT_SIZE,
                          batch_size=BATCH_SIZE,
                          word_embeddings=word_embeddings,
                          index_to_word_table=id2word)

    print(f'Initializing Optimizer...')
    optimizer = tf.keras.optimizers.Adam()

    ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                               optimizer=optimizer,
                               net=model)
    manager = tf.train.CheckpointManager(ckpt, ckpt_dir, max_to_keep=5)

    if manager.latest_checkpoint:
        print(f"Restoring Model from {manager.latest_checkpoint}...")
        ckpt.restore(manager.latest_checkpoint)
        model_loaded = True
    else:
        print("Initializing Model from Scratch")
        model_loaded = False

    if "train" in args.mode:
        print(f"Starting Training...")
        train_summary_writer = tf.summary.create_file_writer(
            f"{log_dir}/train")
        with train_summary_writer.as_default():
            train(ckpt=ckpt,
                  manager=manager,
                  model=model,
                  optimizer=optimizer,
                  word2id=word2id,
                  id2word=id2word,
                  epochs=args.epochs)
        model_loaded = True

    if "evaluate" in args.mode:
        print(f"Starting Evaluation...")
        assert model_loaded, 'model must be loaded from checkpoint in order to be evaluated'

        test_summary_writer = tf.summary.create_file_writer(
            f"{log_dir}/evaluate")
        with test_summary_writer.as_default():
            evaluate(
                model=model,
                word2id=word2id,
                id2word=id2word,
                step=optimizer.iterations,
                path_submission=
                f"{submission_dir}/group35.perplexity{args.experiment.upper()}"
            )

    if "generate" in args.mode:
        print(f"Starting Generation...")
        assert model_loaded, 'model must be loaded from checkpoint in order to start generation'

        generate_summary_writer = tf.summary.create_file_writer(
            f"{log_dir}/generate")
        with generate_summary_writer.as_default():
            generate(word2id,
                     id2word,
                     model=model,
                     path_submission=f"{submission_dir}/group35.continuation")
Code example #17
def concatenate_embeddings_generate(embeddings_path,
                                    out_path,
                                    vocab=None,
                                    batch_size=1024,
                                    k=10):
    printTrace("Reading vocab...")

    # [[vocab_emb1], [vocab_emb_2], ...]
    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    word_id = set()

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        [
            print("Word " + str(w) + " not found in any embedding")
            for w in word_id - union
        ]
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")

    for i_voc, voc in enumerate(vocab_embeddings):
        print("Embedding " + str(i_voc) + " has " + str(len(voc)) + " words.")
        print("We will generate " + str(len(set(word_id) - voc)) +
              " words for the embedding " + str(i_voc))

    print()

    printTrace("Building matrix for word generation...")
    generation_vocab_matrix = [[x for x in range(len(embeddings_path))]
                               for x in range(len(embeddings_path))]
    nn_vocab = [defaultdict() for x in range(len(embeddings_path))]

    for x, emb1 in enumerate(vocab_embeddings):
        vocab_to_generate = set(word_id) - emb1
        for y, emb2 in enumerate(vocab_embeddings):
            generation_vocab_matrix[y][x] = list(
                vocab_to_generate.intersection(emb2))
            vocab_to_generate = vocab_to_generate - emb2

    printTrace("===> Calculating nearest neighbors <===")

    for i_emb_path, emb_path in enumerate(embeddings_path):

        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        for i_g, g in enumerate(generation_vocab_matrix[i_emb_path]):
            if len(g) > 0:
                # print('G: ' + str(g))
                m = emb.words_to_matrix(
                    g)  # generation_vocab_matrix[i_emb_path][i_g])

                # print(len(m))
                # print(generation_vocab_matrix[x][gi])

                interset_vocab = list(
                    set.intersection(vocab_embeddings[i_emb_path],
                                     vocab_embeddings[i_g]))

                M = emb.words_to_matrix(interset_vocab)

                total_words = len(m)

                for i_batch, mb in enumerate(batch(m, batch_size)):

                    string = (
                        "<" + str(datetime.datetime.now()) + ">  " +
                        "Using Embedding " + str(i_emb_path) +
                        " to generate vocab for Embedding " + str(i_g) +
                        ":  " +
                        str(int(100 *
                                (i_batch * batch_size) / total_words)) + "%")
                    print(string, end="\r")

                    result = cosine_knn(mb, M, k)
                    for i_result, indexes in enumerate(result):
                        nn_vocab[i_g][g[i_result + (batch_size * i_batch)]] = [
                            interset_vocab[i] for i in indexes
                        ]

                print()

    printTrace("===> Calculating meta embedding <===")

    total_words = len(word_id)
    first_emb = True

    if not os.path.exists("tmp"):
        os.makedirs("tmp")

    total_dims = 0

    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))

        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        total_dims += emb.dims

        string = "<" + str(
            datetime.datetime.now()) + ">  " + "Embedding " + str(x)
        print(string, end="\r")

        actual_matrix = []

        for wi, w in enumerate(word_id):
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError as r:
                try:
                    lw = nn_vocab[x][w]
                    v = np.zeros([emb.dims], dtype=float)
                    for word in lw:
                        v += emb.word_to_vector(word)

                except KeyError as r:
                    raise ValueError(
                        "Something went wrong in the word generation process")

                m = normalize_vector(v / k)

            matrix.append(m)

            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " +
                          "Calculating meta embeddind for embedding " +
                          str(x) + ": " + str(int(100 * wi / total_words)) +
                          "%")
                print(string, end="\r")
        print()

        with open("tmp/" + str(x), "w") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)

                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> " +
                              "Saving embedding " + str(x) + " to file : " +
                              str(int(100 * wi / total_words)) + "%")
                    print(string, end="\r")

            print()

        first_emb = False

    printTrace("Concatenation...")

    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) +
                 "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        os.system("rm -rf tmp")
    except:
        print("Could not delete the tmp folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))
Code example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-c', '--cross_embedding', required=True)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    parser.add_argument('-k', '--num_nearest_neighbor', type=int, default=10)

    args = parser.parse_args()

    dims = get_dimensions(args.embedding)

    if dims != get_dimensions(args.cross_embedding):
        raise ValueError('All the embeddings must have the same number of dimensions and the embeddings must be in the word2vec format')

    printTrace('Reading vocab...')

    vocab_emb = vocab_from_path(args.embedding)
    vocab_cross = vocab_from_path(args.cross_embedding)

    total_vocab = set.union(set(vocab_emb), set(vocab_cross))
    interset_vocab = list(set.intersection(set(vocab_emb), set(vocab_cross)))
    vocab_to_generate = set(vocab_cross) - set(vocab_emb)

    print('Final embedding will have ' + str(len(total_vocab)) + ' words')
    print('We will generate ' + str(len(vocab_to_generate)) + ' words')

    emb = load_embedding(args.cross_embedding, vocabulary=None, lower=False, length_normalize=True, normalize_dimensionwise=False,
                         delete_duplicates=True)

    m = emb.words_to_matrix(vocab_to_generate)

    M = emb.words_to_matrix(interset_vocab)

    nn = []

    for i_batch, mb in enumerate(batch(m, args.batch_size)):

        string = "<" + str(datetime.datetime.now()) + ">  " + 'Using Embedding ' + str(
            args.cross_embedding) + ' to generate vocab for Embedding ' + str(args.embedding) + ':  ' + str(
            int(100 * (i_batch * args.batch_size) / len(m))) + '%'
        print(string, end="\r")

        # print(np.asarray(mb).shape)
        # print(np.asarray(M).shape)

        result = cosine_knn(mb, M, args.num_nearest_neighbor)

        for i_result, indexes in enumerate(result):
            nn.append([interset_vocab[i] for i in indexes])

    del emb

    printTrace('===> Generating new_vocab <===')

    emb = load_embedding(args.embedding, vocabulary=None, lower=False, length_normalize=False, normalize_dimensionwise=False,
                         delete_duplicates=True)

    new_vectors = []
    for i_word, word in enumerate(vocab_to_generate):
        if i_word % 1000 == 0:
            string = "<" + str(datetime.datetime.now()) + ">  " + 'Generating vocab ' + args.output + ': ' + str(
                int(100 * i_word / len(vocab_to_generate))) + '%'
            print(string, end="\r")

        try:
            lw = nn[i_word]
            v = np.zeros([dims], dtype=float)
            for word_nn in lw:
                v += emb.word_to_vector(word_nn)

        except KeyError as r:
            raise ValueError('Something went wrong in the word generation process')

        new_vectors.append(v / args.num_nearest_neighbor)

    print()

    printTrace('===> Printing to file <===')

    with open(args.output, 'w') as file:

        print(str(len(emb.words) + len(vocab_to_generate)) + ' ' + str(dims), file=file)

        for w in emb.words:
            print(w + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(w)]), file=file)

        for w_i, w in enumerate(vocab_to_generate):
            print(w + ' ' + ' '.join(['%.6g' % x for x in new_vectors[w_i]]), file=file)
Code example #19
import sys
import argparse
sys.path.insert(0, '../')
from embedding import load_embedding

parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', required=True)
parser.add_argument('-o', '--output', required=True)
args = parser.parse_args()

emb = load_embedding(args.input,
                     length_normalize=False,
                     delete_duplicates=True)

with open(args.output, 'w+') as file:
    print(str(len(emb.words)) + ' ' + str(emb.dims), file=file)

    for word in emb.words:
        print(''.join(word.split('/')[1:]) + ' ' +
              ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]),
              file=file)
Code example #20
File: views.py  Project: eagalvez/ustat_backend
logging.basicConfig(format=fmt, level=lvl)
logging.debug("Logging started on %s for %s" %
              (logging.root.name, logging.getLevelName(lvl)))

# nltk.download('punkt')
# nltk.download('stopwords')

API_KEY = 'AIzaSyB-OHHc_t1PlT2TbiQY67-yUuAe7tqotfg'

ruta = os.path.join(os.path.abspath("."), "Consultas\\Embeddings")
ruta = os.path.join(ruta, "esTech_enTech_1.vec")
# EMBEDDING_PATH = '.\Embeddings\esTech_enTech_1.vec'
EMBEDDING_PATH = ruta

embedding = load_embedding(EMBEDDING_PATH)
QM2 = Question_Manager(embedding)

K_CLUSTERS = 3

MAX_COMMENTS = 50


def transformarListaComentarios_Json(ListaDeListas):
    ListaFinal = []
    TemasNumeros = list(range(1, len(ListaDeListas) + 1))
    for indice, Lista in enumerate(ListaDeListas):
        Dicc = {}
        data = []
        Dicc["topic name"] = "Tema #" + str(TemasNumeros[indice])
        for i, element in enumerate(Lista):
Code example #21
import sys
import argparse
sys.path.insert(0, '../')
from embedding import load_embedding

parser = argparse.ArgumentParser()
parser.add_argument('-i', '--embeddings', nargs='+', required=True)
parser.add_argument('-o', '--output', required=True)
args = parser.parse_args()

embs = []
total_words = 0
dims = 0
for emb in args.embeddings:
    embs.append(
        load_embedding(emb, length_normalize=False, delete_duplicates=True))

for emb in embs:
    total_words += len(emb.words)

dims = embs[0].dims

for emb in embs:
    if emb.dims != dims:
        raise ValueError(
            'All the embeddings must have the same number of dimensions and the embeddings must be in the word2vec format'
        )

with open(args.output, 'w+', encoding='utf-8') as file:
    print(str(total_words) + ' ' + str(dims), file=file)
Code example #22
def concatenate_embeddings(
    embeddings_path,
    out_path,
    vocab,
):
    printTrace("===> Calculating meta embedding (No OOV) <===")

    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        [
            print("Word " + str(w) + " not found in any embedding")
            for w in word_id - union
        ]
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")

    first_emb = True

    if not os.path.exists("tmp_conc"):
        os.makedirs("tmp_conc")

    total_dims = 0

    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))

        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        total_dims += emb.dims

        string = "<" + str(
            datetime.datetime.now()) + ">  " + "Embedding " + str(x)
        print(string, end="\r")

        for wi, w in enumerate(word_id):
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError as r:
                pass

            matrix.append(m)

            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " +
                          "Calculating meta embeddind for embedding " +
                          str(x) + ": " + str(int(100 * wi / len(word_id))) +
                          "%")
                print(string, end="\r")
        print()

        with open("tmp_conc/" + str(x), "w+", encoding="utf-8") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)

                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> " +
                              "Saving embedding " + str(x) + " to file : " +
                              str(int(100 * wi / len(word_id))) + "%")
                    print(string, end="\r")

            print()

        first_emb = False

    printTrace("Concatenation...")

    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp_conc/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) +
                 "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        shutil.rmtree("tmp_conc")
    except OSError:
        print("Could not delete the tmp folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))
Code example #23
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)

    #parser.add_argument('-n', '--name_of_embedding', default=None)
    parser.add_argument('-l', '--lowercase_dataset', action='store_true')
    parser.add_argument('-lg', '--language', nargs='+', default=['en'])

    parser.add_argument('-p', '--add_lang_prefix', action='store_true')

    parser.add_argument('-v', '--vocab', type=str, default=None)

    args = parser.parse_args()

    emb_list = []

    if args.embedding is not None:
        emb_list.append(args.embedding)
    else:
        emb_list = [
            os.path.join(args.directory, f) for f in os.listdir(args.directory)
            if os.path.isfile(os.path.join(args.directory, f))
        ]

    for emb_i, emb_path in enumerate(emb_list):

        printTrace('Evaluating Embedding ' + str(emb_i + 1) + ' of ' +
                   str(len(emb_list)) + ' : ' + str(emb_path))

        emb = load_embedding(emb_path,
                             vocabulary=(None if args.vocab is None else
                                         vocab_from_path(args.vocab)),
                             lower=False,
                             length_normalize=False,
                             delete_duplicates=True)

        for lang in args.language:

            lang1prefix = None
            lang2prefix = None

            if args.add_lang_prefix:
                if lang == 'en':
                    lang1prefix = 'en'
                    lang2prefix = 'en'
                elif lang == 'es':
                    lang1prefix = 'es'
                    lang2prefix = 'es'
                elif lang == 'enes':
                    lang1prefix = 'en'
                    lang2prefix = 'es'
                else:
                    logging.warning(
                        'Language not supported, could not add prefix')

            if not os.path.exists('Results_' + lang):
                os.makedirs('Results_' + lang)

            print('>>> Results deleting oov <<< ')

            a, b = results_to_csv(evaluate_on_all(
                emb,
                backoff_vector=None,
                lowercase_dataset=args.lowercase_dataset,
                lang=lang,
                lang1prefix=lang1prefix,
                lang2prefix=lang2prefix),
                                  printRes=False,
                                  returnRes=True)
            export_to_csv(
                txtResults=a,
                txtCov=b,
                name=emb_path,
                filenameResults='Results_' + lang + '/Sim_Results_delete.csv',
                filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

            print('>>> Result using mean of all word vectors as OOV <<<')

            a, b = results_to_csv(evaluate_on_all(
                emb,
                backoff_vector=np.mean(emb.vectors, axis=0),
                lowercase_dataset=args.lowercase_dataset,
                lang=lang,
                lang1prefix=lang1prefix,
                lang2prefix=lang2prefix),
                                  printRes=False,
                                  returnRes=True)
            export_to_csv(
                txtResults=a,
                txtCov=b,
                name=emb_path,
                filenameResults='Results_' + lang + '/Sim_Results_mean.csv',
                filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

    print('Results have been exported in csv format to the Results folder')
Code example #24
import argparse
import os
import sys
sys.path.insert(0, '../')
from embedding import load_embedding

parser = argparse.ArgumentParser()

parser.add_argument('-i', '--input', required=True)
parser.add_argument('-o', '--output', required=True)
parser.add_argument('-p', '--prefix', required=True)
args = parser.parse_args()

emb = load_embedding(args.input,
                     vocabulary=None,
                     length_normalize=False,
                     normalize_dimensionwise=False, to_unicode=True,
                     lower=False, delete_duplicates=True)

n_words = 0

with open(args.output, 'w') as file:
    for word in emb.words:
        if word.split('/')[0] == args.prefix:
            print(''.join(word.split('/')[1:]) + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]), file=file)
            n_words += 1

excec_com = 'sed -i \'1s/^/' + str(n_words) + ' ' + str(emb.dims) + '\\n/\' ' + str(args.output)
print(excec_com)
os.system(excec_com)
Code example #25
def inference(sentence):
    print("input sentence:")
    print(sentence)
    sentences = []
    words = sentence.split(' ')
    sentences.append(words)
    sentences_embedding = embedding(sentences, batch_size,
                                    single_sentence_length)
    print("input embedding:")
    print(sentences_embedding)
    output = mod_inference(sentences_embedding)
    print("output vector:")
    print(output)
    return output


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--embedding",
                        dest="embedding",
                        help="Read embedding from the path",
                        metavar="FILE",
                        required=True)
    parser.add_argument('--seed', nargs='?', dest="seed", const=1,
                        type=int)  # set default random seed to 1
    args = parser.parse_args()

    load_embedding(args.embedding, args.seed)
    inference(
        "in his first stab at the form , jacquot takes a slightly anarchic approach that works only sporadically ."
    )