Example #1
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Project the source embeddings into the target embedding space minimizing the squared Euclidean distances for the given dictionary')
    parser.add_argument('src_embeddings', help='the source embeddings')
    parser.add_argument('trg_embeddings', help='the target embeddings')
    parser.add_argument('-c', '--orthogonal', dest='orthogonal', action='store_true', help='use orthogonal constrained mapping (default)')
    parser.add_argument('-u', '--unconstrained', dest='orthogonal', action='store_false', help='use unconstrained mapping')
    parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)')
    parser.add_argument('-o', '--output', default=sys.stdout.fileno(), help='the output projected embedding file (defaults to stdout)')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.set_defaults(orthogonal=True)
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)
    trg_words, trg_matrix = embeddings.read(trgfile)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src_indices = []
    trg_indices = []
    for line in f:
        src, trg = line.split()
        
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)

    # Learn the linear transformation minimizing the squared Euclidean distances (see paper)
    x = src_matrix[src_indices]
    z = trg_matrix[trg_indices]
    if args.orthogonal:  # orthogonal mapping
        u, s, vt = np.linalg.svd(np.dot(z.T, x))
        w = np.dot(vt.T, u.T)
    else:  # unconstrained mapping
        x_pseudoinv = np.dot(np.linalg.inv(np.dot(x.T, x)), x.T)
        w = np.dot(x_pseudoinv, z)

    # Project and write source embeddings
    f = open(args.output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(src_words, np.dot(src_matrix, w), f)
    embeddings.see_mapping(w)
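
The mapping learned in Example #1 is the classical orthogonal Procrustes solution: for paired matrices X (source rows) and Z (target rows), the squared-Euclidean-optimal orthogonal map is W = V·Uᵀ from the SVD of ZᵀX, while the unconstrained variant is the ordinary least-squares solution. The snippet below is a minimal self-contained sketch of both branches on random data (the sizes and matrices are hypothetical, purely to illustrate the algebra used above):

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((1000, 300))  # hypothetical source vectors for the dictionary pairs
z = rng.standard_normal((1000, 300))  # hypothetical target vectors for the same pairs

# Orthogonal (Procrustes) mapping: W = V U^T from the SVD of Z^T X
u, s, vt = np.linalg.svd(z.T.dot(x))
w_orth = vt.T.dot(u.T)
assert np.allclose(w_orth.dot(w_orth.T), np.eye(x.shape[1]))  # W is orthogonal

# Unconstrained mapping: least-squares solution of X W = Z
# (equivalent to the explicit pseudo-inverse above, but numerically safer)
w_free, _, _, _ = np.linalg.lstsq(x, z, rcond=None)

print('orthogonal residual:   ', np.linalg.norm(x.dot(w_orth) - z))
print('unconstrained residual:', np.linalg.norm(x.dot(w_free) - z))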
Example #2
def map_embedding_db(in_emb_fname,
                     out_emb_fname,
                     vocab_type,
                     mapping_model_dir,
                     latent_space=True):
    """
     Maps all the vocabulary in `in_emb_fname` to target language space using the model in `mapping_model_dir`
     The resultant embeddings are stored in `out_emb_fname`

     vocab_type is one of `src` or `tgt`. Indicates the source or target language as per the trained model. 

     latent_space: If true, the embeddings are mapped to latent space. Otherwise, 
        they are mapped to the embedding space of the other language. 
    """

    print('Loading train data...')
    # Read input embeddings
    with open(in_emb_fname, 'r', encoding='utf-8',
              errors='surrogateescape') as srcfile:
        src_words, x = embeddings.read(srcfile, max_voc=0, dtype='float32')
        src_word2ind = {word: i for i, word in enumerate(src_words)}

    model_params = read_model(mapping_model_dir)
    xw = apply_mapping(x, vocab_type, model_params, latent_space)

    with open(out_emb_fname, 'w', encoding='utf-8') as outfile:
        embeddings.write(src_words, xw, outfile)
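
All of these examples delegate file I/O to an external `embeddings` module that is not shown here. As a rough sketch, assuming the usual word2vec text format (a header line with vocabulary size and dimensionality, then one `word v1 v2 ...` line per entry), `read` and `write` behave roughly as below; this only illustrates the interface the examples call, not the module's actual code:

import numpy as np

def read(f, dtype='float32'):
    # Assumed format: first line "count dim", then "word v1 v2 ... vdim" per line.
    count, dim = map(int, f.readline().split())
    words, rows = [], []
    for _ in range(count):
        word, vec = f.readline().rstrip('\n').split(' ', 1)
        words.append(word)
        rows.append(np.array(vec.split(), dtype=dtype))
    return words, np.array(rows, dtype=dtype)

def write(words, matrix, f):
    # Mirror of read(): header line followed by one row per word.
    print('{} {}'.format(len(words), matrix.shape[1]), file=f)
    for word, row in zip(words, matrix):
        print(word + ' ' + ' '.join('{:.6g}'.format(v) for v in row), file=f)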
Example #3
def deal_my_with_position_File(file_content, fw_path, fEmPosition):
    srcfile = open(fw_path, encoding='utf-8', errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)
    map_my_position = {}
    with open(file_content, 'r', encoding='utf-8') as fr_file:
        fr_lines = fr_file.readlines()
    for singleWord in fr_lines:
        singleWord = singleWord.strip().split('\t')
        map_my_position[singleWord[1]] = singleWord[0]
    for line in range(len(src_words)):
        src_words[line] = map_my_position[src_words[line]]
    targetfile = open(fEmPosition,
                      mode='w',
                      encoding='utf-8',
                      errors='surrogateescape')
    embeddings.write(src_words, src_matrix, targetfile)
Example #4
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Normalize word embeddings')
    parser.add_argument(
        'actions',
        choices=['none', 'unit', 'center', 'unitdim', 'centeremb'],
        nargs='+',
        help='the actions to perform in order')
    parser.add_argument(
        '-i',
        '--input',
        default=sys.stdin.fileno(),
        help='the input word embedding file (defaults to stdin)')
    parser.add_argument(
        '-o',
        '--output',
        default=sys.stdout.fileno(),
        help='the output word embedding file (defaults to stdout)')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings

    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f)

    # Perform normalization actions
    for action in args.actions:
        if action == 'unit':
            matrix = embeddings.length_normalize(matrix)
        elif action == 'center':
            matrix = embeddings.mean_center(matrix)
        elif action == 'unitdim':
            matrix = embeddings.length_normalize_dimensionwise(matrix)
        elif action == 'centeremb':
            matrix = embeddings.mean_center_embeddingwise(matrix)

    # Write normalized embeddings
    f = open(args.output,
             mode='w',
             encoding=args.encoding,
             errors='surrogateescape')
    embeddings.write(words, matrix, f)
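
The four normalization actions in Example #4 reduce to simple row/column operations on the embedding matrix. A minimal sketch, assuming the same semantics as the `embeddings` helpers called above (unit length per word, mean centering per dimension, and their transposed counterparts):

import numpy as np

def length_normalize(matrix):
    # 'unit': scale each embedding (row) to unit Euclidean length.
    norms = np.sqrt(np.sum(matrix ** 2, axis=1, keepdims=True))
    norms[norms == 0] = 1
    return matrix / norms

def mean_center(matrix):
    # 'center': subtract the mean embedding so each dimension (column) has zero mean.
    return matrix - matrix.mean(axis=0, keepdims=True)

def length_normalize_dimensionwise(matrix):
    # 'unitdim': scale each dimension (column) to unit length across the vocabulary.
    norms = np.sqrt(np.sum(matrix ** 2, axis=0, keepdims=True))
    norms[norms == 0] = 1
    return matrix / norms

def mean_center_embeddingwise(matrix):
    # 'centeremb': subtract each embedding's own mean so every row has zero mean.
    return matrix - matrix.mean(axis=1, keepdims=True)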
Example #5
def deal_matrix_words(srcfile, trgfile):

    srcfile = open(srcfile, encoding='utf-8', errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)
    dic_word_matrix = {}
    for line in range(len(src_words)):
        srcSingle = src_words[line].split('_')[0]
        if srcSingle not in dic_word_matrix:
            dic_word_matrix[srcSingle] = [list(src_matrix[line])]
        else:
            dic_word_matrix[srcSingle].append(list(src_matrix[line]))
    words = []
    matrixs = []
    for itemKey, itemValue in dic_word_matrix.items():
        itemValue = np.mean(np.array(itemValue), 0)
        words.append(itemKey)
        matrixs.append(itemValue)
    trgfile = open(trgfile,
                   mode='w',
                   encoding='utf-8',
                   errors='surrogateescape')
    embeddings.write(words, matrixs, trgfile)
Example #6
                      (epoch + 1, i + 1, running_loss / 20))
                running_loss = 0.0

    source_file = open('new_embedding_size200.en',
                       encoding='utf-8',
                       errors='surrogateescape')
    target_file = open('new_embedding_size200.de',
                       encoding='utf-8',
                       errors='surrogateescape')
    en_words, en_vec = embeddings.read(source_file)
    de_words, de_vec = embeddings.read(target_file)

    input_view1, input_view2 = Variable(
        torch.from_numpy(en_vec).cuda()), Variable(
            torch.from_numpy(de_vec).cuda())

    res_envec, res_devec = net(input_view1.float(), input_view2.float())

    src_file = open('res.en', mode='w', encoding='utf-8', errors='surrogateescape')
    trg_file = open('res.de', mode='w', encoding='utf-8', errors='surrogateescape')
    embeddings.write(en_words, res_envec.detach().cpu().numpy(), src_file)
    embeddings.write(de_words, res_devec.detach().cpu().numpy(), trg_file)

    source_file.close()
    target_file.close()
    src_file.close()
    trg_file.close()
    print('Finished Training')
Example #7
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument(
        '-d',
        '--dictionary',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument(
        '-c',
        '--orthogonal',
        dest='orthogonal',
        action='store_true',
        help='use orthogonal constrained mapping (default)')
    mapping_group.add_argument('-u',
                               '--unconstrained',
                               dest='orthogonal',
                               action='store_false',
                               help='use unconstrained mapping')
    parser.set_defaults(orthogonal=True)
    self_learning_group = parser.add_argument_group(
        'self-learning arguments',
        'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='forward',
        help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument(
        '--numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile)
    trg_words, z = embeddings.read(trgfile)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Normalize embeddings
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = np.linalg.svd(np.dot(z[trg_indices].T, x[src_indices]))
            w = np.dot(vt.T, u.T)
        else:  # unconstrained mapping
            x_pseudoinv = np.dot(
                np.linalg.inv(np.dot(x[src_indices].T, x[src_indices])),
                x[src_indices].T)
            w = np.dot(x_pseudoinv, z[trg_indices])
        xw = x.dot(w)

        # Self-learning
        if args.self_learning:

            # Update the training dictionary
            best_sim_forward = np.full(x.shape[0], -100.)
            src_indices_forward = range(x.shape[0])
            trg_indices_forward = np.zeros(x.shape[0], dtype=int)
            best_sim_backward = np.full(z.shape[0], -100.)
            src_indices_backward = np.zeros(z.shape[0], dtype=int)
            trg_indices_backward = range(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                for j in range(0, z.shape[0], MAX_DIM_Z):
                    sim = xw[i:i + MAX_DIM_X].dot(z[j:j + MAX_DIM_Z].T)
                    for k in range(sim.shape[0]):
                        l = sim[k].argmax()
                        if sim[k, l] > best_sim_forward[i + k]:
                            best_sim_forward[i + k] = sim[k, l]
                            trg_indices_forward[i + k] = j + l
                    if args.direction in (
                            'backward', 'union'):  # Slow, only do if necessary
                        for l in range(sim.shape[1]):
                            k = sim[:, l].argmax()
                            if sim[k, l] > best_sim_backward[j + l]:
                                best_sim_backward[j + l] = sim[k, l]
                                src_indices_backward[j + l] = i + k
                    sim = None
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = np.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = np.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = np.mean(best_sim_forward)
            elif args.direction == 'backward':
                objective = np.mean(best_sim_backward)
            elif args.direction == 'union':
                objective = (np.mean(best_sim_forward) +
                             np.mean(best_sim_backward)) / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                accuracy = np.mean([
                    1 if trg_indices_forward[src] in trg else 0
                    for src, trg in validation.items()
                ])
                similarity = np.mean([
                    np.max(z[list(trg)].dot(xw[src]))
                    for src, trg in validation.items()
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 *
                                                               objective),
                      file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                                   similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 *
                                                                   accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, z, trgfile)
    srcfile.close()
    trgfile.close()
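
Example #7 alternates two steps until the objective stops improving by more than the threshold: fit the mapping (Procrustes) on the current dictionary, then re-induce the dictionary by nearest-neighbour search in the mapped space. A compact sketch of that loop on random data, forward direction only (sizes and iteration count are hypothetical; the real code above batches the similarity computation and supports both directions):

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((500, 50)).astype('float32')  # hypothetical source embeddings
z = rng.standard_normal((500, 50)).astype('float32')  # hypothetical target embeddings
src_indices = np.arange(20)  # toy seed dictionary: assume the first 20 words are aligned
trg_indices = np.arange(20)

for it in range(5):
    # Step 1: orthogonal mapping from the current dictionary
    u, s, vt = np.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
    w = vt.T.dot(u.T)
    xw = x.dot(w)

    # Step 2: forward dictionary induction (nearest target word for every source word)
    sim = xw.dot(z.T)
    src_indices = np.arange(x.shape[0])
    trg_indices = sim.argmax(axis=1)
    objective = sim.max(axis=1).mean()
    print('iteration {}: objective {:.4f}'.format(it + 1, objective))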
Example #8
def prefix_oov_embeddings_for_bilingual_dict(train_dict_fname,
                                             test_dict_fname,
                                             src_emb_fname,
                                             tgt_emb_fname,
                                             out_src_emb_fname,
                                             out_tgt_emb_fname,
                                             max_voc=200000):
    """
    Adds the embeddings for OOV words in the training and test dictionaries to the embedding file. 
    This is done by using prefix of the word as well as words for which oov is a prefix. 
    Note that the output embedding file will contain only the OOV words plus 
    the first max_voc words in the original embedding file.
    
    train_dict_fname: 
    test_dict_fname: 
    src_emb_fname: embedding file for source language 
    tgt_emb_fname: embedding file for target language
    out_src_emb_fname: output embedding file for source language 
    out_tgt_emb_fname: output embedding file for target language    
    max_voc: number of vocab items to process from the embedding file

    """

    src_oov_words, src_emb_info, tgt_oov_words, tgt_emb_info = \
        get_oov_info_for_bilingual_dict(train_dict_fname, test_dict_fname,
                       src_emb_fname, tgt_emb_fname, max_voc)

    src_vcb_words, src_emb = src_emb_info
    tgt_vcb_words, tgt_emb = tgt_emb_info

    ## compute embeddings for OOV
    ##### cat queries.txt | ./fasttext print-word-vectors model.bin
    src_oov_final_words, src_oov_emb = compute_prefix_embeddings(
        src_oov_words, (src_vcb_words, src_emb))
    tgt_oov_final_words, tgt_oov_emb = compute_prefix_embeddings(
        tgt_oov_words, (tgt_vcb_words, tgt_emb))

    if (len(src_oov_words) != len(src_oov_final_words)):
        print(
            'WARNING: Embeddings not computed for {} words out of {} OOV source words'
            .format(
                len(src_oov_words) - len(src_oov_final_words),
                len(src_oov_words)))

    if (len(tgt_oov_words) != len(tgt_oov_final_words)):
        print(
            'WARNING: Embeddings not computed for {} words out of {} OOV target words'
            .format(
                len(tgt_oov_words) - len(tgt_oov_final_words),
                len(tgt_oov_words)))

    ## write new embeddings files to disk
    ## put the OOV words first followed by words in the original embeddings file
    with open(out_src_emb_fname, 'w', encoding='utf-8' ) as out_src_emb_file, \
         open(out_tgt_emb_fname, 'w', encoding='utf-8' ) as out_tgt_emb_file:
        embeddings.write(src_oov_final_words + src_vcb_words,
                         np.concatenate([src_oov_emb, src_emb]),
                         out_src_emb_file)
        embeddings.write(tgt_oov_final_words + tgt_vcb_words,
                         np.concatenate([tgt_oov_emb, tgt_emb]),
                         out_tgt_emb_file)
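
Example #8 depends on a `compute_prefix_embeddings` helper that is not shown. Going only by the docstring ("using prefixes of the word as well as words for which the OOV word is a prefix"), one plausible reading is to average the vectors of in-vocabulary words that share a prefix with the OOV word. The sketch below illustrates that idea only; the name, parameters and fallback behaviour are assumptions, not the project's actual helper:

import numpy as np

def prefix_embedding(oov_word, vcb_words, emb, min_prefix=4):
    # Hypothetical: collect vocabulary words that start with a prefix of the OOV word,
    # or that are themselves a prefix of the OOV word, and average their vectors.
    matches = [i for i, w in enumerate(vcb_words)
               if w.startswith(oov_word[:min_prefix]) or oov_word.startswith(w)]
    if not matches:
        return None  # no prefix evidence; the caller would drop this word
    return emb[matches].mean(axis=0)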
Example #9
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
    parser.add_argument('--test-dict', help='the test dictionary file')

    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
    recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system')
    recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization')
    # Note: changed the argument so that dictionary is supplied with -d instead
    recommended_type.add_argument('--acl2017_seed', action='store_true', help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system')
    # still requires specifying a seed dictionary or another init
    recommended_type.add_argument('--ruder_emnlp2018', action='store_true', help='reproduce EMNLP 2018 latent-variable model of Ruder et al.')
    recommended_type.add_argument('--ruder_emnlp2018_backward', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) with matching in backward direction')
    recommended_type.add_argument('--ruder_emnlp2018_artetxe_acl2018_unsupervised', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) combined with the unsupervised initialization of our ACL 2018 system')
    recommended_type.add_argument('--ruder_emnlp2018_artetxe_acl2018', action='store_true', help='reproduce Ruder et al. (EMNLP 2018) combined with our ACL 2018 system (requires a seed dictionary or another initialization)')

    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')

    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability for stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')

    lat_var_group = parser.add_argument_group('arguments for latent-variable model', 'Arguments for latent-variable model')
    lat_var_group.add_argument('--lat-var', action='store_true', help='use the latent-variable model')
    lat_var_group.add_argument('--n-similar', type=int, default=3, help='# of most similar trg indices used for sparsifying in latent-variable model')
    lat_var_group.add_argument('--n-repeats', default=1, type=int, help='repeats embeddings to get 2:2, 3:3, etc. alignment in latent-variable model')
    lat_var_group.add_argument('--asym', default='1:1', help='specify 1:2 or 2:1 for asymmetric matching in latent-variable model')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)

    # reduce stochastic interval
    # note: just backward direction works surprisingly well
    if args.ruder_emnlp2018_artetxe_acl2018_unsupervised:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=40000, csls_neighborhood=10, lat_var=True, n_similar=3, direction='union', stochastic_interval=3)
    if args.ruder_emnlp2018_artetxe_acl2018:
        parser.set_defaults(normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=40000, csls_neighborhood=10, lat_var=True, n_similar=3, direction='union', stochastic_interval=3)
    if args.ruder_emnlp2018:
        parser.set_defaults(orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000, lat_var=True, n_similar=3, vocabulary_cutoff=40000)
    if args.ruder_emnlp2018_backward:
        parser.set_defaults(orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='backward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000, lat_var=True, n_similar=3, vocabulary_cutoff=40000)

    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.init_dictionary, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    if args.verbose:
        print("Info: arguments\n\t" + "\n\t".join(
            ["{}: {}".format(a, v) for a, v in vars(args).items()]),
              file=sys.stderr)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=200000)
    trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=200000)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        if args.verbose:
            print('Using unsupervised initialization...')
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        if args.verbose:
            print('Using numerals as seeds...')
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        if args.verbose:
            print('Using identical strings as seeds...')
            print(f'Found {len(identical)} identical strings.')
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
        print(f'Using a dictionary of size {len(src_indices)}.')

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype)
    simbwd = xp.empty((args.batch_size, src_size), dtype=dtype)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.arange(src_size)
    trg_indices_forward = xp.zeros(src_size, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size, dtype=int)
    trg_indices_backward = xp.arange(trg_size)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    while True:

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it

        # Update the embedding mapping
        if args.orthogonal or not end:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping

            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary
            sims = np.zeros((src_size, trg_size), dtype=dtype)
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    simfwd[:j-i] = dropout(simfwd[:j-i], 1 - keep_prob)
                    if not args.lat_var:
                        # we get a dimension mismatch here as lat_var may produce fewer seeds
                        simfwd[:j-i].argmax(axis=1, out=trg_indices_forward[i:j])
                    sims[i:j] = simfwd
                if args.lat_var:
                    # TODO check if we can save memory by not storing a large sims matrix
                    src_indices_forward, trg_indices_forward = lat_var.lat_var(
                        xp, sims, args.n_similar, args.n_repeats, args.batch_size, args.asym)
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    simbwd[:j-i] = dropout(simbwd[:j-i], 1 - keep_prob)
                    if not args.lat_var:
                        simbwd[:j-i].argmax(axis=1,out=src_indices_backward[i:j])
                    sims[i:j] = simbwd
                if args.lat_var:
                    # swap the order of the indices
                    trg_indices_backward, src_indices_backward = lat_var.lat_var(
                        xp, sims, args.n_similar, args.n_repeats, args.batch_size, args.asym)
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))
            # elif args.direction == 'intersection':
            #     fwd_pairs = zip(src_indices_forward, trg_indices_forward)
            #     bwd_pairs = zip(src_indices_backward, trg_indices_backward)
            #     src_indices, trg_indices = zip(*set(fwd_pairs).intersection(bwd_pairs))
            #     src_indices, trg_indices = xp.array(src_indices), xp.array(trg_indices)

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

        t = time.time()
        it += 1

        if args.test_dict:
            # save the embeddings for evaluation
            with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile,\
                    open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
                embeddings.write(src_words, xw, srcfile)
                embeddings.write(trg_words, zw, trgfile)

            # EVALUATING TRANSLATION
            print('Evaluating translation...')

            # we skip length normalization here

            # Read dictionary and compute coverage
            f = open(args.test_dict, encoding=args.encoding,
                     errors='surrogateescape')
            src2trg = collections.defaultdict(set)
            oov = set()
            vocab = set()
            for line in f:
                src, trg = line.split()
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    src2trg[src_ind].add(trg_ind)
                    vocab.add(src)
                except KeyError:
                    oov.add(src)
            src = list(src2trg.keys())
            oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
            coverage = len(src2trg) / (len(src2trg) + len(oov))

            BATCH_SIZE = 500

            # Find translations
            translation = collections.defaultdict(int)

            # we just use nearest neighbour for retrieval
            for i in range(0, len(src), BATCH_SIZE):
                j = min(i + BATCH_SIZE, len(src))
                similarities = xw[src[i:j]].dot(zw.T)
                nn = similarities.argmax(axis=1).tolist()
                for k in range(j - i):
                    translation[src[i + k]] = nn[k]

            # Compute accuracy
            accuracy = np.mean(
                [1 if translation[i] in src2trg[i] else 0 for i in src])
            print('Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(coverage, accuracy))

    # Write mapped embeddings
    with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
        embeddings.write(src_words, xw, srcfile)
        embeddings.write(trg_words, zw, trgfile)
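
The `--csls` option in Example #9 replaces plain cosine retrieval with Cross-domain Similarity Local Scaling: each cosine score is penalised by the mean similarity of the two words to their k nearest neighbours in the other language, which discounts hub words. The batched loops above subtract only the half-penalty that varies along the argmax axis (as the inline comment notes, this leaves nearest-neighbour retrieval unchanged). A small dense sketch of the full score, assuming length-normalized inputs:

import numpy as np

def csls_scores(xw, zw, k=10):
    # CSLS(x, z) = 2*cos(x, z) - r_src(x) - r_trg(z), where r(.) is the mean cosine
    # similarity to the k nearest neighbours in the other language.
    sim = xw.dot(zw.T)                                 # cosine similarities (unit-length rows)
    r_src = np.sort(sim, axis=1)[:, -k:].mean(axis=1)  # mean top-k per source word
    r_trg = np.sort(sim, axis=0)[-k:, :].mean(axis=0)  # mean top-k per target word
    return 2 * sim - r_src[:, np.newaxis] - r_trg[np.newaxis, :]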
Example #10
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('dict_output', default='dictionary.pkl', help='the output dictionary pickle file')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')

    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
    recommended_type.add_argument('--future', action='store_true', help='experiment with stuff')
    recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system')
    recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization')
    recommended_type.add_argument('--acl2017_seed', metavar='DICTIONARY', help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system')

    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')

    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')
    
    future_group = parser.add_argument_group('experimental arguments', 'Experimental arguments')
    future_group.add_argument('--max_align', type=int, default=1, help='Number of top-ranked elements to align to each word (defaults to 1, i.e. the standard single-best alignment)')
    future_group.add_argument('--align_weight', choices=['unit', 'rr', 'softmax'], default='rr', help='Weights assigned to ranked elements in maximization phase (unit - no weighting; rr - reciprocal rank; softmax - NOT IMPLEMENTED YET)')

    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability for stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', default='map.log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    
    if args.unsupervised or args.future:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10, max_align=2, align_weight='rr')
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.acl2017_seed, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    print('reading embeddings...')
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)
    print('embeddings read')

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
        print('CUDA loaded')
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map (only relevant in supervised learning or with validation)
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    print(f'mapped {len(src_words)} source words')
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}
    print(f'mapped {len(trg_words)} target words')

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)
    print('normalization complete')

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
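        # Fully unsupervised seeding: for each language, build the intra-lingual
        # similarity matrix sqrt(X X^T) = U S U^T from the SVD X = U S V^T, sort each
        # row, and match words whose sorted similarity distributions look most alike
        # across the two languages.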
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
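        # CSLS hubness correction: penalize words that are close to everything by
        # subtracting half of their average similarity to their k nearest neighbors
        # in the other language.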
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
        print(f'initialized unsupervised dictionary')
    elif args.init_numerals:
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
        print('initialized numeral dictionary')
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
        print('initialized identical dictionary')
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
        f.close()
        print('initialized seed dictionary')

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))
        print(f'loaded validation dictionary with {validation_coverage:.3f} coverage')

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')
        print(f'logging into {args.log}')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((min(src_size, args.batch_size), trg_size), dtype=dtype)
    simbwd = xp.empty((min(trg_size, args.batch_size), src_size), dtype=dtype)
    #argsimsf = xp.empty((min(src_size, args.batch_size), args.max_align), dtype=int)
    #argsimsb = xp.empty((min(trg_size, args.batch_size), args.max_align), dtype=int)
    argsimsf = xp.empty((min(src_size, args.batch_size), 1), dtype=int)
    argsimsb = xp.empty((min(trg_size, args.batch_size), 1), dtype=int)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.array(list(range(src_size)) * args.max_align)
    trg_indices_forward = xp.zeros(src_size * args.max_align, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size * args.max_align, dtype=int)
    trg_indices_backward = xp.array(list(range(trg_size)) * args.max_align)
    xr = xp.zeros(((src_size+trg_size) * args.max_align, x.shape[1]), dtype=dtype)  # assumes "both" param
    zr = xp.zeros(((src_size+trg_size) * args.max_align, z.shape[1]), dtype=dtype)  # assumes "both" param
    all_coefs = xp.zeros(((src_size+trg_size) * args.max_align, 1), dtype=dtype)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    print('starting training')
    while True:
        if it % 50 == 0:
            print(f'starting iteration {it}')

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it

        # Update the embedding mapping (only affecting vectors that have dictionary mappings)
        if args.orthogonal or not end:  # orthogonal mapping
            if it == 1:
                # only initialized alignment available
                u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            else:
                if args.align_weight == 'softmax':
                    ### TODO individualized softmax coefficients ###
                    raise NotImplementedError('Softmax weights not supported yet')
                else:
                    ### TODO I'm assuming here that the alignment method is 'both', so everything's double
                    ### TODO all_coefs can be computed outside the iteration loop
                    # format: src_size_0, ..., src_size_k-1, trg_size_0, ..., trg_size_k-1
                    ncopies = args.max_align
                    cutoffs = list(range(src_size*ncopies)[::src_size]) \
                              + list(range(src_size*ncopies,(src_size+trg_size)*ncopies)[::trg_size])
                    if args.align_weight == 'rr':
                        coefs = [1. / (k+1) for k in range(ncopies)] * 2            
                    else:  # 'unit'
                        coefs = [1.] * (ncopies * 2)
                    for cf, co_s, co_e in zip(coefs, cutoffs, cutoffs[1:] + [len(all_coefs)]):
                        all_coefs[co_s:co_e] = cf
                    zr = z[trg_indices] * all_coefs
                    xr = x[src_indices] * all_coefs
                    u, s, vt = xp.linalg.svd(zr.T.dot(xr))
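            # Closed-form orthogonal Procrustes update: given the SVD computed above,
            # w = vt.T @ u.T is the rotation that best aligns the (weighted) dictionary
            # pairs in the least-squares sense.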
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
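            # Unconstrained least squares: w = (X^T X)^{-1} X^T Z solves min ||Xw - Z||_F
            # over the dictionary pairs without any orthogonality constraint.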
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping (default for end, acl2018)

            # remove lower-rank transformations
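            # Keep only the top-ranked forward and backward pairs from the last induction
            # step; the advanced mapping below assumes a single alignment per word.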
            midpoint = src_size * args.max_align
            src_indices = xp.concatenate((src_indices[:src_size], src_indices[midpoint:midpoint+trg_size]))
            trg_indices = xp.concatenate((trg_indices[:src_size], trg_indices[midpoint:midpoint+trg_size]))
            
            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z
            
            ### TODO entry point for adding more matrix operations ###

            # STEP 1: Whitening
            ### TODO figure out how weighted k-best affects this (and onwards) ###
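            # Whitening: with the SVD m = U S Vt, the transform Vt.T @ diag(1/s) @ Vt maps
            # the dictionary vectors to (approximately) unit covariance.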
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
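            # The SVD of the whitened cross-covariance gives two rotations (wx2, wz2) that
            # align both spaces onto a common set of axes.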
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
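            # Scale each shared dimension by its singular value raised to the chosen power
            # (0 = no re-weighting, 1 = full re-weighting).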
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction (default: OFF (0))
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary (default direction - union)
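            # Induce a new dictionary by (stochastic) nearest-neighbor search under CSLS,
            # computing the similarity matrix in row batches to bound memory usage.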
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:  # default acl2018: 10
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)  # get next batch to operate on
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    
                    # k-best selection: repeatedly take the dropout-masked argmax to pick the top max_align targets per source word
                    #argsimsf[:] = dropout(-simfwd[:j-i], 1 - keep_prob).argsort(axis=1)[:,:args.max_align]
                    for k in range(args.max_align):
                        argsimsf = dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1)
                        simfwd[xp.arange(j-i), argsimsf] = -200
                        trg_indices_forward[(k*src_size)+i:(k*src_size)+j] = argsimsf
                        #trg_indices_forward[(k*src_size)+i:(k*src_size)+j] = argsimsf[:,k]
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)  # get next batch to operate on
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    
                    # k-best selection: repeatedly take the dropout-masked argmax to pick the top max_align sources per target word
                    #argsimsb[:] = dropout(-simbwd[:j-i], 1 - keep_prob).argsort(axis=1)[:,:args.max_align]
                    for k in range(args.max_align):
                        argsimsb = dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1)
                        simbwd[xp.arange(j-i), argsimsb] = -200
                        src_indices_backward[(k*trg_size)+i:(k*trg_size)+j] = argsimsb
                        #src_indices_backward[(k*trg_size)+i:(k*trg_size)+j] = argsimsb[:,k]
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':  # default
                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation (default - off)
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, zw, trgfile)
    srcfile.close()
    trgfile.close()
    
    # Write dictionary
    dictfile = open(args.dict_output, mode='wb')
    dictalign = list(zip(src_indices, trg_indices))
    pickle.dump(dictalign, dictfile)
    dictfile.close()
Ejemplo n.º 11
0
            trg_words.append(trg)
            trg_ind = trg_word2ind[trg]

            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg),
                  file=sys.stderr)

    # origEnVecs=preprocessing.normalize(en_vec)
    # origForeignVecs=preprocessing.normalize(de_vec)

    subsetEnVecs = en_vec[src_indices]
    subsetForeignVecs = de_vec[trg_indices]

    srcfile = open('en.train',
                   mode='w',
                   encoding='utf-8',
                   errors='surrogateescape')
    trgfile = open('de.train',
                   mode='w',
                   encoding='utf-8',
                   errors='surrogateescape')

    embeddings.write(src_words, subsetEnVecs, srcfile)
    embeddings.write(trg_words, subsetForeignVecs, trgfile)
    source_file.close()
    target_file.close()
    srcfile.close()
    trgfile.close()
Ejemplo n.º 12
0
    source_file = open('new_embedding_size640.en', encoding='utf-8', errors='surrogateescape')
    target_file = open('new_embedding_size640.de', encoding='utf-8', errors='surrogateescape')
    en_words, en_vec = embeddings.read(source_file)
    de_words, de_vec = embeddings.read(target_file)

    en_vec = embeddings.length_normalize(en_vec)
    de_vec = embeddings.length_normalize(de_vec)

    input_view1, input_view2 = Variable(torch.from_numpy(en_vec).cuda()), Variable(torch.from_numpy(de_vec).cuda())

    res_envec, x1, res_devec, x2 = net(input_view1.float(), input_view2.float())
    print(x1)

    src_file = open('BiAE.en', mode='w', encoding='utf-8', errors='surrogateescape')
    trg_file = open('BiAE.de', mode='w', encoding='utf-8', errors='surrogateescape')

    # res_envec = embeddings.length_normalize(res_envec.data.cpu().numpy())
    # res_devec = embeddings.length_normalize(res_devec.data.cpu().numpy())

    res_envec = (res_envec.data.cpu().numpy())
    res_devec = (res_devec.data.cpu().numpy())

    embeddings.write(en_words, res_envec, src_file)
    embeddings.write(de_words, res_devec, trg_file)

    source_file.close()
    target_file.close()
    src_file.close()
    trg_file.close()
    print('Finished Training')
Ejemplo n.º 13
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('sense_input', help='the input sense mapping matrix')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('tsns_output',
                        default='tsns.pkl',
                        help='the output target senses pickle file')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed (defaults to 0)')

    recommended_group = parser.add_argument_group(
        'recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument(
        '--unsupervised',
        action='store_true',
        help=
        'recommended if you have no seed dictionary and do not want to rely on identical words'
    )
    recommended_type.add_argument('--future',
                                  action='store_true',
                                  help='experiment with stuff')
    recommended_type.add_argument('--toy',
                                  action='store_true',
                                  help='experiment with stuff on toy dataset')
    recommended_type.add_argument('--acl2018',
                                  action='store_true',
                                  help='reproduce our ACL 2018 system')

    init_group = parser.add_argument_group(
        'advanced initialization arguments',
        'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('--init_unsupervised',
                           action='store_true',
                           help='use unsupervised initialization')
    init_group.add_argument(
        '--unsupervised_vocab',
        type=int,
        default=0,
        help=
        'restrict the vocabulary to the top k entries for unsupervised initialization'
    )

    mapping_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'none'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten',
                               action='store_true',
                               help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction',
                               type=int,
                               default=0,
                               help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')

    self_learning_group = parser.add_argument_group(
        'advanced self-learning arguments',
        'Advanced arguments for self-learning')
    self_learning_group.add_argument(
        '--vocabulary_cutoff',
        type=int,
        default=0,
        help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--stochastic_initial',
        default=0.1,
        type=float,
        help=
        'initial keep probability for stochastic dictionary induction (defaults to 0.1)'
    )
    self_learning_group.add_argument(
        '--stochastic_multiplier',
        default=2.0,
        type=float,
        help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument(
        '--stochastic_interval',
        default=50,
        type=int,
        help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument(
        '--log',
        default='map.log',
        help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')

    future_group = parser.add_argument_group('experimental arguments',
                                             'Experimental arguments')
    future_group.add_argument('--skip_top',
                              type=int,
                              default=0,
                              help='Top k words to skip (presumably function words)')
    future_group.add_argument(
        '--start_src',
        action='store_true',
        help='Algorithm starts by tuning sense embeddings based on source')
    future_group.add_argument('--trim_senses',
                              action='store_true',
                              help='Trim sense table to working vocab')
    future_group.add_argument(
        '--lamb',
        type=float,
        default=0.5,
        help='Weight hyperparameter for sense alignment objectives')
    future_group.add_argument('--reglamb',
                              type=float,
                              default=1.,
                              help='Lasso regularization hyperparameter')
    future_group.add_argument(
        '--ccreglamb',
        type=float,
        default=0.1,
        help='Sense embedding regularization hyperparameter')
    future_group.add_argument('--inv_delta',
                              type=float,
                              default=0.0001,
                              help='Delta_I added for inverting sense matrix')
    future_group.add_argument('--lasso_iters',
                              type=int,
                              default=10,
                              help='Number of iterations for LASSO/NMF')
    future_group.add_argument('--iterations',
                              type=int,
                              default=-1,
                              help='Number of overall model iterations')
    future_group.add_argument('--trg_batch',
                              type=int,
                              default=5000,
                              help='Batch size for target steps')
    future_group.add_argument(
        '--trg_knn',
        action='store_true',
        help='Perform target sense mapping by k-nearest neighbors')
    future_group.add_argument(
        '--trg_sns_csls',
        type=int,
        default=10,
        help='K-nearest neighbors for CSLS target sense search')
    future_group.add_argument(
        '--senses_per_trg',
        type=int,
        default=1,
        help='K-max target sense mapping (default = 1 = off)')
    future_group.add_argument(
        '--gd',
        action='store_true',
        help='Apply gradient descent for assignment and synset embeddings')
    future_group.add_argument('--gd_lr',
                              type=float,
                              default=1e-2,
                              help='Learning rate for SGD (default=0.01)')
    future_group.add_argument('--gd_wd',
                              action='store_true',
                              help='Weight decay in SGD')
    future_group.add_argument(
        '--gd_wd_hl',
        type=int,
        default=100,
        help='Weight decay half-life in SGD, default=100')
    future_group.add_argument(
        '--gd_clip',
        type=float,
        default=5.,
        help='Per-coordinate gradient clipping (default=5)')
    future_group.add_argument(
        '--gd_map_steps',
        type=int,
        default=1,
        help='Consecutive steps for each target-sense mapping update phase')
    future_group.add_argument(
        '--gd_emb_steps',
        type=int,
        default=1,
        help='Consecutive steps for each sense embedding update phase')
    future_group.add_argument(
        '--base_prox_lambda',
        type=float,
        default=0.99,
        help='Lambda for proximal gradient in lasso step')
    future_group.add_argument(
        '--prox_decay',
        action='store_true',
        help='Multiply proximal lambda by itself each iteration')
    future_group.add_argument(
        '--sense_limit',
        type=float,
        default=1.1,
        help=
        'Maximum amount of target sense mappings, in terms of source mappings (default=1.1x)'
    )
    future_group.add_argument(
        '--gold_pairs',
        help='Gold data for evaluation, if exists (not for tuning)')
    future_group.add_argument(
        '--gold_threshold',
        type=float,
        default=0.0,
        help='Threshold for gold mapping (0 is fine if sparse)')

    future_group.add_argument('--debug', action='store_true')

    args = parser.parse_args()

    # pre-setting groups
    if args.toy:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=50,
                            trim_senses=True,
                            inv_delta=1.,
                            reglamb=0.2,
                            lasso_iters=100,
                            gd_wd=True,
                            log='map-toy.log')
    if args.unsupervised or args.future:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=2000,
                            trim_senses=True,
                            gd_wd=True)
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            vocabulary_cutoff=20000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None
            or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'  # many operations not supported by cupy
    elif args.precision == 'fp32':  # default
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    print('reading embeddings...')
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)
    print('embeddings read')

    # Read input source sense mapping
    print('reading sense mapping')
    src_senses = pickle.load(open(args.sense_input, 'rb'))
    if src_senses.shape[0] != x.shape[0]:
        src_senses = csr_matrix(src_senses.transpose()
                                )  # using non-cuda scipy because of 'inv' impl
    #src_senses = get_sparse_module(src_senses)
    print(
        f'source sense mapping of shape {src_senses.shape} loaded with {src_senses.getnnz()} nonzeros'
    )

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
        print('CUDA loaded')
    else:
        xp = np
    xp.random.seed(args.seed)

    # removed word to index map (only relevant in supervised learning or with validation)

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)
    print('normalization complete')

    # removed building the seed dictionary

    # removed validation step

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
        print(f'logging into {args.log}')

    # Allocate memory

    # Initialize the projection matrices W(s) = W(t) = I.
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    xw[:] = x
    zw[:] = z

    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(
        x.shape[0] - args.skip_top, args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(
        z.shape[0] - args.skip_top, args.vocabulary_cutoff)
    emb_dim = x.shape[1]

    cutoff_end = min(src_size + args.skip_top, x.shape[0])

    if args.trim_senses:
        # reshape sense assignment
        src_senses = src_senses[args.skip_top:cutoff_end]

        # new columns for words with no senses in original input
        ### TODO might also need this if not trimming (probably kinda far away)
        newcols = [csc_matrix(([1],([i],[0])),shape=(src_size,1)) for i in range(src_size)\
                   if src_senses.getrow(i).getnnz() == 0]
        #with open(f'data/synsets/dummy_synsets_v3b_{src_size}','wb') as dummy_cols_file:
        #    dummy_col_idcs = [i for i in range(src_size) if src_senses.getrow(i).getnnz() == 0]
        #    pickle.dump(np.array(dummy_col_idcs), dummy_cols_file)

        # trim senses no longer used, add new ones
        colsums = src_senses.sum(axis=0).tolist()[0]
        kept_senses = [i for i, j in enumerate(colsums) if j > 0]
        #with open(f'data/synsets/kept_synsets_v3b_{src_size}','wb') as kept_save_file:
        #    pickle.dump(np.array(kept_senses), kept_save_file)
        src_senses = hstack([src_senses[:, kept_senses]] + newcols)
        print(
            f'trimmed sense dictionary dimensions: {src_senses.shape} with {src_senses.getnnz()} nonzeros'
        )
    sense_size = src_senses.shape[1]

    if args.gold_pairs is not None:
        with open(args.gold_pairs, 'rb') as gold_pairs_f:
            gold_pairs = pickle.load(gold_pairs_f)
            gold_pairs = [(i-args.skip_top,j) for i,j in gold_pairs \
                          if i >= args.skip_top and i < src_senses.shape[0] and j < src_senses.shape[1]]
        gold_trgs = sorted(set([x[0] for x in gold_pairs]))
        gold_senses = sorted(set([x[1] for x in gold_pairs]))
        gold_domain_size = len(gold_trgs) * len(gold_senses)
        print(
            f'evaluating on {len(gold_pairs)} pairs with {len(gold_trgs)} unique words and {len(gold_senses)} unique senses'
        )

    # Initialize the concept embeddings from the source embeddings
    ### TODO maybe try gradient descent instead?
    ### TODO (pre-)create non-singular alignment matrix
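    # Initialize each sense embedding as the regularized least-squares fit of the source
    # embeddings to their sense assignments: cc = (A^T A + delta*I)^{-1} A^T X, where A is
    # src_senses (psinv presumably returns the regularized inverse of A^T A).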
    cc = xp.empty((sense_size, emb_dim), dtype=dtype)  # \tilde{E}
    t01 = time.time()
    print('starting psinv calc')
    src_sns_psinv = psinv(src_senses, dtype, args.inv_delta)
    xecc = x[args.skip_top:cutoff_end].T.dot(
        get_sparse_module(src_senses).toarray()).T  # sense_size * emb_dim
    cc[:] = src_sns_psinv.dot(xecc)
    print(f'initialized concept embeddings in {time.time()-t01:.2f} seconds',
          file=sys.stderr)
    if args.verbose:
        # report precision of the pseudo-inverse operation, checked by inverting
        pseudo_id = src_senses.transpose().dot(src_senses).dot(
            src_sns_psinv.get())
        real_id = sparse_id(sense_size)
        rel_diff = (pseudo_id - real_id).sum() / (sense_size * sense_size)
        print(f'per-coordinate pseudo-inverse precision is {rel_diff:.5f}')

    ### TODO initialize trg_senses using seed dictionary instead?
    trg_sns_size = trg_size if args.trim_senses else z.shape[0]
    trg_senses = csr_matrix(
        (trg_sns_size,
         sense_size))  # using non-cuda scipy because of 'inv' impl
    zecc = xp.empty_like(xecc)  # sense_size * emb_dim
    #tg_grad = xp.empty((trg_sns_size, sense_size))

    if args.gd:
        # everything can be done on gpu
        src_senses = get_sparse_module(src_senses, dtype=dtype)
        trg_senses = get_sparse_module(trg_senses, dtype=dtype)
        if args.sense_limit > 0.0:
            trg_sense_limit = int(args.sense_limit * src_senses.getnnz())
            if args.verbose:
                print(
                    f'limiting target side to {trg_sense_limit} sense mappings'
                )
        else:
            trg_sense_limit = -1

    ### TODO return memory assignment for similarities?

    # Training loop
    if args.gd:
        prox_lambda = args.base_prox_lambda
    else:
        lasso_model = Lasso(alpha=args.reglamb, fit_intercept=False, max_iter=args.lasso_iters,\
                            positive=True, warm_start=True)  # TODO more parametrization

    if args.log is not None:
        if args.gd:
            print(f'gradient descent lr: {args.gd_lr}', file=log)
            print(f'base proximal lambda: {args.base_prox_lambda}', file=log)
        else:
            print(f'lasso regularization: {args.reglamb}', file=log)
            print(f'lasso iterations: {args.lasso_iters}', file=log)
            print(f'inversion epsilon: {args.inv_delta}', file=log)
        if args.gold_pairs is not None:
            print(f'gold mappings: {len(gold_pairs)}', file=log)
        print(
            f'Iteration\tObjective\tSource\tTarget\tL_1\tDuration\tNonzeros\tCorrect_mappings',
            file=log)
        log.flush()

    best_objective = objective = 1000000000.
    correct_mappings = -1
    regularization_lambda = args.base_prox_lambda if args.gd else args.reglamb
    it = 1
    last_improvement = 0
    t = time.time()
    map_gd_lr = args.gd_lr
    emb_gd_lr = args.gd_lr
    end = False
    print('starting training')

    if args.start_src:
        print('starting with converging synset embeddings')
        it_range = range(
            args.iterations
        )  ### TODO possibly add arg, but there's early stopping
        if not args.verbose:
            it_range = tqdm(it_range)
        prev_obj = float('inf')
        for pre_it in it_range:
            if args.gd_wd:
                emb_gd_lr = args.gd_lr * pow(0.5, floor(
                    pre_it / args.gd_wd_hl))

            # Synset embedding
            cc_grad = src_senses.T.dot(
                xw[args.skip_top:cutoff_end] -
                src_senses.dot(cc)) - args.ccreglamb * cc
            cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad)
            cc += emb_gd_lr * cc_grad

            # Source projection
            u, s, vt = xp.linalg.svd(cc.T.dot(xecc))
            wx = vt.T.dot(u.T).astype(dtype)
            x.dot(wx, out=xw)

            pre_objective = ((xp.linalg.norm(
                xw[args.skip_top:cutoff_end] -
                get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2
            pre_objective = float(pre_objective)

            if args.verbose and pre_it > 0 and pre_it % 10 == 0:
                print(
                    f'source synset embedding objective iteration {pre_it}: {pre_objective:.3f}'
                )

            if pre_objective > prev_obj:
                print(
                    f'stopping at pre-iteration {pre_it}, source-sense objective {prev_obj:.3f}'
                )
                # revert
                cc -= emb_gd_lr * cc_grad
                break

            prev_obj = pre_objective

    while True:
        if it % 50 == 0:
            print(
                f'starting iteration {it}, last objective was {objective}, correct mappings at {correct_mappings}'
            )

        # Reset the improvement counter if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            last_improvement = it

        if args.iterations > 0 and it > args.iterations:
            end = True

        ### update target assignments (6) - lasso-esque regression
        time6 = time.time()
        # optimize: 0.5 * (xp.linalg.norm(zw[i] - trg_senses[i].dot(cc))^2) + (regularization_lambda * xp.linalg.norm(trg_senses[i],1))

        if args.trg_knn:
            # for csls-based neighborhoods
            knn_sense = xp.full(sense_size, -100)
            for i in range(0, sense_size, args.trg_batch):
                batch_end = min(i + args.trg_batch, sense_size)
                sim_sense_trg = cc[i:batch_end].dot(
                    zw[args.skip_top:cutoff_end].T)
                knn_sense[i:batch_end] = topk_mean(sim_sense_trg,
                                                   k=args.trg_sns_csls,
                                                   inplace=True)

            # calculate new target mappings
            trg_senses = lil_matrix(trg_senses.shape)
            for i in range(0, trg_size, args.trg_batch):
                sns_batch_end = min(i + args.trg_batch, trg_size)
                z_i = i + args.skip_top
                z_batch_end = min(sns_batch_end + args.skip_top, zw.shape[0])

                sims = zw[z_i:z_batch_end].dot(cc.T)
                sims -= knn_sense / 2  # equivalent to the real CSLS scores for NN
                best_idcs = sims.argmax(1).tolist()
                trg_senses[(list(range(i, sns_batch_end)),
                            best_idcs)] = sims.max(1).tolist()

                # second-to-lth-best
                for l in range(args.senses_per_trg - 1):
                    sims[(list(range(sims.shape[0])), best_idcs)] = 0.
                    best_idcs = sims.argmax(1).tolist()
                    trg_senses[(list(range(i, sns_batch_end)),
                                best_idcs)] = sims.max(1).tolist()

            trg_senses = get_sparse_module(trg_senses.tocsr())

        elif args.gd:
            ### TODO add args.skip_top calculations
            if args.gd_wd:
                true_it = (it - 1) * args.gd_map_steps
                map_gd_lr = args.gd_lr * pow(
                    0.5, floor((1 + true_it) / args.gd_wd_hl))
                if args.verbose:
                    print(f'mapping learning rate: {map_gd_lr}')

            for k in range(args.gd_map_steps):
                # st <- st + eta * (ew - st.dot(es)).dot(es.T)
                # allow up to sense_limit updates, clip gradient

                batch_grads = []
                for i in range(0, trg_size, args.trg_batch):
                    batch_end = min(i + args.trg_batch, trg_size)
                    tg_grad_b = (zw[i:batch_end] -
                                 trg_senses[i:batch_end].dot(cc)).dot(cc.T)

                    # proximal gradient
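                    # Offset the gradient by the sparsity penalty prox_lambda and keep only
                    # the non-positive entries before the update below, a proximal-style
                    # thresholding that keeps trg_senses sparse and non-negative.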
                    tg_grad_b += prox_lambda
                    tg_grad_b.clip(None, 0.0, out=tg_grad_b)
                    batch_grads.append(batch_sparse(tg_grad_b))

                tg_grad = get_sparse_module(vstack(batch_grads))
                del tg_grad_b

                if args.prox_decay:
                    prox_lambda *= args.base_prox_lambda

                ### TODO consider weight decay here as well (args.gd_wd)
                trg_senses -= map_gd_lr * tg_grad

                # allow up to sense_limit nonzeros
                if trg_sense_limit > 0:
                    trg_senses = trim_sparse(trg_senses,
                                             trg_sense_limit,
                                             clip=None)

            ### TODO consider finishing up with lasso (maybe only in final iteration)

        else:
            ### TODO add args.skip_top calculations
            # parallel LASSO (no cuda impl)
            cccpu = cc.get().T  # emb_dim * sense_size
            lasso_model.fit(cccpu, zw[:trg_size].get().T)
            ### TODO maybe trim, keep only above some threshold (0.05) OR top f(#it)
            trg_senses = lasso_model.sparse_coef_

        if args.verbose:
            print(
                f'target sense mapping step: {(time.time()-time6):.2f} seconds, {trg_senses.getnnz()} nonzeros',
                file=sys.stderr)
            objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro') ** 2)\
                            + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \
                        + regularization_lambda * trg_senses.sum()  # TODO consider thresholding reg part
            objective = float(objective)
            print(f'objective: {objective:.3f}')

        # Write target sense mapping
        with open(f'tmp_outs/{args.tsns_output[:-4]}-it{it:03d}.pkl',
                  mode='wb') as tsnsfile:
            pickle.dump(trg_senses.get(), tsnsfile)

        ### update synset embeddings (10)
        time10 = time.time()
        if args.gd and args.gd_emb_steps > 0:
            ### TODO probably handle sizes and/or threshold sparse matrix
            if args.gd_wd:
                true_it = (it - 1) * args.gd_emb_steps
                emb_gd_lr = args.gd_lr * pow(
                    0.5, floor((1 + true_it) / args.gd_wd_hl))
                if args.verbose:
                    print(f'embedding learning rate: {emb_gd_lr}')

            ### replace block for no-source-tuning mode
            all_senses = trg_senses if args.start_src else get_sparse_module(
                vstack((src_senses.get(), trg_senses.get()), format='csr'),
                dtype=dtype)
            aw = zw[args.
                    skip_top:cutoff_end] if args.start_src else xp.concatenate(
                        (xw[args.skip_top:cutoff_end],
                         zw[args.skip_top:cutoff_end]))

            for i in range(args.gd_emb_steps):
                cc_grad = all_senses.T.dot(
                    aw - all_senses.dot(cc)) - args.ccreglamb * cc
                cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad)
                cc += emb_gd_lr * cc_grad

        else:
            ### TODO add args.skip_top calculations
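            # Closed-form update of the shared sense embeddings: stack the source and target
            # assignment matrices and solve the regularized least-squares fit against the
            # concatenated mapped embeddings via the pseudo-inverse.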
            all_senses = get_sparse_module(
                vstack((src_senses, trg_senses), format='csr'))
            xzecc = xp.concatenate((xw[:src_size], zw[:trg_size])).T\
                        .dot(all_senses.toarray()).T  # sense_size * emb_dim
            all_sns_psinv = psinv(
                all_senses.get(), dtype, args.inv_delta
            )  ### TODO only update target side? We still have src_sns_psinv [it doesn't matter, dimensions are the same]
            cc[:] = all_sns_psinv.dot(xzecc)

        if args.verbose:
            print(f'synset embedding update: {time.time()-time10:.2f}',
                  file=sys.stderr)
            objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro')) ** 2\
                            + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \
                        + regularization_lambda * trg_senses.sum()  # TODO consider thresholding reg part
            objective = float(objective)
            print(f'objective: {objective:.3f}')

        ### update projections (3,5)
        # write to zw and xw
        if args.orthogonal or not end:

            ### remove block for no-source-tuning mode
            # source side - mappings don't change so xecc is constant
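            # Orthogonal Procrustes update of the source rotation: wx = vt.T @ u.T from the
            # SVD of cc.T.dot(xecc) keeps mapped source words close to their assigned sense
            # embeddings.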
            #if not args.start_src:  # need to do this anyway whenever cc updates
            time3 = time.time()
            u, s, vt = xp.linalg.svd(cc.T.dot(xecc))
            wx = vt.T.dot(u.T).astype(dtype)
            x.dot(wx, out=xw)
            if args.verbose:
                print(f'source projection update: {time.time()-time3:.2f}',
                      file=sys.stderr)

            # target side - compute sense mapping first
            time3 = time.time()
            zecc.fill(0.)
            for i in range(0, trg_size, args.trg_batch):
                end_idx = min(i + args.trg_batch, trg_size)
                zecc += z[i:end_idx].T.dot(
                    get_sparse_module(trg_senses[i:end_idx]).toarray()).T
            u, s, vt = xp.linalg.svd(cc.T.dot(zecc))
            wz = vt.T.dot(u.T).astype(dtype)
            z.dot(wz, out=zw)
            if args.verbose:
                print(f'target projection update: {time.time()-time3:.2f}',
                      file=sys.stderr)

        ### TODO add parts from 'advanced mapping' part - transformations, whitening, etc.

        # Objective function evaluation
        time_obj = time.time()
        trg_senses_l1 = float(trg_senses.sum())
        src_obj = (float(
            xp.linalg.norm(
                xw[args.skip_top:cutoff_end] -
                get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2
        trg_obj = (float(
            xp.linalg.norm(
                zw[args.skip_top:cutoff_end] -
                get_sparse_module(trg_senses).dot(cc), 'fro'))**2) / 2
        objective = src_obj + trg_obj + regularization_lambda * trg_senses_l1  # TODO consider thresholding reg part
        if args.verbose:
            print(f'objective calculation: {time.time()-time_obj:.2f}',
                  file=sys.stderr)

        if objective - best_objective <= -args.threshold:
            last_improvement = it
            best_objective = objective

        # WordNet transduction evaluation (can't tune on this)
        if args.gold_pairs is not None:
            np_trg_senses = trg_senses.get()
            trg_corr = [
                p for p in gold_pairs if np_trg_senses[p] > args.gold_threshold
            ]
            correct_mappings = len(trg_corr)
            domain_trgs = np_trg_senses[gold_trgs][:, gold_senses]
        else:
            correct_mappings = -1

        # Logging
        duration = time.time() - t
        if args.verbose:
            print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                  file=sys.stderr)
            print('objective: {0:.3f}'.format(objective), file=sys.stderr)
            print('target senses l_1 norm: {0:.3f}'.format(trg_senses_l1),
                  file=sys.stderr)
            if len(gold_pairs) > 0 and domain_trgs.getnnz() > 0:
                print(
                    f'{correct_mappings} correct target mappings: {(correct_mappings/len(gold_pairs)):.3f} recall, {(correct_mappings/domain_trgs.getnnz()):.3f} precision',
                    file=sys.stderr)
            print(file=sys.stderr)
            sys.stderr.flush()
        if args.log is not None:
            print(
                f'{it}\t{objective:.3f}\t{src_obj:.3f}\t{trg_obj:.3f}\t{trg_senses_l1:.3f}\t{duration:.3f}\t{trg_senses.getnnz()}\t{correct_mappings}',
                file=log)
            log.flush()

        if end:
            break

        t = time.time()
        it += 1

    # Write mapped embeddings
    with open(args.src_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as srcfile:
        embeddings.write(src_words, xw, srcfile)
    with open(args.trg_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as trgfile:
        embeddings.write(trg_words, zw, trgfile)

    # Write target sense mapping
    with open(args.tsns_output, mode='wb') as tsnsfile:
        pickle.dump(trg_senses.get(), tsnsfile)
Ejemplo n.º 14
0
def main():
    # Parse command line arguments
    # https://docs.python.org/3/library/argparse.html
    parser = argparse.ArgumentParser(
        description='Map word embeddings in two languages into a shared space')
    # description - This argument gives a brief description of what the program does and how it works.
    parser.add_argument('src_input', help='the input source embeddings')
    # help - A brief description of what the argument does.
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    # -- optional
    # default - The value produced if the argument is absent from the command line.
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    # choices - A container of the allowable values for the argument.
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    # action - The basic type of action to be taken when this argument is encountered at the command line.
    # store_true - stores True when the flag is present on the command line
    parser.add_argument(
        '--batch_size',
        default=1000,
        type=int,
        help=
        'batch size (defaults to 1000); does not affect results, larger is usually faster but uses more memory'
    )
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed (defaults to 0)')
    parser.add_argument('--draw',
                        action='store_true',
                        help='use seaborn to draw')

    recommended_group = parser.add_argument_group(
        'recommended settings', 'Recommended settings for different scenarios')
    # add_argument_group() - returns an argument group object which has an add_argument() method just like a regular ArgumentParser.
    # this gives a clearer conceptual grouping of arguments than the default groups
    recommended_type = recommended_group.add_mutually_exclusive_group()
    # argparse will make sure that only one of the arguments in the mutually exclusive group was present on the command line
    recommended_type.add_argument(
        '--supervised',
        metavar='DICTIONARY',
        help='recommended if you have a large training dictionary')
    recommended_type.add_argument(
        '--semi_supervised',
        metavar='DICTIONARY',
        help='recommended if you have a small seed dictionary')
    recommended_type.add_argument(
        '--identical',
        action='store_true',
        help=
        'recommended if you have no seed dictionary but can rely on identical words'
    )
    recommended_type.add_argument(
        '--unsupervised',
        action='store_true',
        help=
        'recommended if you have no seed dictionary and do not want to rely on identical words'
    )
    recommended_type.add_argument('--acl2018',
                                  action='store_true',
                                  help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018',
                                  metavar='DICTIONARY',
                                  help='reproduce our AAAI 2018 system')
    # metavar - a name for the argument in usage messages
    recommended_type.add_argument(
        '--acl2017',
        action='store_true',
        help='reproduce our ACL 2017 system with numeral initialization')
    recommended_type.add_argument(
        '--acl2017_seed',
        metavar='DICTIONARY',
        help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016',
                                  metavar='DICTIONARY',
                                  help='reproduce our EMNLP 2016 system')

    init_group = parser.add_argument_group(
        'advanced initialization arguments',
        'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument(
        '-d',
        '--init_dictionary',
        default=sys.stdin.fileno(),
        metavar='DICTIONARY',
        help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical',
                           action='store_true',
                           help='use identical words as the seed dictionary')
    init_type.add_argument(
        '--init_numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    init_type.add_argument('--init_unsupervised',
                           action='store_true',
                           help='use unsupervised initialization')
    init_group.add_argument(
        '--unsupervised_vocab',
        type=int,
        default=0,
        help=
        'restrict the vocabulary to the top k entries for unsupervised initialization'
    )

    mapping_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'none'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    # no normalization by default
    mapping_group.add_argument('--whiten',
                               action='store_true',
                               help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight',
                               type=float,
                               default=0,
                               nargs='?',
                               const=1,
                               help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten',
                               choices=['src', 'trg'],
                               help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction',
                               type=int,
                               default=0,
                               help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u',
                              '--unconstrained',
                              action='store_true',
                              help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group(
        'advanced self-learning arguments',
        'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--vocabulary_cutoff',
        type=int,
        default=0,
        help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='union',
        help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls',
                                     type=int,
                                     nargs='?',
                                     default=0,
                                     const=10,
                                     metavar='NEIGHBORHOOD_SIZE',
                                     dest='csls_neighborhood',
                                     help='use CSLS for dictionary induction')
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        metavar='DICTIONARY',
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--stochastic_initial',
        default=0.1,
        type=float,
        help=
        'initial keep probability stochastic dictionary induction (defaults to 0.1)'
    )
    self_learning_group.add_argument(
        '--stochastic_multiplier',
        default=2.0,
        type=float,
        help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument(
        '--stochastic_interval',
        default=50,
        type=int,
        help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            self_learning=True,
                            vocabulary_cutoff=20000,
                            csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            self_learning=True,
                            vocabulary_cutoff=20000,
                            csls_neighborhood=10)
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True,
                            unsupervised_vocab=4000,
                            normalize=['unit', 'center', 'unit'],
                            whiten=True,
                            src_reweight=0.5,
                            trg_reweight=0.5,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            self_learning=True,
                            vocabulary_cutoff=20000,
                            csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018,
                            normalize=['unit', 'center'],
                            whiten=True,
                            trg_reweight=1,
                            src_dewhiten='src',
                            trg_dewhiten='trg',
                            batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True,
                            orthogonal=True,
                            normalize=['unit', 'center'],
                            self_learning=True,
                            direction='forward',
                            stochastic_initial=1.0,
                            stochastic_interval=1,
                            batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.acl2017_seed,
                            orthogonal=True,
                            normalize=['unit', 'center'],
                            self_learning=True,
                            direction='forward',
                            stochastic_initial=1.0,
                            stochastic_interval=1,
                            batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016,
                            orthogonal=True,
                            normalize=['unit', 'center'],
                            batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None
            or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    # fix random seed
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    dict_size = 5000
    if args.init_unsupervised:
        sim_size = min(x.shape[0],
                       z.shape[0]) if args.unsupervised_vocab <= 0 else min(
                           x.shape[0], z.shape[0], args.unsupervised_vocab)
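        # Fully unsupervised initialization: from X = U S V^T, xsim = U S U^T is the
        # square root of the monolingual similarity matrix X X^T. Sorting each row
        # makes the similarity distributions comparable across languages regardless
        # of word order, so matching rows of xsim and zsim yields a rough seed
        # dictionary.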
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u * s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u * s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
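        # CSLS retrieval: penalize hub words by subtracting half of the mean
        # similarity to each word's k nearest neighbours in the other language.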
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis] / 2 + knn_sim_bwd / 2
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate(
                (xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate(
                (sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        numeral_regex = re.compile('^[0-9]+$')
        # ^ anchors the match at the start of the word and $ at the end,
        # so only tokens made up entirely of the digits 0-9 are matched
        # http://www.runoob.com/python/python-reg-expressions.html
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.init_dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)
            if len(src_indices) == dict_size:
                break
    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    # choose to cut-off or not
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(
        x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(
        z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype)
    simbwd = xp.empty((args.batch_size, src_size), dtype=dtype)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.arange(src_size)
    trg_indices_forward = xp.zeros(src_size, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size, dtype=int)
    trg_indices_backward = xp.arange(trg_size)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    while True:

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        # for numeral initialization: if the objective does not improve after 1 iteration, stop directly
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier * keep_prob)
            last_improvement = it

        # Update the embedding mapping
        if args.orthogonal or not end:  # orthogonal mapping
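            # Closed-form orthogonal Procrustes solution: with U S V^T the SVD of
            # Z_d^T X_d over the current dictionary pairs, W = vt.T.dot(u.T)
            # minimizes ||X_d W - Z_d||_F subject to W being orthogonal.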
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
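            # Unconstrained least squares: W = (X_d^T X_d)^-1 X_d^T Z_d, i.e. the
            # Moore-Penrose pseudo-inverse of the dictionary rows applied to Z_d.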
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(
                x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping

            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z

            # STEP 1: Whitening
            def whitening_transformation(m):
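                # Whitening matrix: with m = U S V^T, returns V diag(1/s) V^T, so
                # that (m W) has an identity second-moment matrix (assuming m has
                # full column rank).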
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1 / s)).dot(vt)

            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(
                zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
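            # s holds the singular values of the (whitened) dictionary
            # cross-covariance; raising them to src_reweight/trg_reweight re-weights
            # each shared direction by how strongly the two languages agree along it.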
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j - i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j - i],
                                                     k=args.csls_neighborhood,
                                                     inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j - i])
                    simfwd[:j - i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j - i] -= knn_sim_bwd / 2  # Equivalent to the real CSLS scores for NN
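                    # Stochastic dictionary induction: randomly zero similarities
                    # with probability 1 - keep_prob before the argmax, encouraging
                    # exploration while keep_prob < 1.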
                    dropout(simfwd[:j - i],
                            1 - keep_prob).argmax(axis=1,
                                                  out=trg_indices_forward[i:j])
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j - i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j - i],
                                                     k=args.csls_neighborhood,
                                                     inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j - i])
                    simbwd[:j - i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j - i] -= knn_sim_fwd / 2  # Equivalent to the real CSLS scores for NN
                    dropout(simbwd[:j - i], 1 - keep_prob).argmax(
                        axis=1, out=src_indices_backward[i:j])
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) +
                             xp.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([
                    1 if nn[i] in validation[src[i]] else 0
                    for i in range(len(src))
                ])
                similarity = np.mean([
                    max([simval[i, j].tolist() for j in validation[src[i]]])
                    for i in range(len(src))
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 *
                                                               objective),
                      file=sys.stderr)
                print(
                    '\t- Drop probability: {0:9.4f}%'.format(100 -
                                                             100 * keep_prob),
                    file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                                   similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 *
                                                                   accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # draw distribution of language space
    if args.draw:
        PCA_model = PCA(n_components=2)
        x_PCA = PCA_model.fit_transform(asnumpy(xw))
        x1 = [feature[0] for feature in x_PCA]
        y1 = [feature[1] for feature in x_PCA]
        z_PCA = PCA_model.fit_transform(asnumpy(zw))
        x2 = [feature[0] for feature in z_PCA]
        y2 = [feature[1] for feature in z_PCA]
        '''
        # draw with plt
        plt.scatter(x2, y2, s=10, c='r', alpha=0.4)
        plt.scatter(x1, y1, s=10, c='b', alpha=0.2)
        plt.savefig('./share_space.png')
        '''
        # draw with seaborn
        plt.figure()
        sns.jointplot(x1, y1, kind='hex', color='b')
        plt.savefig('./src_mapped_emb.png')
        plt.figure()
        sns.jointplot(x2, y2, kind='hex', color='g')
        plt.savefig('./trg_mapped_emb.png')

    # Write mapped embeddings
    srcfile = open(args.src_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, zw, trgfile)
    srcfile.close()
    trgfile.close()
Ejemplo n.º 15
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Generate latent space embeddings')
    parser.add_argument('emb1', help='path to embedding 1')
    parser.add_argument('emb2', help='path to embedding 2')
    parser.add_argument(
        '--geomm_embeddings_path',
        default=None,
        type=str,
        help=
        'directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.'
    )
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('--dictionary',
                               default=sys.stdin.fileno(),
                               help='the dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'no'],
        nargs=2,
        default=[],
        help=
        'the normalization actions performed in sequence for embeddings 1 and 2'
    )

    geomm_group = parser.add_argument_group('GeoMM arguments',
                                            'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg',
                             type=float,
                             default=1e2,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument(
        '--max_opt_time',
        type=int,
        default=5000,
        help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument(
        '--max_opt_iter',
        type=int,
        default=150,
        help='Maximum number of iterations for optimization')

    args = parser.parse_args()

    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading embeddings data...')

    # Read input embeddings
    emb1file = open(args.emb1,
                    encoding=args.encoding,
                    errors='surrogateescape')
    emb2file = open(args.emb2,
                    encoding=args.encoding,
                    errors='surrogateescape')
    emb1_words, x = embeddings.read(emb1file, max_voc=0, dtype=dtype)
    emb2_words, z = embeddings.read(emb2file, max_voc=0, dtype=dtype)

    # Build word to index map
    emb1_word2ind = {word: i for i, word in enumerate(emb1_words)}
    emb2_word2ind = {word: i for i, word in enumerate(emb2_words)}

    noov = 0
    emb1_indices = []
    emb2_indices = []
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        emb1, emb2 = line.split()
        try:
            emb1_ind = emb1_word2ind[emb1]
            emb2_ind = emb2_word2ind[emb2]
            emb1_indices.append(emb1_ind)
            emb2_indices.append(emb2_ind)
        except KeyError:
            noov += 1
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    emb1, emb2))  #, file=sys.stderr
    f.close()
    if args.verbose:
        print('Number of embedding pairs having at least one OOV: {}'.format(
            noov))
    emb1_indices = emb1_indices
    emb2_indices = emb2_indices
    if args.verbose:
        print('Normalizing embeddings...')

    # STEP 0: Normalization
    if len(args.normalize) > 0:
        x = normalize_emb(x, args.normalize[0])
        z = normalize_emb(z, args.normalize[1])

    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    x_count = len(set(emb1_indices))
    z_count = len(set(emb2_indices))

    # Filter out uniq values
    map_dict_emb1 = {}
    map_dict_emb2 = {}
    I = 0
    uniq_emb1 = []
    uniq_emb2 = []
    for i in range(len(emb1_indices)):
        if emb1_indices[i] not in map_dict_emb1.keys():
            map_dict_emb1[emb1_indices[i]] = I
            I += 1
            uniq_emb1.append(emb1_indices[i])
    J = 0
    for j in range(len(emb2_indices)):
        if emb2_indices[j] not in map_dict_emb2.keys():
            map_dict_emb2[emb2_indices[j]] = J
            J += 1
            uniq_emb2.append(emb2_indices[j])

    # Creating dictionary matrix
    row = list(range(0, x_count))
    col = list(range(0, x_count))
    data = [1 for i in range(0, x_count)]
    print(f"Counts: {x_count}, {z_count}")
    A = coo_matrix((data, (row, col)), shape=(x_count, z_count))

    np.random.seed(0)
    Lambda = args.l2_reg

    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()

    Xemb1 = x[uniq_emb1]
    Zemb2 = z[uniq_emb2]
    del x, z
    gc.collect()

    Kx, Kz = Xemb1, Zemb2
    XtAZ = Kx.T.dot(A.dot(Kz))
    XtX = Kx.T.dot(Kx)
    ZtZ = Kz.T.dot(Kz)
    AA = np.sum(A * A)

    W = (U1.dot(B)).dot(U2.T)
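    # GeoMM objective with W = U1 B U2^T: ||X W Z^T - A||_F^2 expands to
    # trace(W^T XtX W ZtZ) - 2 <W, XtAZ> + ||A||_F^2, so the cost below only needs
    # the precomputed Gram matrices, plus the (Lambda/2) ||B||_F^2 regularizer.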
    regularizer = 0.5 * Lambda * (TT.sum(B**2))
    sXtX = shared(XtX)
    sZtZ = shared(ZtZ)
    sXtAZ = shared(XtAZ)

    cost = regularizer
    wtxtxw = W.T.dot(sXtX.dot(W))
    wtxtxwztz = wtxtxw.dot(sZtZ)
    cost += TT.nlinalg.trace(wtxtxwztz)
    cost += -2 * TT.sum(W * sXtAZ)
    cost += shared(AA)

    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    manifold = Product([
        Stiefel(Kx.shape[1], Kx.shape[1]),
        Stiefel(Kz.shape[1], Kz.shape[1]),
        PositiveDefinite(Kx.shape[1])
    ])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)
    print(f"Problem solved ...")

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    print(f"Model copied ...")

    gc.collect()

    # Step 2: Transformation
    xw = Kx.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = Kz.dot(U2).dot(scipy.linalg.sqrtm(B))
    print(f"Transformation done ...")

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time -
                                                             start_time))

    del Kx, Kz, B, U1, U2
    gc.collect()

    ### Save the GeoMM embeddings if requested
    xw_n = embeddings.length_normalize(xw)
    zw_n = embeddings.length_normalize(zw)

    del xw, zw
    gc.collect()

    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path, exist_ok=True)

        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb1.vec')
        new_emb1_words = []
        for id in uniq_emb1:
            new_emb1_words.append(emb1_words[id])
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(new_emb1_words, xw_n, outfile)

        new_emb2_words = []
        for id in uniq_emb2:
            new_emb2_words.append(emb2_words[id])
        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb2.vec')
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(new_emb2_words, zw_n, outfile)

    exit(0)
Ejemplo n.º 16
0
    en_vec = embeddings.length_normalize(en_vec)
    de_vec = embeddings.length_normalize(de_vec)

    input_view1, input_view2 = Variable(
        torch.from_numpy(en_vec).cuda()), Variable(
            torch.from_numpy(de_vec).cuda())
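    # 'net' is assumed to be the linear mapping network trained earlier in this
    # script (not shown in the snippet); applying it projects the English vectors
    # into the shared/German embedding space.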

    res_envec = net(input_view1.float())

    src_file = open('LinearMappingres.en',
                    mode='w',
                    encoding='utf-8',
                    errors='surrogateescape')
    trg_file = open('LinearMappingres.de',
                    mode='w',
                    encoding='utf-8',
                    errors='surrogateescape')

    res_envec = embeddings.length_normalize(res_envec.data.cpu().numpy())

    embeddings.write(en_words, res_envec, src_file)
    embeddings.write(de_words,
                     input_view2.float().data.cpu().numpy(), trg_file)

    source_file.close()
    target_file.close()
    src_file.close()
    trg_file.close()
    print('Finished Training')
    # print(net.view1_fc.weight.data)
Ejemplo n.º 17
0
def share_embedding(words, matrix):
    f = open('word.txt', mode='w', encoding='utf-8', errors='surrogateescape')
    embeddings.write(words, matrix, f)
Ejemplo n.º 18
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('--model_path', default=None, type=str, help='directory to save the model')
    parser.add_argument('--geomm_embeddings_path', default=None, type=str, help='directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--max_vocab', default=0,type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0,type=int, help='Verbose')
    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('-dtrain', '--dictionary_train', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtest', '--dictionary_test', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg', type=float,default=1e2, help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int,default=5000, help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int,default=150, help='Maximum number of iterations for optimization')

    eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size', type=int,default=1000, help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood', type=int,default=10, help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    ## Logging
    #method_name = os.path.join('logs','geomm')
    #directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    #if not os.path.exists(directory):
    #    os.makedirs(directory)
    #log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train))
    #log_file_name = log_file_name + '.log'
    #class Logger(object):
    #    def __init__(self):
    #        self.terminal = sys.stdout
    #        self.log = open(os.path.join(directory,log_file_name), "a")

    #    def write(self, message):
    #        self.terminal.write(message)
    #        self.log.write(message)

    #    def flush(self):
    #        #this flush method is needed for python 3 compatibility.
    #        #this handles the flush command by doing nothing.
    #        #you might want to specify some extra behavior here.
    #        pass
    #sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading train data...')
    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile,max_voc=args.max_vocab, dtype=dtype)
    trg_words, z = embeddings.read(trgfile,max_voc=args.max_vocab, dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    noov=0
    src_indices = []
    trg_indices = []
    f = open(args.dictionary_train, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        src,trg = line.split()
        if args.max_vocab:
            src=src.lower()
            trg=trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            noov+=1
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg)) #, file=sys.stderr
    f.close()
    if args.verbose:
        print('Number of training pairs having at least one OOV: {}'.format(noov))
    src_indices = src_indices
    trg_indices = trg_indices
    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)


    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    x_count = len(set(src_indices))
    z_count = len(set(trg_indices))
    A = np.zeros((x_count,z_count))

    # Creating dictionary matrix from training set
    map_dict_src={}
    map_dict_trg={}
    I=0
    uniq_src=[]
    uniq_trg=[]
    for i in range(len(src_indices)):
        if src_indices[i] not in map_dict_src.keys():
            map_dict_src[src_indices[i]]=I
            I+=1
            uniq_src.append(src_indices[i])
    J=0
    for j in range(len(trg_indices)):
        if trg_indices[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices[j]]=J
            J+=1
            uniq_trg.append(trg_indices[j])

    for i in range(len(src_indices)):
        A[map_dict_src[src_indices[i]],map_dict_trg[trg_indices[i]]]=1

    np.random.seed(0)
    Lambda=args.l2_reg

    U1 = TT.matrix()
    U2 = TT.matrix()
    B  = TT.matrix()

    Kx, Kz = x[uniq_src], z[uniq_trg]
    XtAZ = Kx.T.dot(A.dot(Kz))
    XtX = Kx.T.dot(Kx)
    ZtZ = Kz.T.dot(Kz)
    # AA = np.sum(A*A) # this can be added if cost needs to be compared to original geomm

    W = (U1.dot(B)).dot(U2.T)
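    # GeoMM cost with W = U1 B U2^T: ||X W Z^T - A||_F^2 expands to
    # trace(W^T XtX W ZtZ) - 2 <W, XtAZ> up to the constant ||A||_F^2 (dropped
    # here, see the comment above), plus the (Lambda/2) ||B||_F^2 regularizer.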
    regularizer = 0.5*Lambda*(TT.sum(B**2))
    sXtX = shared(XtX)
    sZtZ = shared(ZtZ)
    sXtAZ = shared(XtAZ)

    cost = regularizer
    wtxtxw = W.T.dot(sXtX.dot(W))
    wtxtxwztz = wtxtxw.dot(sZtZ)
    cost += TT.nlinalg.trace(wtxtxwztz)
    cost += -2 * TT.sum(W * sXtAZ)
    # cost += shared(AA) # this can be added if cost needs to be compared with original geomm

    solver = ConjugateGradient(maxtime=args.max_opt_time,maxiter=args.max_opt_iter)

    manifold =Product([Stiefel(x.shape[1], x.shape[1]),Stiefel(z.shape[1], x.shape[1]),PositiveDefinite(x.shape[1])])
    #manifold =Product([Stiefel(x.shape[1], 200),Stiefel(z.shape[1], 200),PositiveDefinite(200)])
    problem = Problem(manifold=manifold, cost=cost, arg=[U1,U2,B], verbosity=3)
    wopt = solver.solve(problem)

    w= wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    ### Save the models if requested
    if args.model_path is not None:
        os.makedirs(args.model_path,exist_ok=True)
        np.savetxt('{}/U_src.csv'.format(args.model_path),U1)
        np.savetxt('{}/U_tgt.csv'.format(args.model_path),U2)
        np.savetxt('{}/B.csv'.format(args.model_path),B)

    # Step 2: Transformation
    xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time-start_time))
    gc.collect()

    ### Save the GeoMM embeddings if requested
    xw_n = embeddings.length_normalize(xw)
    zw_n = embeddings.length_normalize(zw)
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path,exist_ok=True)

        out_emb_fname=os.path.join(args.geomm_embeddings_path,'src.vec')
        with open(out_emb_fname,'w',encoding=args.encoding) as outfile:
            embeddings.write(src_words,xw_n,outfile)

        out_emb_fname=os.path.join(args.geomm_embeddings_path,'trg.vec')
        with open(out_emb_fname,'w',encoding=args.encoding) as outfile:
            embeddings.write(trg_words,zw_n,outfile)

    # Step 3: Evaluation
    if args.normalize_eval:
        xw = xw_n
        zw = zw_n

    X = xw[src_indices]
    Z = zw[trg_indices]

    # Loading test dictionary
    f = open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src=src.lower()
            trg=trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())

    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    ### compute nearest neighbours of x in z
    t=time.time()
    nbrhood_x=np.zeros(xw.shape[0])

    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1)
        nbrhood_x[src[i:j]]=np.mean(similarities_x[:,:args.csls_neighbourhood],axis=1)

    ### compute nearest neighbours of z in x (GPU version)
    nbrhood_z=np.zeros(zw.shape[0])
    with cp.cuda.Device(0):
        nbrhood_z2=cp.zeros(zw.shape[0])
        batch_num=1
        for i in range(0, zw.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, zw.shape[0])
            similarities = -1*cp.partition(-1*cp.dot(cp.asarray(zw[i:j]),cp.transpose(cp.asarray(xw))),args.csls_neighbourhood-1 ,axis=1)[:,:args.csls_neighbourhood]
            nbrhood_z2[i:j]=(cp.mean(similarities[:,:args.csls_neighbourhood],axis=1))
            batch_num+=1
        nbrhood_z=cp.asnumpy(nbrhood_z2)

    #### compute nearest neighbours of z in x (CPU version)
    #nbrhood_z=np.zeros(zw.shape[0])
    #for i in range(0, len(zw.shape[0]), BATCH_SIZE):
    #    j = min(i + BATCH_SIZE, len(zw.shape[0]))
    #    similarities = zw[i:j].dot(xw.T)
    #    similarities_z = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1)
    #    nbrhood_z[i:j]=np.mean(similarities_z[:,:args.csls_neighbourhood],axis=1)

    #### find translation
    #for i in range(0, len(src), BATCH_SIZE):
    #    j = min(i + BATCH_SIZE, len(src))
    #    similarities = xw[src[i:j]].dot(zw.T)
    #    similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
    #    nn = similarities.argmax(axis=1).tolist()
    #    similarities = np.argsort((similarities),axis=1)

    #    nn5 = (similarities[:,-5:])
    #    nn10 = (similarities[:,-10:])
    #    for k in range(j-i):
    #        translation[src[i+k]] = nn[k]
    #        translation5[src[i+k]] = nn5[k]
    #        translation10[src[i+k]] = nn10[k]


    #if args.geomm_embeddings_path is not None:
    #    delim=','
    #    os.makedirs(args.geomm_embeddings_path,exist_ok=True)

    #    translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv')
    #    with open(translations_fname,'w',encoding=args.encoding) as translations_file:
    #        for src_id in src:
    #            src_word = src_words[src_id]
    #            all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ]
    #            trgout_words = [ trg_words[j] for j in translation10[src_id] ]
    #            ss = list(nn10[src_id,:])
    #
    #            p1 = ':'.join(all_trg_words)
    #            p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] )
    #            translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, delim=delim, p1=p1, p2=p2) )

    ### find translation  (and write to file if output requested)
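    # CSLS scoring: score(x, z) = 2*sim(x, z) - r_trg(x) - r_src(z), where sim is the
    # dot product (cosine if --normalize_eval was used) and r_* are the mean
    # similarities to the k nearest neighbours computed above (nbrhood_x, nbrhood_z);
    # the top-1/5/10 target candidates are then collected.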
    delim=','
    translations_file =None
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path,exist_ok=True)
        translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv')
        translations_file = open(translations_fname,'w',encoding=args.encoding)

    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities),axis=1)

        nn5 = (similarities[:,-5:])
        nn10 = (similarities[:,-10:])
        for k in range(j-i):
            translation[src[i+k]] = nn[k]
            translation5[src[i+k]] = nn5[k]
            translation10[src[i+k]] = nn10[k]


            if args.geomm_embeddings_path is not None:
                src_id=src[i+k]
                src_word = src_words[src_id]
                all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ]
                trgout_words = [ trg_words[j] for j in translation10[src_id] ]
                #ss = list(nn10[src_id,:])

                p1 = ':'.join(all_trg_words)
                p2 = ':'.join(trgout_words)
                #p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] )
                translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, p1=p1, p2=p2, delim=delim) )

    if args.geomm_embeddings_path is not None:
        translations_file.close()

    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean=0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean+=1
                break

    mean/=len(src)
    accuracy5 = mean

    mean=0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean+=1
                break

    mean/=len(src)
    accuracy10 = mean
    message = args.src_input.split(".")[-2] + "-->" + args.trg_input.split(".")[-2] + ": " + \
        'Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(coverage, accuracy)
    print(message)
Ejemplo n.º 19
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp64',
                        help='the floating-point precision (defaults to fp64)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument(
        '-d',
        '--dictionary',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u',
                              '--unconstrained',
                              action='store_true',
                              help='use unconstrained mapping')
    self_learning_group = parser.add_argument_group(
        'self-learning arguments',
        'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='forward',
        help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument(
        '--numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    advanced_group = parser.add_argument_group(
        'advanced mapping arguments',
        'Advanced embedding mapping arguments (AAAI 2018)')
    advanced_group.add_argument('--whiten',
                                action='store_true',
                                help='whiten the embeddings')
    advanced_group.add_argument(
        '--src_reweight',
        type=float,
        default=0,
        nargs='?',
        const=1,
        help='re-weight the source language embeddings')
    advanced_group.add_argument(
        '--trg_reweight',
        type=float,
        default=0,
        nargs='?',
        const=1,
        help='re-weight the target language embeddings')
    advanced_group.add_argument(
        '--src_dewhiten',
        choices=['src', 'trg'],
        help='de-whiten the source language embeddings')
    advanced_group.add_argument(
        '--trg_dewhiten',
        choices=['src', 'trg'],
        help='de-whiten the target language embeddings')
    advanced_group.add_argument('--dim_reduction',
                                type=int,
                                default=0,
                                help='apply dimensionality reduction')
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None
            or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            xw = x.dot(w)
            zw = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(
                x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            xw = x.dot(w)
            zw = z
        else:  # advanced mapping
            xw = x
            zw = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1 / s)).dot(vt)

            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(
                zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if args.self_learning:

            # Update the training dictionary
            best_sim_forward = xp.full(x.shape[0], -100, dtype=dtype)
            src_indices_forward = xp.arange(x.shape[0])
            trg_indices_forward = xp.zeros(x.shape[0], dtype=int)
            best_sim_backward = xp.full(z.shape[0], -100, dtype=dtype)
            src_indices_backward = xp.zeros(z.shape[0], dtype=int)
            trg_indices_backward = xp.arange(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                j = min(x.shape[0], i + MAX_DIM_X)
                for k in range(0, z.shape[0], MAX_DIM_Z):
                    l = min(z.shape[0], k + MAX_DIM_Z)
                    sim = xw[i:j].dot(zw[k:l].T)
                    if args.direction in ('forward', 'union'):
                        ind = sim.argmax(axis=1)
                        val = sim[xp.arange(sim.shape[0]), ind]
                        ind += k
                        mask = (val > best_sim_forward[i:j])
                        best_sim_forward[i:j][mask] = val[mask]
                        trg_indices_forward[i:j][mask] = ind[mask]
                    if args.direction in ('backward', 'union'):
                        ind = sim.argmax(axis=0)
                        val = sim[ind, xp.arange(sim.shape[1])]
                        ind += i
                        mask = (val > best_sim_backward[k:l])
                        best_sim_backward[k:l][mask] = val[mask]
                        src_indices_backward[k:l][mask] = ind[mask]
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) +
                             xp.mean(best_sim_backward)).tolist() / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                sim = xw[src].dot(zw.T)  # TODO Assuming that it fits in memory
                nn = asnumpy(sim.argmax(axis=1))
                accuracy = np.mean([
                    1 if nn[i] in validation[src[i]] else 0
                    for i in range(len(src))
                ])
                similarity = np.mean([
                    max([sim[i, j].tolist() for j in validation[src[i]]])
                    for i in range(len(src))
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 *
                                                               objective),
                      file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                                   similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 *
                                                                   accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, zw, trgfile)
    srcfile.close()
    trgfile.close()
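
The SVD update in the training loop above is the classical orthogonal Procrustes solution: for dictionary-aligned matrices X_D and Z_D it computes U S V^T = SVD(Z_D^T X_D) and sets W = V U^T, the orthogonal matrix that best maps X_D onto Z_D in the least-squares sense. A small self-contained sketch of that step on synthetic data (the helper name is illustrative):

import numpy as np

def orthogonal_procrustes(x_dict, z_dict):
    # Same closed-form update as the SVD step above: W = V U^T with
    # U S V^T = SVD(Z_D^T X_D), subject to W^T W = I.
    u, s, vt = np.linalg.svd(z_dict.T.dot(x_dict))
    return vt.T.dot(u.T)

# Toy usage: recover a random orthogonal map from noisy paired vectors.
rng = np.random.RandomState(0)
d = 10
q, _ = np.linalg.qr(rng.randn(d, d))            # ground-truth orthogonal map
x_dict = rng.randn(200, d)
z_dict = x_dict.dot(q) + 0.01 * rng.randn(200, d)
w = orthogonal_procrustes(x_dict, z_dict)
print(np.allclose(w, q, atol=0.1))              # w is close to the true map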
Ejemplo n.º 20
0
def add_oov_embeddings(train_dict_fname,
                       test_dict_fname,
                       src_emb_fname,
                       tgt_emb_fname,
                       out_src_emb_fname,
                       out_tgt_emb_fname,
                       src_model_path,
                       tgt_model_path,
                       fast_text_binary_path,
                       max_voc=200000,
                       emb_format='txt'):
    """
    Adds the embeddings for OOV words in the training and test dictionaries to the embedding file. 
    This is done by computing the embeddings using FastText. So, this method applies to FastText 
    embeddings only. Note that the output embedding file will contain only the OOV words plus 
    the first max_voc words in the original embedding file.
    
    train_dict_fname: 
    test_dict_fname: 
    src_emb_fname: embedding file for source language 
    tgt_emb_fname: embedding file for target language
    out_src_emb_fname: output embedding file for source language 
    out_tgt_emb_fname: output embedding file for target language    
    src_model_path: fasttext model for source language 
    tgt_model_path: fasttext model for targetqa language 
    fast_text_binary_path: path to fasttext binary
    max_voc: number of vocab items to process from the embedding file
    emb_format: format of embedding files. Currently supported: 'txt' - standard fast text format
    """

    ## read dictionaries
    train_dict = read_dict(train_dict_fname)
    test_dict = read_dict(test_dict_fname)

    # read embeddings
    src_vcb_words = None
    src_emb = None
    tgt_vcb_words = None
    tgt_emb = None

    with open(src_emb_fname, 'r', encoding='utf-8' ) as src_emb_file, \
         open(tgt_emb_fname, 'r', encoding='utf-8' ) as tgt_emb_file:
        src_vcb_words, src_emb = embeddings.read(src_emb_file, max_voc)
        tgt_vcb_words, tgt_emb = embeddings.read(tgt_emb_file, max_voc)

    ## find OOVs
    src_oov_words = set()
    src_oov_words.update(train_dict.keys())
    src_oov_words.update(test_dict.keys())
    src_oov_words.difference_update(src_vcb_words)
    print('Number of src OOV words: {}'.format(len(src_oov_words)))

    tgt_oov_words = set()
    tgt_oov_words.update(train_dict.values())
    tgt_oov_words.update(test_dict.values())
    tgt_oov_words.difference_update(tgt_vcb_words)
    print('Number of tgt OOV words: {}'.format(len(tgt_oov_words)))

    ## compute embeddings for OOV
    ##### cat queries.txt | ./fasttext print-word-vectors model.bin
    src_oov_final_words, src_oov_emb = compute_fasttext_embeddings(
        src_oov_words, src_model_path, fast_text_binary_path)
    tgt_oov_final_words, tgt_oov_emb = compute_fasttext_embeddings(
        tgt_oov_words, tgt_model_path, fast_text_binary_path)

    if (len(src_oov_words) != len(src_oov_final_words)):
        print(
            'WARNING: Embeddings not computed for {} words out of {} OOV source words'
            .format(
                len(src_oov_words) - len(src_oov_final_words),
                len(src_oov_words)))

    if (len(tgt_oov_words) != len(tgt_oov_final_words)):
        print(
            'WARNING: Embeddings not computed for {} words out of {} OOV target words'
            .format(
                len(tgt_oov_words) - len(tgt_oov_final_words),
                len(tgt_oov_words)))

    ## write new embeddings files to disk
    ## put the OOV words first followed by words in the original embeddings file
    with open(out_src_emb_fname, 'w', encoding='utf-8' ) as out_src_emb_file, \
         open(out_tgt_emb_fname, 'w', encoding='utf-8' ) as out_tgt_emb_file:
        embeddings.write(src_oov_final_words + src_vcb_words,
                         np.concatenate([src_oov_emb, src_emb]),
                         out_src_emb_file)
        embeddings.write(tgt_oov_final_words + tgt_vcb_words,
                         np.concatenate([tgt_oov_emb, tgt_emb]),
                         out_tgt_emb_file)
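
compute_fasttext_embeddings is not defined in this snippet; judging from the `cat queries.txt | ./fasttext print-word-vectors model.bin` comment above, it pipes the OOV words through the FastText binary and parses one vector per line. The following is only a plausible sketch of such a helper, not the original implementation:

import subprocess
import numpy as np

def compute_fasttext_embeddings(words, model_path, fasttext_binary_path):
    # Sketch: feed one word per line to `fasttext print-word-vectors` and
    # parse the returned `word v1 ... vd` lines into a word list and a matrix.
    query = '\n'.join(words) + '\n'
    proc = subprocess.run(
        [fasttext_binary_path, 'print-word-vectors', model_path],
        input=query.encode('utf-8'),
        stdout=subprocess.PIPE,
        check=True)
    out_words, vectors = [], []
    for line in proc.stdout.decode('utf-8').splitlines():
        fields = line.rstrip().split(' ')
        if len(fields) < 2:
            continue
        out_words.append(fields[0])
        vectors.append(np.asarray(fields[1:], dtype='float32'))
    return out_words, np.vstack(vectors)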
Ejemplo n.º 21
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')

    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')

    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')

    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability for stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    args = parser.parse_args()

    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.unsupervised:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)

    np.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = np.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = np.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, np.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = np.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = np.arange(sim_size)
        elif args.direction == 'union':
            src_indices = np.concatenate((np.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = np.concatenate((sim.argmax(axis=1), np.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = np.empty_like(x)
    zw = np.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = np.empty((args.batch_size, trg_size), dtype=dtype)
    simbwd = np.empty((args.batch_size, src_size), dtype=dtype)
    if args.validation is not None:
        simval = np.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    best_sim_forward = np.full(src_size, -100, dtype=dtype)
    src_indices_forward = np.arange(src_size)
    trg_indices_forward = np.zeros(src_size, dtype=int)
    best_sim_backward = np.full(trg_size, -100, dtype=dtype)
    src_indices_backward = np.zeros(trg_size, dtype=int)
    trg_indices_backward = np.arange(trg_size)
    knn_sim_fwd = np.zeros(src_size, dtype=dtype)
    knn_sim_bwd = np.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    while True:

        # Increase the keep probability if we have not improved in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it

        # Update the embedding mapping
        if args.orthogonal or not end:  # orthogonal mapping
            u, s, vt = np.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = np.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping

            # TODO xw.dot(wx2, out=xw) and the like are not working
            xw[:] = x
            zw[:] = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                u, s, vt = np.linalg.svd(m, full_matrices=False)
                return vt.T.dot(np.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = np.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(np.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(np.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(np.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(np.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j])
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j])
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = np.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = np.concatenate((trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            if args.direction == 'forward':
                objective = np.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = np.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (np.mean(best_sim_forward) + np.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, zw, trgfile)
    srcfile.close()
    trgfile.close()
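
topk_mean and dropout are used above but not defined in this snippet: topk_mean supplies the CSLS neighbourhood terms (mean similarity of each row's k best candidates) and dropout implements stochastic dictionary induction by discarding candidates with probability 1 - keep_prob. A plausible sketch of both helpers, inferred from their call sites rather than taken from the original code:

import numpy as np

def topk_mean(m, k, inplace=False):
    # Mean of the k largest values in each row of m; `inplace` allows the
    # buffer to be overwritten while extracting successive maxima.
    n = m.shape[0]
    ans = np.zeros(n, dtype=m.dtype)
    if k <= 0:
        return ans
    if not inplace:
        m = m.copy()
    rows = np.arange(n)
    for _ in range(k):
        idx = m.argmax(axis=1)
        ans += m[rows, idx]
        m[rows, idx] = -np.inf       # exclude the current maximum from the next pass
    return ans / k

def dropout(m, p):
    # Zero out entries of m with probability p (p = 1 - keep_prob above),
    # so the argmax over candidates becomes stochastic.
    if p <= 0.0:
        return m
    mask = np.random.rand(*m.shape) >= p
    return m * mask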
Ejemplo n.º 22
0
def share_embedding(words, matrix):
    f = open('D:/TestData/predata/miandian_Position_50.txt',
             mode='w',
             encoding='utf-8',
             errors='surrogateescape')
    embeddings.write(words, matrix, f)
Ejemplo n.º 23
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Generate meta embeddings')
    parser.add_argument('emb1', help='path to embedding 1')
    parser.add_argument('emb2', help='path to embedding 2')
    parser.add_argument('--method',
                        choices=['avg', 'conc'],
                        default=['avg'],
                        type=str,
                        nargs=1,
                        help='meta embedding generation method')
    parser.add_argument('--meta_embeddings_path',
                        default='./',
                        type=str,
                        help='directory to save the output meta embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    parser.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'no'],
        nargs=2,
        default=[],
        help=
        'the normalization actions performed in sequence for embeddings 1 and 2'
    )

    args = parser.parse_args()

    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading embeddings data...')

    emb1file = open(args.emb1,
                    encoding=args.encoding,
                    errors='surrogateescape')
    emb2file = open(args.emb2,
                    encoding=args.encoding,
                    errors='surrogateescape')
    emb1_words, x = embeddings.read(emb1file, max_voc=0, dtype=dtype)
    emb2_words, z = embeddings.read(emb2file, max_voc=0, dtype=dtype)

    if len(args.normalize) > 0:
        x = normalize_emb(x, args.normalize[0])
        z = normalize_emb(z, args.normalize[1])

    emb1 = Embedding(emb1_words, x)
    emb2 = Embedding(emb2_words, z)

    if args.method[0] == "avg":
        meta_emb = avg(emb1, emb2)
    elif args.method[0] == "conc":
        meta_emb = concatenate(emb1, emb2)

    del emb1, emb2
    gc.collect()

    meta_emb_words = []
    meta_emb_vecs = []
    for w, v in meta_emb.word_vec_map.items():
        meta_emb_words += [w]
        meta_emb_vecs += [v]

    del meta_emb
    gc.collect()

    out_emb_fname = os.path.join(args.meta_embeddings_path, 'meta_emb.vec')
    with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
        embeddings.write(meta_emb_words, meta_emb_vecs, outfile)
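
Embedding, avg and concatenate are assumed to be defined elsewhere in this project. A minimal sketch of what the two combination methods might look like, under the assumption that averaging and concatenation are restricted to words present in both input embeddings (the class and function bodies below are illustrative, not the original code):

import numpy as np

class Embedding(object):
    # Sketch: wrap a word list and a matrix into a word -> vector map.
    def __init__(self, words, matrix):
        self.word_vec_map = {w: matrix[i] for i, w in enumerate(words)}

def avg(emb1, emb2):
    # Average the vectors of common words (dimensions assumed to match).
    common = [w for w in emb1.word_vec_map if w in emb2.word_vec_map]
    vecs = np.vstack([(emb1.word_vec_map[w] + emb2.word_vec_map[w]) / 2 for w in common])
    return Embedding(common, vecs)

def concatenate(emb1, emb2):
    # Concatenate the vectors of common words.
    common = [w for w in emb1.word_vec_map if w in emb2.word_vec_map]
    vecs = np.vstack([np.concatenate([emb1.word_vec_map[w], emb2.word_vec_map[w]]) for w in common])
    return Embedding(common, vecs)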
Ejemplo n.º 24
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('emb_file', help='the input target embeddings')
    parser.add_argument(
        '--lang_list',
        default='',
        help=
        'the list of languages listed in the same order as in the input embedding `emb_file` (comma-separated). e.g. "en,es,fr"'
    )
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--model_path',
                        default=None,
                        type=str,
                        help='directory to save the model')
    parser.add_argument(
        '--geomm_embeddings_path',
        default=None,
        type=str,
        help=
        'directory to save the output GeoMM Multi latent space embeddings. The output embeddings are normalized.'
    )

    parser.add_argument(
        '--max_vocab',
        default=0,
        type=int,
        help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')

    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument(
        '-dtrain_file',
        '--dictionary_train_file',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtest_file',
        '--dictionary_test_file',
        default=sys.stdin.fileno(),
        help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group(
        'GeoMM Multi arguments', 'Arguments for GeoMM Multi method')
    geomm_group.add_argument('--l2_reg',
                             type=float,
                             default=1e3,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument(
        '--max_opt_time',
        type=int,
        default=5000,
        help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument(
        '--max_opt_iter',
        type=int,
        default=150,
        help='Maximum number of iterations for optimization')

    eval_group = parser.add_argument_group('evaluation arguments',
                                           'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval',
                            action='store_true',
                            help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size',
                            type=int,
                            default=1000,
                            help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood',
                            type=int,
                            default=10,
                            help='Neighbourhood size for CSLS')

    args = parser.parse_args()

    BATCH_SIZE = args.eval_batch_size
    lang_list = None

    ## Logging
    #method_name = os.path.join('logs','geomm_multi')
    #directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    #if not os.path.exists(directory):
    #    os.makedirs(directory)
    #log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train_file))
    #log_file_name = log_file_name + '.log'
    #class Logger(object):
    #    def __init__(self):
    #        self.terminal = sys.stdout
    #        self.log = open(os.path.join(directory,log_file_name), "a")

    #    def write(self, message):
    #        self.terminal.write(message)
    #        self.log.write(message)

    #    def flush(self):
    #        #this flush method is needed for python 3 compatibility.
    #        #this handles the flush command by doing nothing.
    #        #you might want to specify some extra behavior here.
    #        pass
    #sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'

    if args.verbose:
        print('Loading train data...')
    words = []
    emb = []
    with open(args.emb_file, encoding=args.encoding,
              errors='surrogateescape') as f:
        for line in f:
            srcfile = open(line.strip(),
                           encoding=args.encoding,
                           errors='surrogateescape')
            words_temp, x_temp = embeddings.read(srcfile,
                                                 max_voc=args.max_vocab,
                                                 dtype=dtype)
            words.append(words_temp)
            emb.append(x_temp)

    # Build word to index map
    word2ind = []
    for lang in words:
        word2ind.append({word: i for i, word in enumerate(lang)})

    ##### Set language names

    ## language id map
    if args.lang_list == '':
        lang_list = [str(i) for i in range(len(emb))]
    else:
        lang_list = args.lang_list.split(',')

    # Build training dictionary
    train_pairs = []
    with open(args.dictionary_train_file,
              encoding=args.encoding,
              errors='surrogateescape') as ff:
        for line in ff:
            vals = line.split(',')
            curr_dict = [int(vals[0].strip()), int(vals[1].strip())]
            src_indices = []
            trg_indices = []
            with open(vals[2].strip(),
                      encoding=args.encoding,
                      errors='surrogateescape') as f:
                for line in f:
                    src, trg = line.split()
                    if args.max_vocab:
                        src = src.lower()
                        trg = trg.lower()
                    try:
                        src_ind = word2ind[curr_dict[0]][src]
                        trg_ind = word2ind[curr_dict[1]][trg]
                        src_indices.append(src_ind)
                        trg_indices.append(trg_ind)
                    except KeyError:
                        if args.verbose:
                            print('WARNING: OOV dictionary entry ({0} - {1})'.
                                  format(src, trg),
                                  file=sys.stderr)
            curr_dict.append(src_indices)
            curr_dict.append(trg_indices)
            train_pairs.append(curr_dict)
    if args.verbose:
        print('Normalizing embeddings...')
    # Step 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            for i in range(len(emb)):
                emb[i] = embeddings.length_normalize(emb[i])
        elif action == 'center':
            for i in range(len(emb)):
                emb[i] = embeddings.mean_center(emb[i])
        elif action == 'unitdim':
            for i in range(len(emb)):
                emb[i] = embeddings.length_normalize_dimensionwise(emb[i])
        elif action == 'centeremb':
            for i in range(len(emb)):
                emb[i] = embeddings.mean_center_embeddingwise(emb[i])

    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    mean_size = 0
    for tp in range(len(train_pairs)):
        src_indices = train_pairs[tp][2]
        trg_indices = train_pairs[tp][3]
        x_count = len(set(src_indices))
        z_count = len(set(trg_indices))
        A = np.zeros((x_count, z_count))

        # Creating dictionary matrix from training set
        map_dict_src = {}
        map_dict_trg = {}
        I = 0
        uniq_src = []
        uniq_trg = []
        for i in range(len(src_indices)):
            if src_indices[i] not in map_dict_src.keys():
                map_dict_src[src_indices[i]] = I
                I += 1
                uniq_src.append(src_indices[i])
        J = 0
        for j in range(len(trg_indices)):
            if trg_indices[j] not in map_dict_trg.keys():
                map_dict_trg[trg_indices[j]] = J
                J += 1
                uniq_trg.append(trg_indices[j])

        for i in range(len(src_indices)):
            A[map_dict_src[src_indices[i]], map_dict_trg[trg_indices[i]]] = 1
        train_pairs[tp].append(uniq_src)
        train_pairs[tp].append(uniq_trg)
        train_pairs[tp].append(A)
        mean_size += (len(uniq_src) * len(uniq_trg))
    mean_size = mean_size / len(train_pairs)
    np.random.seed(0)
    Lambda = args.l2_reg

    variables = []
    manif = []
    low_rank = emb[0].shape[1]
    for i in range(len(emb)):
        variables.append(TT.matrix())
        manif.append(Stiefel(emb[i].shape[1], low_rank))
    variables.append(TT.matrix())
    manif.append(PositiveDefinite(low_rank))
    B = variables[-1]
    cost = 0.5 * Lambda * (TT.sum(B**2))
    for i in range(len(train_pairs)):
        x = emb[train_pairs[i][0]]
        z = emb[train_pairs[i][1]]
        U1 = variables[train_pairs[i][0]]
        U2 = variables[train_pairs[i][1]]
        cost = cost + TT.sum(
            ((shared(x[train_pairs[i][4]]).dot(U1.dot(B.dot(U2.T)))).dot(
                shared(z[train_pairs[i][5]]).T) - shared(train_pairs[i][6]))**
            2) / float(len(train_pairs[i][2]))
    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter,
                               mingradnorm=1e-12)
    manifold = Product(manif)
    problem = Problem(manifold=manifold, cost=cost, arg=variables, verbosity=3)
    wopt = solver.solve(problem)
    w = wopt
    ### Save the models if requested
    if args.model_path is not None:
        os.makedirs(args.model_path, exist_ok=True)

        for i in range(len(emb)):
            np.savetxt('{0}/U_{1}.csv'.format(args.model_path, lang_list[i]),
                       wopt[i])

        np.savetxt('{}/B.csv'.format(args.model_path), wopt[-1])

        #with open('{}/lang_id_map.txt'.format(args.model_path),'w',encoding='utf-8') as idmapfile:
        #    for lang in lang_list:
        #        idmapfile.write(lang+'\n')

    # Step 2: Transformation
    Bhalf = scipy.linalg.sqrtm(wopt[-1])
    test_emb = []
    for i in range(len(emb)):
        test_emb.append(emb[i].dot(wopt[i]).dot(Bhalf))

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time -
                                                             start_time))
    gc.collect()

    ### Save the GeoMM embeddings if requested
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path, exist_ok=True)
        for i in range(len(test_emb)):
            out_emb_fname = os.path.join(args.geomm_embeddings_path,
                                         'emb_{0}.vec'.format(lang_list[i]))
            with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
                embeddings.write(words[i],
                                 embeddings.length_normalize(test_emb[i]),
                                 outfile)

    # Step 3: Evaluation
    if args.verbose:
        print('Beginning Evaluation')

    if args.normalize_eval:
        for i in range(len(test_emb)):
            test_emb[i] = embeddings.length_normalize(test_emb[i])

    # Loading test dictionary
    with open(args.dictionary_test_file,
              encoding=args.encoding,
              errors='surrogateescape') as ff:
        for line in ff:
            vals = line.split(',')
            curr_dict = [int(vals[0].strip()), int(vals[1].strip())]
            with open(vals[2].strip(),
                      encoding=args.encoding,
                      errors='surrogateescape') as f:
                src_word2ind = word2ind[curr_dict[0]]
                trg_word2ind = word2ind[curr_dict[1]]
                xw = test_emb[curr_dict[0]]
                zw = test_emb[curr_dict[1]]
                src2trg = collections.defaultdict(set)
                trg2src = collections.defaultdict(set)
                oov = set()
                vocab = set()
                for line in f:
                    src, trg = line.split()
                    if args.max_vocab:
                        src = src.lower()
                        trg = trg.lower()
                    try:
                        src_ind = src_word2ind[src]
                        trg_ind = trg_word2ind[trg]
                        src2trg[src_ind].add(trg_ind)
                        trg2src[trg_ind].add(src_ind)
                        vocab.add(src)
                    except KeyError:
                        oov.add(src)
                src = list(src2trg.keys())
                trgt = list(trg2src.keys())

                oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
                coverage = len(src2trg) / (len(src2trg) + len(oov))
                f.close()

                translation = collections.defaultdict(int)
                translation5 = collections.defaultdict(list)
                translation10 = collections.defaultdict(list)

                t = time.time()
                nbrhood_x = np.zeros(xw.shape[0])
                nbrhood_z = np.zeros(zw.shape[0])
                nbrhood_z2 = cp.zeros(zw.shape[0])
                for i in range(0, len(src), BATCH_SIZE):
                    j = min(i + BATCH_SIZE, len(src))
                    similarities = xw[src[i:j]].dot(zw.T)
                    similarities_x = -1 * np.partition(
                        -1 * similarities, args.csls_neighbourhood - 1, axis=1)
                    nbrhood_x[src[i:j]] = np.mean(
                        similarities_x[:, :args.csls_neighbourhood], axis=1)

                batch_num = 1
                with cp.cuda.Device(1):
                    for i in range(0, zw.shape[0], BATCH_SIZE):
                        j = min(i + BATCH_SIZE, zw.shape[0])
                        similarities = -1 * cp.partition(
                            -1 * cp.dot(cp.asarray(zw[i:j]),
                                        cp.transpose(cp.asarray(xw))),
                            args.csls_neighbourhood - 1,
                            axis=1)[:, :args.csls_neighbourhood]
                        nbrhood_z2[i:j] = (cp.mean(
                            similarities[:, :args.csls_neighbourhood], axis=1))
                        batch_num += 1
                    nbrhood_z = cp.asnumpy(nbrhood_z2)

                for i in range(0, len(src), BATCH_SIZE):
                    j = min(i + BATCH_SIZE, len(src))
                    similarities = xw[src[i:j]].dot(zw.T)
                    similarities = np.transpose(
                        np.transpose(2 * similarities) -
                        nbrhood_x[src[i:j]]) - nbrhood_z
                    nn = similarities.argmax(axis=1).tolist()
                    similarities = np.argsort((similarities), axis=1)

                    nn5 = (similarities[:, -5:])
                    nn10 = (similarities[:, -10:])
                    for k in range(j - i):
                        translation[src[i + k]] = nn[k]
                        translation5[src[i + k]] = nn5[k]
                        translation10[src[i + k]] = nn10[k]
                accuracy = np.mean(
                    [1 if translation[i] in src2trg[i] else 0 for i in src])
                mean = 0
                for i in src:
                    for k in translation5[i]:
                        if k in src2trg[i]:
                            mean += 1
                            break

                mean /= len(src)
                accuracy5 = mean

                mean = 0
                for i in src:
                    for k in translation10[i]:
                        if k in src2trg[i]:
                            mean += 1
                            break

                mean /= len(src)
                accuracy10 = mean
                print(
                    'Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
                    .format(coverage, accuracy, accuracy5, accuracy10))
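The evaluation above retrieves translations with CSLS rather than plain cosine: each similarity is doubled and the mean similarity of the source word to its k nearest target neighbours (r_x) and of the target word to its k nearest source neighbours (r_z) are subtracted. A minimal NumPy sketch of that correction, assuming row-normalized embedding matrices and a small neighbourhood size (all names below are illustrative, not taken from the code above):

import numpy as np

def csls_scores(xw, zw, k=10):
    # Cosine similarities between every source and every target word
    # (rows are assumed to be unit-normalized).
    sims = xw.dot(zw.T)
    # r_x: mean similarity of each source word to its k nearest targets.
    r_x = np.mean(-np.partition(-sims, k - 1, axis=1)[:, :k], axis=1)
    # r_z: mean similarity of each target word to its k nearest sources.
    r_z = np.mean(-np.partition(-sims, k - 1, axis=0)[:k, :], axis=0)
    # CSLS(x, z) = 2*cos(x, z) - r_x - r_z
    return 2 * sims - r_x[:, None] - r_z[None, :]

# csls_scores(xw, zw).argmax(axis=1) gives the CSLS nearest neighbour of each
# source word, which is what the batched evaluation loop above computes.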
Ejemplo n.º 25
0
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    parser.add_argument(
        '--batch_size',
        default=10000,
        type=int,
        help=
        'batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory'
    )
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='the random seed (defaults to 0)')
    parser.add_argument('--maxiter',
                        type=int,
                        default=10,
                        help='max number of iterations')
    parser.add_argument('--corekbest',
                        type=int,
                        default=2,
                        help='treat a pair as a match if it falls within the '
                        'top-k nearest neighbours')
    parser.add_argument('--decayrate',
                        type=float,
                        default=1.01,
                        help='decay rate for the boosting weights')
    parser.add_argument('--init_vocab',
                        type=int,
                        default=10000,
                        help='initial vocabulary size for the boosting schedule')
    parser.add_argument('--dictname',
                        default='dict.tmp',
                        help='basename of the dictionary file written at each '
                        'iteration')

    recommended_type = parser.add_argument_group(
        'recommended settings', 'Recommended settings for different scenarios')
    recommended_type.add_argument(
        '--supervised',
        metavar='DICTIONARY',
        help='recommended if you have a large training dictionary')
    recommended_type.add_argument(
        '--identical',
        default=True,
        help=
        'recommended if you have no seed dictionary but can rely on identical words'
    )

    init_group = parser.add_argument_group(
        'advanced initialization arguments',
        'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument(
        '-d',
        '--init_dictionary',
        default=sys.stdin.fileno(),
        metavar='DICTIONARY',
        help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical',
                           action='store_true',
                           help='use identical words as the seed dictionary')
    init_type.add_argument(
        '--init_numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    init_type.add_argument('--init_unsupervised',
                           action='store_true',
                           help='use unsupervised initialization')
    init_group.add_argument(
        '--unsupervised_vocab',
        type=int,
        default=0,
        help=
        'restrict the vocabulary to the top k entries for unsupervised initialization'
    )

    mapping_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb', 'none'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument(
        '--vocabulary',
        help='a word list file used to restrict the source vocabulary')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u',
                              '--unconstrained',
                              action='store_true',
                              help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group(
        'advanced self-learning arguments',
        'Advanced arguments for self-learning')
    self_learning_group.add_argument(
        '--vocabulary_cutoff',
        type=int,
        default=0,
        help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--csls',
                                     type=int,
                                     nargs='?',
                                     default=0,
                                     const=10,
                                     metavar='NEIGHBORHOOD_SIZE',
                                     dest='csls_neighborhood',
                                     help='use CSLS for dictionary induction')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        metavar='DICTIONARY',
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # Re-parse so that --supervised (if given) becomes the default seed
    # dictionary and the standard normalization chain is used by default.
    parser.set_defaults(init_dictionary=args.supervised,
                        normalize=['unit', 'center', 'unit'])
    args = parser.parse_args()
    print(args, file=sys.stderr)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    os.makedirs(OUTPUTDIR, exist_ok=True)

    # Read input embeddings
    vocabulary = None
    if args.vocabulary is not None:
        vocabulary = set()
        with open(args.vocabulary,
                  encoding=args.encoding,
                  errors='surrogateescape') as file:
            for l in file:
                vocabulary.add(l.split()[0])
        print(f'vocab size:\t{len(vocabulary)}')

    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile,
                                       dtype=dtype,
                                       threshold=args.vocabulary_cutoff,
                                       vocabulary=vocabulary)
        trg_words, z = embeddings.read(trgfile,
                                       dtype=dtype,
                                       threshold=args.vocabulary_cutoff)
        embeddings.normalize(x, args.normalize)
        embeddings.normalize(z, args.normalize)
    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build the seed dictionary
    src_indices = []
    trg_indices = []

    if args.supervised:
        f = open(args.init_dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            try:
                src, trg = line.split()[:2]
            except ValueError:
                continue
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        print('reading validation', file=sys.stderr)
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            try:
                src, trg = line.split()
            except ValueError:
                continue
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)

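    # Seed the match counter with the supervised dictionary pairs (if any)
    # and with words whose surface form is identical in both languages.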
    matches = collections.Counter()
    decided = collections.Counter()
    cum_weights = collections.Counter(matches)
    score = collections.Counter()
    for p in zip(src_indices, trg_indices):
        matches[p] = 1
        decided[p] = 1
    identical = set(src_words).intersection(set(trg_words))
    for word in list(identical):
        p = (src_word2ind[word], trg_word2ind[word])
        matches[p] = 1
        decided[p] = 1

    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    # Training loop
    it = 1
    t = time.time()
    wprev = 0
    current_vocab = args.init_vocab
    Stats = collections.namedtuple(
        'MatchStats',
        ['w_dot', 'mean_dot', 'delta_w', 'current_vocab', 'len_match'])
    pstats = None
    stats = None
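    # Self-learning loop: re-estimate the mapping from the current weighted
    # matches, then re-induce matches with the new mapping, up to maxiter.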
    while True:
        src_indices, trg_indices, weights = flatten_match(matches, matches)
        # x, z = np.array(x0), np.array(z0)

        embeddings.noise(x)
        embeddings.noise(z)

        if args.unconstrained:
            w = np.linalg.lstsq(np.sqrt(weights) * x[src_indices],
                                np.sqrt(weights) * z[trg_indices],
                                rcond=None)[0]
            # w = np.linalg.lstsq(x[src_indices], z[trg_indices], rcond=None)[0]
            x.dot(w, out=xw)
            zw = z[:]
        else:
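            # Weighted orthogonal Procrustes: the SVD of the weighted
            # cross-covariance gives the optimal rotation w.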
            u, s, vt = xp.linalg.svd(
                (weights * z[trg_indices]).T.dot(x[src_indices]))
            # u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw = z[:]
            w_dot = np.sum(
                weights * z[trg_indices] * xw[src_indices]) / weights.sum()
            mean_dot = np.sum(
                z[trg_indices] * xw[src_indices]) / len(src_indices)
            delta_w = np.linalg.norm(w - wprev)
            stats = Stats(w_dot=w_dot,
                          mean_dot=mean_dot,
                          delta_w=delta_w,
                          current_vocab=current_vocab,
                          len_match=len(src_indices))

        if (it > 1 and stats is not None and pstats is not None
                and stats.w_dot < pstats.w_dot):
            current_vocab = min(int(current_vocab * 1.1),
                                args.vocabulary_cutoff)

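        # Annealing temperature: decays geometrically from 1 towards 1e-2
        # over the course of maxiter iterations.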
        T = 1 * np.exp((it - 1) * np.log(1e-2) / (args.maxiter))
        # T = 1
        score = collections.Counter()
        cum_weights = collections.Counter()
        matches, objective = find_matches(xw,
                                          zw,
                                          cum_weights,
                                          score,
                                          ul=current_vocab,
                                          T=T,
                                          kbest=args.corekbest,
                                          csls=args.csls_neighborhood,
                                          decay=args.decayrate)

        # Decay previously decided pairs, then blend in this iteration's match
        # scores with an exponential moving average (learning rate 1 / it).
        for m in decided:
            decided[m] = decided[m] * (1 - 1 / it)

        for m in score:
            eta = 1 / it
            decided[m] = decided[m] * (1 - eta) + score[m] * eta

        # Accuracy and similarity evaluation in validation
        if args.validation is not None:
            src = list(validation.keys())
            xw[src].dot(zw.T, out=simval)
            nn = asnumpy(simval.argmax(axis=1))
            accuracy = np.mean([
                1 if nn[i] in validation[src[i]] else 0
                for i in range(len(src))
            ])
            similarity = np.mean([
                np.max([simval[i, j].tolist() for j in validation[src[i]]])
                for i in range(len(src))
            ])

        with open(f'{OUTPUTDIR}/{args.dictname}.{it}', mode='w') as f:
            for p in decided.most_common():
                si, ti = p[0]
                print(f'{src_words[si]}\t{trg_words[ti]}\t{p[1]:.3e}', file=f)

        # Logging
        duration = time.time() - t

        if args.verbose:
            print(file=sys.stderr)
            print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                  file=sys.stderr)
            print('\t- Objective:        {0:9.4f}%'.format(100 * objective),
                  file=sys.stderr)
            print(
                f'\t- #match/#decided:             {len(src_indices)}/{len(decided)}',
                file=sys.stderr)
            print(stats, file=sys.stderr)
            if args.validation is not None:
                print('\t- Val. similarity:  {0:9.4f}%'.format(100 *
                                                               similarity),
                      file=sys.stderr)
                print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy),
                      file=sys.stderr)
                print('\t- Val. coverage:    {0:9.4f}%'.format(
                    100 * validation_coverage),
                      file=sys.stderr)
            sys.stderr.flush()
        if args.log is not None:
            val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                100 * similarity, 100 * accuracy, 100 *
                validation_coverage) if args.validation is not None else ''
            print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val,
                                                      duration),
                  file=log)
            log.flush()

        if it >= args.maxiter:
            break
        t = time.time()
        wprev = w
        pstats = stats
        it += 1

    # write mapped embeddings
    print('**** reading and writing final embeddings ****', file=sys.stderr)
    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=100000)
        trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=100000)

    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
        embeddings.write(src_words, x.dot(w), srcfile)
        embeddings.write(trg_words, z, trgfile)
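The orthogonal branch of the training loop in this example solves a weighted Procrustes problem: it finds the rotation that best aligns the weighted source vectors of the current matches with their target counterparts, obtained from an SVD of the weighted cross-covariance. A self-contained sketch of that single step (function and variable names are illustrative, not part of the code above):

import numpy as np

def weighted_procrustes(X, Z, weights):
    """Return an orthogonal W minimizing sum_i weights[i] * ||X[i] @ W - Z[i]||^2."""
    # Weighted cross-covariance between matched target and source vectors.
    m = (weights[:, None] * Z).T.dot(X)
    u, s, vt = np.linalg.svd(m)
    # Optimal rotation, so that X.dot(W) is as close as possible to Z.
    return vt.T.dot(u.T)

# Toy usage:
# X, Z = np.random.randn(100, 50), np.random.randn(100, 50)
# W = weighted_procrustes(X, Z, np.ones(100))
# X.dot(W) is now rotated towards Z; with uniform weights this reduces to the
# standard orthogonal Procrustes solution used for embedding mapping.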