def train_translation_matrix(source_file, target_file, dict_file, out_file):
    """Trains a translation matrix between the source and target languages, using the words in dict_file as anchor
    points and writing the translation matrix to out_file

    Note that the source language file and target language file must be in the word2vec C ASCII format

    :param source_file: The name of the source language file
    :param target_file: The name of the target language file
    :param dict_file: The name of the file with the bilingual dictionary
    :param out_file: The name of the file to write the translation matrix to (written with ``np.savetxt``)
    """
    log.info("Reading the training data")
    train_data = read_dict(dict_file)

    # We only need to load the vectors for the words in the training data;
    # the semantic spaces contain additional words.
    source_words, target_words = zip(*train_data)

    # Logger arguments are passed lazily so that messages (in particular the
    # potentially large word maps below) are only formatted when the level is
    # actually enabled.
    log.info("Reading: %s", source_file)
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    log.info("Reading: %s", target_file)
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    log.debug('Words in the source space: %s', source_sp.row2id)
    log.debug('Words in the target space: %s', target_sp.row2id)

    log.info("Learning the translation matrix")
    log.info("Training data: %s", train_data)
    tm = train_tm(source_sp, target_sp, train_data)

    log.info("Printing the translation matrix")
    np.savetxt(out_file, tm)
Example #2
0
def train_wrapper(seed_fn, source_fn, target_fn, reverse=False, mx_path=None,
                  train_size=5000):
    """Train a translation matrix from a seed dictionary and save it to disk.

    Two files are written, both named after the path returned by
    ``default_output_fn``: ``<path>.npy`` holds the matrix and
    ``<path>.train_wds`` holds the pickled second return value of ``train_tm``.

    :param seed_fn: name of the seed dictionary file, read with ``read_dict``
    :param source_fn: name of the source language vector file
    :param target_fn: name of the target language vector file
    :param reverse: forwarded to ``read_dict``
    :param mx_path: output path, passed through ``default_output_fn`` together
        with the three input file names
    :param train_size: forwarded to ``train_tm``
    :return: the ``(tm, used_for_train)`` pair returned by ``train_tm``
    """
    logging.info("Training...")
    seed_trans = read_dict(seed_fn, reverse=reverse)

    # We only need to load the vectors for the words in the training data;
    # the semantic spaces contain additional words.
    # Iterating the mapping / calling .values() works on both Python 2 and 3
    # (the previous iterkeys()/itervalues() calls exist only on Python 2).
    source_words = set(seed_trans)
    target_words = set().union(*seed_trans.values())

    source_sp = Space.build(source_fn, lexicon=source_words)
    source_sp.normalize()

    target_sp = Space.build(target_fn, lexicon=target_words)
    target_sp.normalize()

    logging.info("Learning the translation matrix")
    tm, used_for_train = train_tm(source_sp, target_sp, seed_trans, train_size)

    mx_path = default_output_fn(mx_path, seed_fn, source_fn, target_fn)
    logging.info("Saving the translation matrix to %s", mx_path)
    np.save('{}.npy'.format(mx_path), tm)
    # Pickle data is a byte stream: open in binary mode, and use a context
    # manager so the handle is flushed and closed even if dumping fails.
    with open('{}.train_wds'.format(mx_path), mode='wb') as train_wds_file:
        pickle.dump(used_for_train, train_wds_file)

    return tm, used_for_train
Example #3
0
def _build_normalized_space(vector_file, words):
    """Log the file being read, build a Space restricted to *words*, and normalize it."""
    log.info("Reading: %s", vector_file)
    space = Space.build(vector_file, set(words))
    space.normalize()
    return space


def train_translation_matrix(source_file, target_file, dict_file, out_file):
    """Trains a translation matrix between the source and target languages, using the words in dict_file as anchor
    points and writing the translation matrix to out_file

    Note that the source language file and target language file must be in the word2vec C ASCII format

    :param source_file: The name of the source language file
    :param target_file: The name of the target language file
    :param dict_file: The name of the file with the bilingual dictionary
    :param out_file: The name of the file to write the translation matrix to
    """
    log.info("Reading the training data")
    train_data = read_dict(dict_file)

    # Only the vectors of words that occur in the training data are needed,
    # even though the semantic spaces contain additional words.
    source_words, target_words = zip(*train_data)

    source_sp = _build_normalized_space(source_file, source_words)
    target_sp = _build_normalized_space(target_file, target_words)

    # Arguments are handed to the logger unformatted, so the word maps are not
    # converted to strings unless DEBUG output is actually enabled.
    log.debug('Words in the source space: %s', source_sp.row2id)
    log.debug('Words in the target space: %s', target_sp.row2id)

    log.info("Learning the translation matrix")
    log.info("Training data: %s", train_data)
    tm = train_tm(source_sp, target_sp, train_data)

    log.info("Printing the translation matrix")
    np.savetxt(out_file, tm)
Example #4
0
def main(sys_argv):
    """Command-line entry point: train a translation matrix and save it as text.

    Expects three positional arguments, in this order: the dictionary file,
    the source space file and the target space file. The matrix is written to
    ``<output>.txt`` where ``<output>`` defaults to ``./tm`` and can be set
    with ``-o``/``--output``. ``-h``/``--help`` calls ``usage(0)``.

    :param sys_argv: the full argument vector, including the program name at
        index 0 (it is skipped when parsing)
    """
    try:
        opts, argv = getopt.getopt(sys_argv[1:], "ho:", ["help", "output="])
    except getopt.GetoptError as err:
        print(str(err))
        # Pass the error status explicitly, like every other failure path here.
        # NOTE(review): assumes usage()'s argument is an exit status - confirm.
        usage(1)
        sys.exit(1)

    out_file = "./tm"
    for opt, val in opts:
        if opt in ("-o", "--output"):
            out_file = val
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            usage(1)

    if len(argv) != 3:
        # The previous code printed `err` here, which is not bound outside the
        # except clause above and raised NameError instead of reporting usage.
        print("Expected 3 positional arguments (dict_file source_file "
              "target_file), got %d" % len(argv))
        usage(1)
        # Guarantee termination even if usage() returns, so the names below
        # are never used unbound.
        sys.exit(1)

    dict_file, source_file, target_file = argv

    print("Reading the training data")
    train_data = read_dict(dict_file)

    # We only need to load the vectors for the words in the training data;
    # the semantic spaces contain additional words.
    source_words, target_words = zip(*train_data)

    print("Reading: %s" % source_file)
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    print("Learning the translation matrix")
    tm = train_tm(source_sp, target_sp, train_data)

    print("Printing the translation matrix")
    np.savetxt("%s.txt" % out_file, tm)
Example #5
0
        target_file = argv[2]
        dict_file = argv[0]
    else:
        print str(err)
        usage(1)

    print "Reading the training data"
    train_data = read_dict(dict_file)
    print train_data
    #we only need to load the vectors for the words in the training data
    #semantic spaces contain additional words
    source_words, target_words = zip(*train_data)

    print "Reading: %s" % source_file
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    print "Reading: %s" % target_file
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    print "Learning the translation matrix"
    tm = train_tm(source_sp, target_sp, train_data)

    print "Printing the translation matrix"
    np.savetxt("%s.txt" % out_file, tm)


# Run the command-line entry point only when this file is executed as a
# script, handing it the full argument vector (program name included).
if __name__ == '__main__':
    main(sys.argv)
Example #6
0
	print str(err)
	usage(1)


    print "Reading the training data"
    train_data = read_dict(dict_file)

    #we only need to load the vectors for the words in the training data
    #semantic spaces contain additional words
    source_words, target_words = zip(*train_data)

    print "Reading: %s" % source_file
    source_sp = Space.build(source_file, set(source_words))
    source_sp.normalize()

    print "Reading: %s" % target_file
    target_sp = Space.build(target_file, set(target_words))
    target_sp.normalize()

    print "Learning the translation matrix"
    print "Training data: %s" % str(train_data)
    tm = train_tm(source_sp, target_sp, train_data)

    print "Printing the translation matrix"
    np.savetxt("%s.txt" % out_file, tm)


# Run the command-line entry point only when this file is executed as a
# script, handing it the full argument vector (program name included).
if __name__ == '__main__':
    main(sys.argv)