def translate_language(source_file, target_file, translation_matrix_file, output_file, num_translations):
    """Translates the language in the source file with the matrix in the translation_matrix_file, then searches the
    language in the target file for the closest words to each word in the source language. The translations are
    written one word per line to the output_file. The format of each line is as follows:

    [source word]: [target word 1] ([target word 1 probability]), [target word 2] ([target word 2 probability]) ...

    The source word is the first word on each line. The source word is followed by a colon. Then comes the first target
    word, followed by its probability in parenthesis, then the second possible translation, followed by its probability
    in parenthesis, and so on for however many possible translations you generate for each word.

    The probability of each translation is given by the cosine distance between the source word and the possible
    translation
    """

    log.info('Reading in source language')
    source_sp = Space.build(source_file)

    log.info('Reading in the target language')
    target_sp = Space.build(target_file)

    log.info('Reading in translation matrix')
    tm = np.loadtxt(translation_matrix_file)

    # Normalize both spaces so nearest-neighbour lookups behave like cosine similarity
    source_sp.normalize()
    target_sp.normalize()

    log.info('Transforming into target space')
    mapped_source_sp = apply_tm(source_sp, tm)

    # word -> candidate translations; a plain dict suffices because each key is
    # assigned exactly once (the original defaultdict(set) was never used as one)
    gold = dict()
    count = 0

    # Go through all the words in the space's index, getting their closest x equivalents from the target space
    for word, idx in mapped_source_sp.row2id.iteritems():
        log.debug('Translating word %s' % word)
        word_embedding = mapped_source_sp.mat[idx]

        closest_words = target_sp.get_closest_words(word_embedding, num_translations)

        gold[word] = closest_words
        log.debug('Possible translations: %s' % closest_words)

        count += 1
        if count % 500 == 0:
            log.debug('Translated %d words' % count)

    log.info('Translated all words into the target language')

    # BUG FIX: the old code wrote '\n'.join(gold), which dumps only the source
    # words and silently drops every translation. Emit the documented
    # "source: target1 (p1), target2 (p2) ..." format instead.
    # NOTE(review): assumes get_closest_words returns (word, score) pairs —
    # confirm against Space.get_closest_words
    with open(output_file, 'w') as f:
        for word, translations in gold.iteritems():
            formatted = ', '.join('%s (%s)' % (t, score) for t, score in translations)
            f.write('%s: %s\n' % (word, formatted))
def run_spanish_test():
    """Runs the translation algorithm on the Spanish corpus, then evaluates the translations

    Work is fanned out to one worker *process* per CPU core (via
    multiprocessing.Process); each worker reports its results back through its
    own multiprocessing.Queue.
    """
    # Regenerating the Spanish model is a one-time step; left disabled
    # __train_model('../corpa/spanish/evaluation-model.w2v')
    # log.info('Generated Spanish model')

    # Cap the Spanish space at 7000 lower-cased words; English loads in full
    spanish_space = Space.build('../corpa/spanish/model-ascii.w2v',
                                total_count=7000,
                                only_lower=True)
    english_space = Space.build('../corpa/english/model-ascii.w2v')

    # Normalize both spaces first — presumably so downstream similarity is a
    # plain dot product; verify against train_tm/apply_tm
    spanish_space.normalize()
    english_space.normalize()

    spanish_vocab = spanish_space.row2id.keys()
    english_vocab = english_space.row2id.keys()
    log.info('Read in Spanish and English vocabularies')

    # 1000 anchor (seed) word pairs are used to fit the translation matrix
    anchor_words = __get_anchor_words(spanish_vocab, english_vocab, 1000)
    log.info('Generated anchor words')

    tm = train_tm(spanish_space, english_space, anchor_words)
    log.info('Generated translation matrix')

    translated_spanish_space = apply_tm(spanish_space, tm)
    log.info('Translated Spanish word vectors')

    # Timing notes from earlier runs:
    #   cores * 2 workers: ~0:01:18 per word (~0.2 words/s)
    #   cores workers:     ~0:01:00 per word (~0.1333 words/s)
    #   cores / 2 workers: ~30 s per word — roughly equal to the above
    num_processes = multiprocessing.cpu_count()
    result_queues = list()  # One result queue per worker process
    processes = list()

    # Pair each Spanish word with its mapped (English-space) embedding
    spanish_words = list()
    for word, idx in translated_spanish_space.row2id.iteritems():
        spanish_words.append((word, translated_spanish_space.mat[idx]))

    # Full-corpus slice size would be 7000 / num_processes; the dead assignment
    # that computed it was removed. TODO: restore it when testing is done.
    step = 10  # Use a small amount of data to test things

    # Start one worker process per core, each translating its own slice
    shuffle(spanish_words)
    for i in range(num_processes):
        single_gold = multiprocessing.Queue()
        result_queues.append(single_gold)

        process = multiprocessing.Process(
            target=run,
            args=(spanish_words[i * step:(i + 1) * step], english_space,
                  single_gold))
        process.start()
        processes.append(process)

    log.info('Started %d processes; each one is responsible for %d words' %
             (num_processes, step))

    # Wait until all workers finish.
    # NOTE(review): joining before draining the queues can deadlock if a worker
    # queues more data than the pipe buffer holds — confirm run()'s output size
    for process in processes:
        process.join()

    # Drain every worker queue into a single word -> translations dict
    gold = dict()
    for single_gold in result_queues:
        while not single_gold.empty():
            word, translations = single_gold.get()
            gold[word] = translations

    log.info('Found English translations')

    log.info('Loading English word2vec model')
    english_model = Word2Vec.load_word2vec_format(
        '../corpa/english/GoogleNews-vectors-negative300.bin', binary=True)
    log.info('English model loaded')
    evaluations = __evaluate_translation(gold, english_model)
    log.info('Evaluated translations')

    # What data do I want? I want to know the most likely translation for each
    # word, and I want to print those to a csv file or something
    __print_most_likely_translations(evaluations, gold)
    log.info('Wrote translations to a file')
# NOTE(review): the lines "Esempio n. 3" and "0" that appeared here were
# scraping artifacts, not valid Python; neutralized into this comment so the
# module parses. The definition that follows duplicates run_spanish_test above.
def run_spanish_test():
    """Runs the translation algorithm on the Spanish corpus, then evaluates the translations

    A lot of the comments and variable names in this function are inaccurate. They say 'thread' when they should say
    'process'. Any future maintainers should take heed of this and shouldn't actually expect threads

    NOTE(review): this function is a near byte-for-byte duplicate of the
    run_spanish_test defined earlier in this file; being defined later, it
    shadows that earlier definition at import time. One copy should be deleted.
    """
    # Regenerating the Spanish model is a one-time step; left disabled
    # __train_model('../corpa/spanish/evaluation-model.w2v')
    # log.info('Generated Spanish model')

    # Cap the Spanish space at 7000 lower-cased words; English loads in full
    spanish_space = Space.build('../corpa/spanish/model-ascii.w2v', total_count=7000, only_lower=True)
    english_space = Space.build('../corpa/english/model-ascii.w2v')

    # Normalizing both spaces first — presumably so downstream similarity
    # reduces to a dot product; verify against train_tm/apply_tm
    spanish_space.normalize()
    english_space.normalize()

    spanish_vocab = spanish_space.row2id.keys()
    english_vocab = english_space.row2id.keys()
    log.info('Read in Spanish and English vocabularies')

    # 1000 anchor (seed) word pairs are used to fit the translation matrix
    anchor_words = __get_anchor_words(spanish_vocab, english_vocab, 1000)
    log.info('Generated anchor words')

    tm = train_tm(spanish_space, english_space, anchor_words)
    log.info('Generated translation matrix')

    translated_spanish_space = apply_tm(spanish_space, tm)
    log.info('Translated Spanish word vectors')

    # cores * 2 threads takes about 0:01:18 per word. That's about 0.2 words per second
    # cores threads takes about 0:01:00 per word. That's about 0.1333 words per second
    # cores / 2 threads takes about 30 seconds per word. Maybe a little less. That's about equal to the above
    num_threads = multiprocessing.cpu_count()
    thread_results = list()     # List to hold all the results that each thread comes up with
    threads = list()

    # Pair each Spanish word with its mapped (English-space) embedding
    spanish_words = list()
    for word, idx in translated_spanish_space.row2id.iteritems():
        spanish_words.append((word, translated_spanish_space.mat[idx]))

    # The first assignment is dead code: it is unconditionally overwritten by
    # the debug value on the next line
    step = 7000 / num_threads
    step = 10    # Use a small amount of data to test things
    # Start a bunch of threads to handle translating different parts of the Spanish space
    shuffle(spanish_words)
    for i in range(num_threads):
        # Each worker gets its own queue to report (word, translations) pairs
        single_gold = multiprocessing.Queue()
        thread_results.append(single_gold)

        thread = multiprocessing.Process(target=run, args=(spanish_words[i * step:(i + 1) * step], english_space, single_gold))
        thread.start() 
        threads.append(thread)

    log.info('Started %d threads; each one is responsible for %d words' % (num_threads, step))

    # Wait until all threads finish
    # NOTE(review): joining before draining the queues can deadlock if a worker
    # queues more data than the pipe buffer holds — confirm run()'s output size
    for thread in threads:
        thread.join()

    # Acquire ALL the translations!

    gold = dict()

    # Drain every worker queue into a single word -> translations dict
    for single_gold in thread_results:
        while not single_gold.empty():
            word, translations = single_gold.get()
            gold[word] = translations

    log.info('Found English translations')

    log.info('Loading English word2vec model')
    english_model = Word2Vec.load_word2vec_format('../corpa/english/GoogleNews-vectors-negative300.bin', binary=True)
    log.info('English model loaded')
    evaluations = __evaluate_translation(gold, english_model)
    log.info('Evaluated translations')

    # What data do I want? I want to know the most likely translation for each word, and I want to print those to a
    # csv file or something
    __print_most_likely_translations(evaluations, gold)
    log.info('Wrote translations to a file')