def translate_language(source_file, target_file, translation_matrix_file, output_file, num_translations):
    """Translates the language in the source file with the matrix in the translation_matrix_file, then searches the
    language in the target file for the closest words to each word in the source language. The translations are
    written one word per line to the output_file.

    The format of each line is as follows:

        [source word]: [target word 1] ([target word 1 probability]), [target word 2] ([target word 2 probability]) ...

    The source word is the first word on each line, followed by a colon. Then comes the first target word, followed
    by its probability in parentheses, then the second possible translation, followed by its probability in
    parentheses, and so on for however many possible translations you generate for each word. The probability of
    each translation is given by the cosine distance between the source word and the possible translation
    """
    log.info('Reading in source language')
    source_sp = Space.build(source_file)

    log.info('Reading in the target language')
    target_sp = Space.build(target_file)

    log.info('Reading in translation matrix')
    tm = np.loadtxt(translation_matrix_file)

    source_sp.normalize()
    target_sp.normalize()

    log.info('Transforming into target space')
    mapped_source_sp = apply_tm(source_sp, tm)

    gold = dict()
    count = 0

    # Go through all the words in the space's index, getting their closest x equivalents from the target space
    for word, idx in mapped_source_sp.row2id.iteritems():
        log.debug('Translating word %s' % word)
        word_embedding = mapped_source_sp.mat[idx]
        closest_words = target_sp.get_closest_words(word_embedding, num_translations)
        gold[word] = closest_words
        log.debug('Possible translations: %s' % closest_words)

        count += 1
        if count % 500 == 0:
            log.debug('Translated %d words' % count)

    log.info('Translated all words into the target language')

    # Write each source word and its candidate translations in the format described in the docstring. Joining the
    # dict directly would write only the source words, so format each line explicitly (this assumes
    # get_closest_words returns (word, score) pairs)
    with open(output_file, 'w') as f:
        for word, translations in gold.iteritems():
            f.write('%s: %s\n' % (word, ', '.join('%s (%s)' % (t, p) for t, p in translations)))
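
# A minimal sketch of how translate_language might be invoked, and the kind of output it produces. The matrix and
# output paths, the num_translations value, and the similarity scores shown are all hypothetical:
#
#     translate_language('../corpa/spanish/model-ascii.w2v', '../corpa/english/model-ascii.w2v',
#                        '../corpa/tm.txt', '../corpa/translations.txt', num_translations=3)
#
# would write lines to ../corpa/translations.txt such as:
#
#     perro: dog (0.83), puppy (0.79), hound (0.74)
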
def run_spanish_test():
    """Runs the translation algorithm on the Spanish corpus, then evaluates the translations

    The translation work is split across multiprocessing.Process workers, not threads, so the chunks of the Spanish
    space are translated in parallel across cores
    """
    # __train_model('../corpa/spanish/evaluation-model.w2v')
    # log.info('Generated Spanish model')

    spanish_space = Space.build('../corpa/spanish/model-ascii.w2v', total_count=7000, only_lower=True)
    english_space = Space.build('../corpa/english/model-ascii.w2v')

    spanish_space.normalize()
    english_space.normalize()

    spanish_vocab = spanish_space.row2id.keys()
    english_vocab = english_space.row2id.keys()
    log.info('Read in Spanish and English vocabularies')

    anchor_words = __get_anchor_words(spanish_vocab, english_vocab, 1000)
    log.info('Generated anchor words')

    tm = train_tm(spanish_space, english_space, anchor_words)
    log.info('Generated translation matrix')

    translated_spanish_space = apply_tm(spanish_space, tm)
    log.info('Translated Spanish word vectors')

    # Timings below were measured on an eight-core machine; the words-per-second figures are aggregate throughput
    # across all workers:
    # cores * 2 processes take about 0:01:18 per word each, or about 0.2 words per second overall
    # cores processes take about 0:01:00 per word each, or about 0.133 words per second overall
    # cores / 2 processes take about 30 seconds per word each, maybe a little less. That's about equal to the above
    num_processes = multiprocessing.cpu_count()
    process_results = list()  # List of queues holding the results that each process comes up with
    processes = list()

    spanish_words = list()
    for word, idx in translated_spanish_space.row2id.iteritems():
        spanish_words.append((word, translated_spanish_space.mat[idx]))

    step = 7000 / num_processes
    step = 10  # Use a small amount of data to test things

    # Start a bunch of processes to handle translating different parts of the Spanish space
    shuffle(spanish_words)
    for i in range(num_processes):
        single_gold = multiprocessing.Queue()
        process_results.append(single_gold)
        process = multiprocessing.Process(target=run,
                                          args=(spanish_words[i * step:(i + 1) * step], english_space, single_gold))
        process.start()
        processes.append(process)
    log.info('Started %d processes; each one is responsible for %d words' % (num_processes, step))

    # Wait until all processes finish
    for process in processes:
        process.join()

    # Acquire ALL the translations!
    gold = dict()
    for single_gold in process_results:
        while not single_gold.empty():
            word, translations = single_gold.get()
            gold[word] = translations
    log.info('Found English translations')

    log.info('Loading English word2vec model')
    english_model = Word2Vec.load_word2vec_format('../corpa/english/GoogleNews-vectors-negative300.bin', binary=True)
    log.info('English model loaded')

    evaluations = __evaluate_translation(gold, english_model)
    log.info('Evaluated translations')

    # Report the most likely translation for each word by writing it to a file
    __print_most_likely_translations(evaluations, gold)
    log.info('Wrote translations to a file')
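
# run() is the worker that each process executes; it isn't defined in this section. A minimal sketch of what it is
# assumed to do, given how it's called above: consume (word, embedding) pairs, look up the closest English words,
# and put (word, translations) pairs on the shared queue. The name num_translations and its value of 5 are assumed:
#
#     def run(words, english_space, results_queue):
#         """Translates a chunk of Spanish words, putting (word, translations) pairs on the results queue"""
#         num_translations = 5
#         for word, embedding in words:
#             translations = english_space.get_closest_words(embedding, num_translations)
#             results_queue.put((word, translations))
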