def sent_translation(self, to_log):
    """
    Score sentence translation retrieval and add the results to `to_log`.

    Only available on Europarl, for en - {de, es, fr, it} language pairs.
    The Europarl data is loaded on first use and kept on the instance as
    `self.europarl_data`. When no data is available for the language pair,
    the method returns without logging anything.

    For each retrieval method, both directions are evaluated and every
    (name, value) pair returned by `get_sent_translation_accuracy` is stored
    under 'tgt_to_src_<name>-<method>' or 'src_to_tgt_<name>-<method>'.
    """
    src_lang = self.src_dico.lang
    tgt_lang = self.tgt_dico.lang

    # evaluation sizes
    n_keys, n_queries, n_idf = 200000, 2000, 300000

    # load the corpus once, reuse it on later calls
    if not hasattr(self, 'europarl_data'):
        self.europarl_data = load_europarl_data(
            src_lang, tgt_lang, n_max=(n_keys + 2 * n_idf)
        )

    corpus = self.europarl_data
    if not corpus:
        # nothing to evaluate on for this language pair
        return

    # source embeddings go through the mapping, target embeddings are used as is
    mapped_src = self.mapping(self.src_emb.weight).data
    plain_tgt = self.tgt_emb.weight.data

    # idf weights
    idf = get_idf(corpus, src_lang, tgt_lang, n_idf=n_idf)

    src_side = (src_lang, self.src_dico.word2id, mapped_src)
    tgt_side = (tgt_lang, self.tgt_dico.word2id, plain_tgt)

    # (key template, first side, second side), in the order they are evaluated
    directions = (
        ('tgt_to_src_%s-%s', src_side, tgt_side),  # source <- target
        ('src_to_tgt_%s-%s', tgt_side, src_side),  # target <- source
    )

    for method in ('nn', 'csls_knn_10'):
        for template, first, second in directions:
            scores = get_sent_translation_accuracy(
                corpus, *first, *second,
                n_keys=n_keys, n_queries=n_queries, method=method, idf=idf
            )
            to_log.update(
                [(template % (name, method), value) for name, value in scores]
            )
def sent_translation(self, to_log):
    """
    Run the sentence translation evaluation and record its scores.

    Only available on Europarl, for en - {de, es, fr, it} language pairs.
    Results are written into `to_log`; the method itself returns None. If
    there is no Europarl data for the current language pair, it returns
    before evaluating anything.
    """
    lg1, lg2 = self.src_dico.lang, self.tgt_dico.lang

    # parameters
    n_idf = 300000
    shared = {'n_keys': 200000, 'n_queries': 2000}

    # load europarl data (cached on the instance after the first call)
    if not hasattr(self, 'europarl_data'):
        self.europarl_data = load_europarl_data(
            lg1, lg2, n_max=(shared['n_keys'] + 2 * n_idf)
        )

    # if no Europarl data for this language pair
    data = self.europarl_data
    if not data:
        return

    # mapped source embeddings / target embeddings
    emb1 = self.mapping(self.src_emb.weight).data
    emb2 = self.tgt_emb.weight.data

    # idf weights are shared by every method and both directions
    shared['idf'] = get_idf(data, lg1, lg2, n_idf=n_idf)

    def record(template, method, results):
        # one log entry per (name, value) pair of an evaluation result
        to_log.update([(template % (k, method), v) for k, v in results])

    for method in ['nn', 'csls_knn_10']:

        # source <- target sentence translation
        backward = get_sent_translation_accuracy(
            data,
            lg1, self.src_dico.word2id, emb1,
            lg2, self.tgt_dico.word2id, emb2,
            method=method, **shared
        )
        record('tgt_to_src_%s-%s', method, backward)

        # target <- source sentence translation
        forward = get_sent_translation_accuracy(
            data,
            lg2, self.tgt_dico.word2id, emb2,
            lg1, self.src_dico.word2id, emb1,
            method=method, **shared
        )
        record('src_to_tgt_%s-%s', method, forward)
def sent_translation(self, to_log):
    """
    Evaluation on sentence translation, using BUCC data.

    The sentence data is loaded with `load_bucc_data` and the gold labels
    with `load_bucc_labels` for the split given by `self.params.split`.
    Both are loaded on the first call only and kept on the instance (the
    data under the attribute name `europarl_data`, the labels under `gold`).

    For every retrieval method, both directions are evaluated and each
    (name, value) pair of the result is added to `to_log` under
    'tgt_to_src_<name>-<method>' or 'src_to_tgt_<name>-<method>'.
    On the 'test' split, the predictions of both directions are additionally
    handed to `self.to_file`.

    Returns None. Returns early, without logging, when no data is available
    for the language pair.
    """
    lg1 = self.src_dico.lang
    lg2 = self.tgt_dico.lang

    # parameters
    n_keys = 9076
    n_idf = 300000

    # load the sentence data and the gold labels once
    if not hasattr(self, 'europarl_data'):
        self.europarl_data = load_bucc_data(
            lg1, lg2, self.params.split,
            n_max=(n_keys + 2 * n_idf), full=True
        )
        self.gold = load_bucc_labels(lg1, lg2, self.params.split)

    # if no data for this language pair
    if not self.europarl_data:
        return

    is_test = self.params.split == 'test'

    # gold labels with their two columns swapped, for the reverse direction.
    # The check is explicit on purpose: `if self.gold` raises on a
    # multi-element array (which the column indexing below implies), so the
    # swap could only ever be skipped, never performed.
    gold = self.gold
    if gold is None or len(gold) == 0:
        swapped_gold = None
    else:
        swapped_gold = gold[:, [1, 0]]

    # mapped word embeddings
    src_emb = self.mapping(self.src_emb.weight).data
    tgt_emb = self.tgt_emb.weight.data

    # get idf weights
    idf = get_idf(self.europarl_data, lg1, lg2, n_idf=n_idf)

    for method in ['csls_knn_10']:

        # source <- target sentence translation (gold labels as loaded)
        pred_src, results = get_sent_translation_accuracy(
            self.europarl_data, gold,
            lg1, self.src_dico.word2id, src_emb,
            lg2, self.tgt_dico.word2id, tgt_emb,
            method=method, idf=idf,
            test=is_test, device=self.params.device
        )
        to_log.update([('tgt_to_src_%s-%s' % (k, method), v) for k, v in results])

        # target <- source sentence translation (gold labels with swapped cols)
        pred_tgt, results = get_sent_translation_accuracy(
            self.europarl_data, swapped_gold,
            lg2, self.tgt_dico.word2id, tgt_emb,
            lg1, self.src_dico.word2id, src_emb,
            method=method, idf=idf,
            test=is_test, device=self.params.device
        )
        to_log.update([('src_to_tgt_%s-%s' % (k, method), v) for k, v in results])

        # on the test split, hand the predictions of both directions to to_file
        if is_test:
            self.to_file(pred_src, lg1, lg2)
            self.to_file(pred_tgt, lg2, lg1)
def sent_translation(self, to_log, src_lang=None, tgt_lang=None):
    """
    Evaluation on sentence translation.

    If neither `src_lang` nor `tgt_lang` is given, every language in
    `self.params.src_langs` is evaluated against `self.params.tgt_lang`;
    in that mode only the source embeddings go through `apply_mapping` and
    the target embeddings are used as they are.
    If both are given, only that pair is evaluated and both sides go
    through `apply_mapping` with their own entry of `self.mappings`.
    Passing exactly one of the two raises AssertionError.

    Only available on Europarl, for en - {de, es, fr, it} language pairs.
    Loaded data is cached per (src_lang, tgt_lang) in `self.europarl_data`;
    a pair without data is reported through `logger.info` and skipped.

    Scores are added to `to_log` under '<tgt>_to_<src>_<name>-<method>' and
    '<src>_to_<tgt>_<name>-<method>'. Returns None.
    """
    # parameters
    n_keys = 200000
    n_queries = 2000
    n_idf = 300000

    # per language pair cache of the loaded europarl data
    if not hasattr(self, 'europarl_data'):
        self.europarl_data = {}

    def evaluate_pair(src_lang, tgt_lang, map_tgt):
        # evaluate one language pair in both directions and log the scores
        lang_pair = (src_lang, tgt_lang)

        # load europarl data
        if lang_pair not in self.europarl_data:
            self.europarl_data[lang_pair] = load_europarl_data(
                src_lang, tgt_lang, n_max=(n_keys + 2 * n_idf))
        data = self.europarl_data[lang_pair]

        # if no Europarl data for this language pair
        # (the pair is always present in the cache at this point, so a None
        # entry is the only way for the data to be missing)
        if data is None:
            logger.info(
                f'Europarl data not found for {src_lang}-{tgt_lang}.')
            return

        # mapped word embeddings
        src_emb = apply_mapping(self.mappings[src_lang],
                                self.embs[src_lang].weight)
        if map_tgt:
            tgt_emb = apply_mapping(self.mappings[tgt_lang],
                                    self.embs[tgt_lang].weight)
        else:
            tgt_emb = self.embs[tgt_lang].weight

        # get idf weights
        idf = get_idf(data, src_lang, tgt_lang, n_idf=n_idf)

        src_side = (src_lang, self.vocabs[src_lang].word2id, src_emb)
        tgt_side = (tgt_lang, self.vocabs[tgt_lang].word2id, tgt_emb)

        for method in ['nn', 'csls_knn_10']:

            # source <- target sentence translation
            results = get_sent_translation_accuracy(
                data, *src_side, *tgt_side,
                n_keys=n_keys, n_queries=n_queries,
                method=method, idf=idf)
            to_log.update([
                ('%s_to_%s_%s-%s' % (tgt_lang, src_lang, k, method), v)
                for k, v in results
            ])

            # target <- source sentence translation
            results = get_sent_translation_accuracy(
                data, *tgt_side, *src_side,
                n_keys=n_keys, n_queries=n_queries,
                method=method, idf=idf)
            to_log.update([
                ('%s_to_%s_%s-%s' % (src_lang, tgt_lang, k, method), v)
                for k, v in results
            ])

    # evaluate all src langs to tgt_lang by default
    if src_lang is None and tgt_lang is None:
        default_tgt = self.params.tgt_lang
        for lang in self.params.src_langs:
            evaluate_pair(lang, default_tgt, map_tgt=False)
        return

    # only evaluate src_lang to tgt_lang; bridge as necessary
    assert src_lang is not None and tgt_lang is not None, \
        'src_lang and tgt_lang must be given together'
    evaluate_pair(src_lang, tgt_lang, map_tgt=True)