Esempio n. 1
0
    def sent_translation(self, to_log):
        """
        Sentence translation evaluation.
        Only available on Europarl, for en - {de, es, fr, it} language pairs.

        The Europarl data is loaded on first use and cached on
        `self.europarl_data`; when it is empty / missing, the method returns
        without touching `to_log`. Otherwise, for each retrieval method, the
        (name, value) pairs returned by `get_sent_translation_accuracy` for
        both argument orders are added to `to_log` under the keys
        `tgt_to_src_<name>-<method>` and `src_to_tgt_<name>-<method>`.
        """
        src_lang = self.src_dico.lang
        tgt_lang = self.tgt_dico.lang

        # evaluation sizes
        n_keys, n_queries, n_idf = 200000, 2000, 300000

        # lazily load the parallel sentences (kept on the instance afterwards)
        if not hasattr(self, 'europarl_data'):
            self.europarl_data = load_europarl_data(
                src_lang, tgt_lang, n_max=(n_keys + 2 * n_idf)
            )
        data = self.europarl_data

        # nothing to evaluate for this language pair
        if not data:
            return

        # source embeddings go through the mapping, target embeddings are used as-is
        src_emb = self.mapping(self.src_emb.weight).data
        tgt_emb = self.tgt_emb.weight.data

        # idf weights computed from the parallel data
        idf = get_idf(data, src_lang, tgt_lang, n_idf=n_idf)

        # (language, word2id, embeddings) triple for each side
        src_side = (self.src_dico.lang, self.src_dico.word2id, src_emb)
        tgt_side = (self.tgt_dico.lang, self.tgt_dico.word2id, tgt_emb)

        # each entry: key template for the log, then the order in which the
        # two sides are handed to get_sent_translation_accuracy
        directions = (
            ('tgt_to_src_%s-%s', src_side, tgt_side),  # source <- target
            ('src_to_tgt_%s-%s', tgt_side, src_side),  # target <- source
        )

        for method in ('nn', 'csls_knn_10'):
            for key_template, first, second in directions:
                scores = get_sent_translation_accuracy(
                    data, *(first + second),
                    n_keys=n_keys, n_queries=n_queries,
                    method=method, idf=idf
                )
                to_log.update([
                    (key_template % (name, method), value)
                    for name, value in scores
                ])
Esempio n. 2
0
    def sent_translation(self, to_log):
        """
        Evaluate sentence translation and store the scores in `to_log`.
        Only available on Europarl, for en - {de, es, fr, it} language pairs.

        Europarl data is loaded once (cached as `self.europarl_data`). If that
        data is empty / missing, the method returns early and `to_log` is left
        unchanged. For every retrieval method both argument orders of
        `get_sent_translation_accuracy` are evaluated; their (name, value)
        results are logged as `tgt_to_src_<name>-<method>` and
        `src_to_tgt_<name>-<method>`.
        """
        # sizes used by the evaluation
        n_keys = 200000
        n_queries = 2000
        n_idf = 300000

        lg1, lg2 = self.src_dico.lang, self.tgt_dico.lang

        # first call only: fetch the parallel corpus and keep it on the instance
        if not hasattr(self, 'europarl_data'):
            n_max = n_keys + 2 * n_idf
            self.europarl_data = load_europarl_data(lg1, lg2, n_max=n_max)

        # no data for this language pair -> nothing to do
        if not self.europarl_data:
            return

        # mapped source embeddings / raw target embeddings
        mapped_src = self.mapping(self.src_emb.weight).data
        raw_tgt = self.tgt_emb.weight.data

        idf = get_idf(self.europarl_data, lg1, lg2, n_idf=n_idf)

        def evaluate(first, second, method):
            """Call the accuracy helper with `first` then `second` as its two sides."""
            first_lang, first_vocab, first_emb = first
            second_lang, second_vocab, second_emb = second
            return get_sent_translation_accuracy(
                self.europarl_data,
                first_lang.lang, first_vocab.word2id, first_emb,
                second_lang.lang, second_vocab.word2id, second_emb,
                n_keys=n_keys, n_queries=n_queries,
                method=method, idf=idf
            )

        source = (self.src_dico, self.src_dico, mapped_src)
        target = (self.tgt_dico, self.tgt_dico, raw_tgt)

        for method in ['nn', 'csls_knn_10']:
            # source <- target sentence translation
            backward = evaluate(source, target, method)
            backward_entries = [
                ('tgt_to_src_%s-%s' % (name, method), score)
                for name, score in backward
            ]
            to_log.update(backward_entries)

            # target <- source sentence translation
            forward = evaluate(target, source, method)
            forward_entries = [
                ('src_to_tgt_%s-%s' % (name, method), score)
                for name, score in forward
            ]
            to_log.update(forward_entries)
Esempio n. 3
0
    def sent_translation(self, to_log):
        """
        Evaluation on sentence translation, using BUCC data.

        The sentence data (`load_bucc_data`) and the gold labels
        (`load_bucc_labels`) for `self.params.split` are loaded on first use
        and cached on `self.europarl_data` and `self.gold` (the first attribute
        keeps its "europarl" name even though it holds BUCC data). If no data
        is available the method returns without touching `to_log`.

        For the 'csls_knn_10' method, both directions are evaluated and the
        (name, value) results are added to `to_log` under
        `tgt_to_src_<name>-<method>` and `src_to_tgt_<name>-<method>`. When the
        split is 'test', the predictions of both directions are additionally
        passed to `self.to_file`.
        """
        lg1 = self.src_dico.lang
        lg2 = self.tgt_dico.lang

        # parameters
        n_keys = 9076
        n_idf = 300000

        # load BUCC data and gold labels (once; cached on the instance)
        if not hasattr(self, 'europarl_data'):
            self.europarl_data = load_bucc_data(lg1,
                                                lg2,
                                                self.params.split,
                                                n_max=(n_keys + 2 * n_idf),
                                                full=True)
            self.gold = load_bucc_labels(lg1, lg2, self.params.split)

        # if no data for this language pair
        if not self.europarl_data:
            return

        # mapped word embeddings
        src_emb = self.mapping(self.src_emb.weight).data
        tgt_emb = self.tgt_emb.weight.data

        # get idf weights
        idf = get_idf(self.europarl_data, lg1, lg2, n_idf=n_idf)

        is_test = (self.params.split == 'test')

        # Gold labels for the reverse direction: same pairs with the two
        # columns swapped. The check is explicit (None / empty) because the
        # truth value of a multi-element array is ambiguous and raises.
        # NOTE(review): assumes self.gold is None or a 2-D array-like that
        # supports `[:, [1, 0]]` indexing -- confirm against load_bucc_labels.
        gold = self.gold
        if gold is None or len(gold) == 0:
            gold_swapped = None
        else:
            gold_swapped = gold[:, [1, 0]]  # swap cols

        for method in ['csls_knn_10']:
            # source <- target sentence translation
            pred_src, results = get_sent_translation_accuracy(
                self.europarl_data,
                gold,
                self.src_dico.lang,
                self.src_dico.word2id,
                src_emb,
                self.tgt_dico.lang,
                self.tgt_dico.word2id,
                tgt_emb,
                method=method,
                idf=idf,
                test=is_test,
                device=self.params.device)
            to_log.update([('tgt_to_src_%s-%s' % (k, method), v)
                           for k, v in results])

            # target <- source sentence translation
            pred_tgt, results = get_sent_translation_accuracy(
                self.europarl_data,
                gold_swapped,
                self.tgt_dico.lang,
                self.tgt_dico.word2id,
                tgt_emb,
                self.src_dico.lang,
                self.src_dico.word2id,
                src_emb,
                method=method,
                idf=idf,
                test=is_test,
                device=self.params.device)
            to_log.update([('src_to_tgt_%s-%s' % (k, method), v)
                           for k, v in results])

            # on the test split, hand the predictions of both directions to to_file
            if is_test:
                self.to_file(pred_src, lg1, lg2)
                self.to_file(pred_tgt, lg2, lg1)
Esempio n. 4
0
    def sent_translation(self, to_log, src_lang=None, tgt_lang=None):
        """
        Evaluation on sentence translation.
        Only available on Europarl, for en - {de, es, fr, it} language pairs.

        Two modes:
          * `src_lang` and `tgt_lang` both None (default): every language in
            `self.params.src_langs` is evaluated against
            `self.params.tgt_lang`. Only the source embeddings go through
            `apply_mapping`; the target embeddings are used as-is.
          * both given: only that pair is evaluated, and both the source and
            the target embeddings go through `apply_mapping`.
        Passing exactly one of the two fails the assertion below.

        Loaded Europarl data is cached in the dict `self.europarl_data`, keyed
        by `(src_lang, tgt_lang)`. Pairs with no data are logged and skipped.
        Scores are added to `to_log` under `<a>_to_<b>_<name>-<method>`.
        """
        # per-pair cache of loaded Europarl data
        if not hasattr(self, 'europarl_data'):
            self.europarl_data = {}

        if src_lang is None and tgt_lang is None:
            # evaluate all src langs to tgt_lang by default
            tgt_lang = self.params.tgt_lang
            for src_lang in self.params.src_langs:
                self._sent_translation_pair(to_log, src_lang, tgt_lang,
                                            map_tgt=False)
        else:
            # only evaluate src_lang to tgt_lang; bridge as necessary
            # (kept as an assert so the raised exception type is unchanged)
            assert src_lang is not None and tgt_lang is not None
            self._sent_translation_pair(to_log, src_lang, tgt_lang,
                                        map_tgt=True)

    def _sent_translation_pair(self, to_log, src_lang, tgt_lang, map_tgt):
        """
        Evaluate sentence translation for one (src_lang, tgt_lang) pair.

        Loads (and caches) the Europarl data of the pair, returns after an
        info log message if there is none, and otherwise adds the scores of
        both directions and both retrieval methods to `to_log`.
        `map_tgt` selects whether the target embeddings go through
        `apply_mapping` (True) or are used as-is (False).
        """
        # parameters
        n_keys = 200000
        n_queries = 2000
        n_idf = 300000

        # load europarl data (once per language pair)
        lang_pair = (src_lang, tgt_lang)
        if lang_pair not in self.europarl_data:
            self.europarl_data[lang_pair] = load_europarl_data(
                src_lang, tgt_lang, n_max=(n_keys + 2 * n_idf))
        data = self.europarl_data[lang_pair]

        # if no Europarl data for this language pair; the key is guaranteed
        # to exist at this point, so only the stored value needs checking
        if data is None:
            logger.info(
                f'Europarl data not found for {src_lang}-{tgt_lang}.')
            return

        # encode src
        src_emb = apply_mapping(self.mappings[src_lang],
                                self.embs[src_lang].weight)
        # encode tgt (mapped only when requested)
        if map_tgt:
            tgt_emb = apply_mapping(self.mappings[tgt_lang],
                                    self.embs[tgt_lang].weight)
        else:
            tgt_emb = self.embs[tgt_lang].weight

        # get idf weights
        idf = get_idf(data, src_lang, tgt_lang, n_idf=n_idf)

        # (language, word2id, embeddings) for each side
        src_side = (src_lang, self.vocabs[src_lang].word2id, src_emb)
        tgt_side = (tgt_lang, self.vocabs[tgt_lang].word2id, tgt_emb)

        for method in ['nn', 'csls_knn_10']:
            # first pass:  source <- target sentence translation
            # second pass: target <- source sentence translation
            for first, second in ((src_side, tgt_side), (tgt_side, src_side)):
                results = get_sent_translation_accuracy(
                    data,
                    *first,
                    *second,
                    n_keys=n_keys,
                    n_queries=n_queries,
                    method=method,
                    idf=idf)
                # logged as "<second lang>_to_<first lang>_..."
                to_log.update([
                    ('%s_to_%s_%s-%s' % (second[0], first[0], k, method), v)
                    for k, v in results
                ])