Example #1
def Mine(src_doc_ind, trg_doc_ind, src, trg, encoding, src_embeddings,
         trg_embeddings, output, unify, mode, retrieval, margin, neighborhood,
         gpu, dim, threshold, verbose):
    print('LASER: tool to search, score or mine bitexts', file=sys.stderr)
    if gpu:
        print(' - knn will run on all available GPUs (recommended)',
              file=sys.stderr)
    else:
        print(' - knn will run on CPU (slow)', file=sys.stderr)

    args = AttrDict({"encoding": encoding, "unify": unify, "verbose": verbose})
    src_inds, src_sents = TextLoadUnify(src, args)
    trg_inds, trg_sents = TextLoadUnify(trg, args)

    def unique_embeddings(emb, ind, verbose=False):
        aux = {j: i for i, j in enumerate(ind)}
        if verbose:
            print(' - unify embeddings: {:d} -> {:d}'.format(
                len(emb), len(aux)),
                  file=sys.stderr)
        return emb[[aux[i] for i in range(len(aux))]]

    # load the embeddings
    x = EmbedLoad(src_embeddings, dim, verbose=verbose)
    if unify:
        x = unique_embeddings(x, src_inds, verbose)
    faiss.normalize_L2(x)
    y = EmbedLoad(trg_embeddings, dim, verbose=verbose)
    if unify:
        y = unique_embeddings(y, trg_inds, verbose)
    faiss.normalize_L2(y)

    # calculate knn in both directions
    if retrieval != 'bwd':
        if verbose:
            print(' - perform {:d}-nn source against target'.format(
                neighborhood),
                  file=sys.stderr)
        x2y_sim, x2y_ind = knn(x, y, min(y.shape[0], neighborhood), gpu)
        x2y_mean = x2y_sim.mean(axis=1)

    if retrieval != 'fwd':
        if verbose:
            print(' - perform {:d}-nn target against source'.format(
                neighborhood),
                  file=sys.stderr)
        y2x_sim, y2x_ind = knn(y, x, min(x.shape[0], neighborhood), gpu)
        y2x_mean = y2x_sim.mean(axis=1)

    # margin function (rebinds the string parameter `margin` to a callable)
    if margin == 'absolute':
        def margin(a, b):
            return a
    elif margin == 'distance':
        def margin(a, b):
            return a - b
    else:  # margin == 'ratio'
        def margin(a, b):
            return a / b

    if output:
        if output.endswith('.xz'):
            fout = lzma.open(output,
                             mode='at',
                             encoding=encoding,
                             errors='surrogateescape')
        else:
            fout = open(output,
                        mode='a',
                        encoding=encoding,
                        errors='surrogateescape')
    else:
        output = "stdout"
        fout = sys.stdout

    if mode == 'search':
        if verbose:
            print(' - Searching for closest sentences in target',
                  file=sys.stderr)
            print(' - writing alignments to {:s}'.format(output),
                  file=sys.stderr)
        scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean, margin,
                                  verbose)
        best = x2y_ind[np.arange(x.shape[0]), scores.argmax(axis=1)]

        nbex = x.shape[0]
        ref = np.linspace(0, nbex - 1, nbex).astype(int)  # [0, nbex)
        err = nbex - np.equal(best.reshape(nbex), ref).astype(int).sum()
        print(' - errors: {:d}={:.2f}%'.format(err, 100 * err / nbex),
              file=sys.stderr)
        for i in src_inds:
            print(trg_sents[best[i]], file=fout)

    elif mode == 'score':
        for i, j in zip(src_inds, trg_inds):
            s = score(x[i], y[j], x2y_mean[i], y2x_mean[j], margin)
            print(s, src_sents[i], trg_sents[j], sep='\t', file=fout)

    elif mode == 'mine':
        if verbose:
            print(' - mining for parallel data', file=sys.stderr)
        fwd_scores = score_candidates(x, y, x2y_ind, x2y_mean, y2x_mean,
                                      margin, verbose)
        bwd_scores = score_candidates(y, x, y2x_ind, y2x_mean, x2y_mean,
                                      margin, verbose)
        fwd_best = x2y_ind[np.arange(x.shape[0]), fwd_scores.argmax(axis=1)]
        bwd_best = y2x_ind[np.arange(y.shape[0]), bwd_scores.argmax(axis=1)]
        if verbose:
            print(' - writing alignments to {:s}'.format(output),
                  file=sys.stderr)
            if threshold > 0:
                print(' - with threshold of {:f}'.format(threshold),
                      file=sys.stderr)
        if retrieval == 'fwd':
            for i, j in enumerate(fwd_best):
                print(fwd_scores[i].max(),
                      src_sents[i],
                      trg_sents[j],
                      sep='\t',
                      file=fout)
        if retrieval == 'bwd':
            for j, i in enumerate(bwd_best):
                print(bwd_scores[j].max(),
                      src_sents[i],
                      trg_sents[j],
                      sep='\t',
                      file=fout)
        if retrieval == 'intersect':
            for i, j in enumerate(fwd_best):
                if bwd_best[j] == i:
                    print(fwd_scores[i].max(),
                          src_sents[i],
                          trg_sents[j],
                          sep='\t',
                          file=fout)
        if retrieval == 'max':
            indices = np.stack(
                (np.concatenate((np.arange(x.shape[0]), bwd_best)),
                 np.concatenate((fwd_best, np.arange(y.shape[0])))),
                axis=1)
            scores = np.concatenate(
                (fwd_scores.max(axis=1), bwd_scores.max(axis=1)))
            seen_src, seen_trg = set(), set()
            for i in np.argsort(-scores):
                src_ind, trg_ind = indices[i]
                if src_ind not in seen_src and trg_ind not in seen_trg:
                    seen_src.add(src_ind)
                    seen_trg.add(trg_ind)
                    if scores[i] > threshold:
                        print(src_doc_ind,
                              trg_doc_ind,
                              src_sents[src_ind],
                              trg_sents[trg_ind],
                              scores[i],
                              sep='\t',
                              file=fout)

    if fout != sys.stdout:
        fout.close()
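
A minimal usage sketch for the function above, assuming the LASER helpers it relies on (AttrDict, TextLoadUnify, EmbedLoad, knn, score, score_candidates) plus sys, lzma, numpy as np and faiss are already imported; every path, index and threshold below is an illustrative placeholder, not a value taken from the example:

# Hypothetical call: mine parallel sentences from two pre-embedded documents
# (raw float32 LASER embeddings, 1024 dimensions, one sentence per line).
Mine(src_doc_ind=0, trg_doc_ind=0,        # ids copied verbatim into the output rows
     src='doc.en.txt', trg='doc.fr.txt',
     encoding='utf-8',
     src_embeddings='doc.en.emb',
     trg_embeddings='doc.fr.emb',
     output='mined.tsv',                  # appended to; falls back to stdout if falsy
     unify=True,                          # deduplicate identical sentences before indexing
     mode='mine', retrieval='max',        # greedy best-score alignment over both directions
     margin='ratio',                      # margin(a, b) = a / b
     neighborhood=4,                      # k for the k-nn searches
     gpu=True, dim=1024,
     threshold=1.04,                      # keep only pairs whose margin score exceeds this
     verbose=True)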
Example #2
    def exec_function(self, args):

        setCUDA_VISIBLE_DEVICES(args.gpuid)

        bpeCodesF_local = LASER + '/models/93langs.fcodes'
        encoderF_local = LASER + '/models/bilstm.93langs.2018-12-26.pt'

        #################
        # Parse arguments and retrieve files
        #################
        srcF_local = os.path.join(self._data_dir, self._storage.split(args.srcfile)[-1])
        tgtF_local = os.path.join(self._data_dir, self._storage.split(args.tgtfile)[-1])
        self._storage.get_file(args.srcfile, srcF_local)
        self._storage.get_file(args.tgtfile, tgtF_local)

        outputF_local = os.path.join(self._data_dir, self._storage.split(args.output)[-1])

        if args.bpecodes is not None:
            bpeCodesF_local = os.path.join(self._data_dir, self._storage.split(args.bpecodes)[-1])
            self._storage.get_file(args.bpecodes, bpeCodesF_local)
        if args.encoder is not None:
            encoderF_local = os.path.join(self._data_dir, self._storage.split(args.encoder)[-1])
            self._storage.get_file(args.encoder, encoderF_local)

        if args.srclang is None:
            args.srclang = inferLangFromFilename(args.srcfile)
        if args.tgtlang is None:
            args.tgtlang = inferLangFromFilename(args.tgtfile)

        logger.info("srclang: %s, srcfile: %s (%s)" % (args.srclang, args.srcfile, srcF_local))
        logger.info("tgtlang: %s, tgtfile: %s (%s)" % (args.tgtlang, args.tgtfile, tgtF_local))
        logger.info("output: %s (%s)" % (args.output, outputF_local))
        logger.info("encoderF: %s (%s)" % (args.encoder, encoderF_local))
        logger.info("bpeCodesF: %s (%s)" % (args.bpecodes, bpeCodesF_local))

        #################
        # Perform tasks
        #################
        with tempfile.TemporaryDirectory() as tmpdir:
            srcTokF = os.path.join(tmpdir, 'srctok')
            srcBpeF = os.path.join(tmpdir, 'srcbpe')
            srcEmbF = os.path.join(tmpdir, 'srcemb')

            tgtTokF = os.path.join(tmpdir, 'tgttok')
            tgtBpeF = os.path.join(tmpdir, 'tgtbpe')
            tgtEmbF = os.path.join(tmpdir, 'tgtemb')

            logger.debug(' - gpuid: %s' % args.gpuid)

            if isinstance(args.gpuid, list):
                logger.debug(' - perform src and tgt embedding in parallel')

                import torch.multiprocessing as mp

                srcP = mp.Process(target=TokBpeEmb, args=(args.srclang, srcF_local, srcTokF, srcBpeF, srcEmbF,
                                  bpeCodesF_local, encoderF_local, args.encoderbuffersize, args.encodermaxtokens,
                                  args.verbose, args.gpuid[0]))
                srcP.start()

                tgtP = mp.Process(target=TokBpeEmb, args=(args.tgtlang, tgtF_local, tgtTokF, tgtBpeF, tgtEmbF,
                                  bpeCodesF_local, encoderF_local, args.encoderbuffersize, args.encodermaxtokens,
                                  args.verbose, args.gpuid[1]))
                tgtP.start()

                srcP.join()
                tgtP.join()

            else:
                logger.info(' - perform src and tgt embedding in series')
                encoder = loadEncoder(encoderF_local, args.encoderbuffersize, args.encodermaxtokens,
                                      cpu=(args.gpuid == 0))
                TokBpeEmb(args.srclang, srcF_local, srcTokF, srcBpeF, srcEmbF, bpeCodesF_local, encoder,
                          args.encoderbuffersize, args.encodermaxtokens, args.verbose, args.gpuid)
                TokBpeEmb(args.tgtlang, tgtF_local, tgtTokF, tgtBpeF, tgtEmbF, bpeCodesF_local, encoder,
                          args.encoderbuffersize, args.encodermaxtokens, args.verbose, args.gpuid)

            # LASER options
            setCUDA_VISIBLE_DEVICES(args.gpuid)
            unify, retrieval, neighborhood, gpu = True, 'max', 5, (args.gpuid != 0)

            # load bitext and embeddings
            def _loadTextAndEmb(textF, encoding, embF, encoderDim, unify, verbose):
                inds, sents = TextLoadUnify(textF, encoding, unify, verbose)
                emb = EmbedLoad(embF, encoderDim, verbose=verbose)
                if unify:
                    emb = unique_embeddings(emb, inds)
                faiss.normalize_L2(emb)
                return inds, sents, emb

            src_inds, src_sents, x = _loadTextAndEmb(srcF_local, args.encoding, srcEmbF, args.encoderdim, unify, args.verbose)
            trg_inds, trg_sents, y = _loadTextAndEmb(tgtF_local, args.encoding, tgtEmbF, args.encoderdim, unify, args.verbose)

            # calculate knn in both directions
            if retrieval != 'bwd':
                logger.info(' - perform {:d}-nn source against target'.format(neighborhood))
                x2y_sim, x2y_ind = knn(x, y, min(y.shape[0], neighborhood), gpu)
                x2y_mean = x2y_sim.mean(axis=1)

            if retrieval != 'fwd':
                logger.info(' - perform {:d}-nn target against source'.format(neighborhood))
                y2x_sim, y2x_ind = knn(y, x, min(x.shape[0], neighborhood), gpu)
                y2x_mean = y2x_sim.mean(axis=1)

            # margin function
            if args.margin == 'absolute':
                margin = lambda a, b: a
            elif args.margin == 'distance':
                margin = lambda a, b: a - b
            else:
                # args.margin == 'ratio':
                margin = lambda a, b: a / b

            if args.tumode == 'score':
                scoreBitext(src_inds, trg_inds, x, y, x2y_mean, y2x_mean, outputF_local, args.encoding, margin)
                self._storage.push(outputF_local, args.output)
                statCnt, statMin, statMax, statAvg, statStddev = getScoreDist(outputF_local)

            elif args.tumode == 'mine':
                src_suffix, tgt_suffix = '', ''
                if args.srclang == args.tgtlang:
                    src_suffix, tgt_suffix = "_s", "_t"

                foutSrc, foutSrc_remote = outputF_local+'.'+args.srclang+src_suffix, args.output+'.'+args.srclang+src_suffix
                if srcF_local.endswith('.gz'):
                    foutSrc = foutSrc+'.gz'
                    foutSrc_remote = foutSrc_remote+'.gz'

                foutTgt, foutTgt_remote = outputF_local+'.'+args.tgtlang+tgt_suffix, args.output+'.'+args.tgtlang+tgt_suffix
                if tgtF_local.endswith('.gz'):
                    foutTgt = foutTgt+'.gz'
                    foutTgt_remote = foutTgt_remote+'.gz'

                foutScore, foutScore_remote = outputF_local+'.tuminer-score', args.output+'.tuminer-score'

                mineBitext(src_sents, trg_sents, x, y, x2y_ind, x2y_mean, y2x_ind, y2x_mean, foutSrc, foutTgt, foutScore,
                           args.encoding, margin, retrieval, args.threshold, args.verbose)

                self._storage.push(foutSrc, foutSrc_remote)
                self._storage.push(foutTgt, foutTgt_remote)
                self._storage.push(foutScore, foutScore_remote)

                statCnt, statMin, statMax, statAvg, statStddev = getScoreDist(foutScore)

            logger.info('Score statistics -- CNT: {:d}, MIN: {:f}, MAX: {:f}, AVG: {:f}, STDDEV: {:f}'
                        .format(statCnt, statMin, statMax, statAvg, statStddev))
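
Both examples score a candidate pair the same way: the cosine similarity of the two L2-normalised embeddings is set against the average similarity of each embedding to its k nearest neighbours in the other language (the x2y_mean / y2x_mean values above), through one of the 'absolute', 'distance' or 'ratio' margins. The following is a self-contained sketch of that rule with hypothetical vectors; it mirrors how score()/score_candidates() are used in these examples but is not the LASER implementation itself:

import numpy as np

def margin_score(x_vec, y_vec, x2y_mean_i, y2x_mean_j, margin):
    # x_vec and y_vec are L2-normalised, so the dot product is their cosine
    # similarity; the two means play the role of x2y_mean[i] / y2x_mean[j].
    a = float(np.dot(x_vec, y_vec))
    b = (x2y_mean_i + y2x_mean_j) / 2
    return margin(a, b)

# toy usage with random, hypothetical vectors
rng = np.random.default_rng(0)
x_vec = rng.normal(size=1024)
x_vec /= np.linalg.norm(x_vec)
y_vec = rng.normal(size=1024)
y_vec /= np.linalg.norm(y_vec)
ratio = lambda a, b: a / b        # same shape as the 'ratio' margin above
print(margin_score(x_vec, y_vec, 0.4, 0.5, ratio))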