コード例 #1
0
 def __init__(self):
     """Load the LASER BiLSTM encoder once and keep it on this helper."""
     model_file = "models/bilstm.93langs.2018-12-26.pt"
     self.modelpath = os.path.join(LASER, model_file)
     # No explicit batch size is needed here: the encoder sizes its
     # batches automatically from max_tokens.
     self.encoder = SentenceEncoder(self.modelpath,
                                    cpu=False,
                                    max_tokens=12000,
                                    max_sentences=None,
                                    sort_kind='quicksort')
     self.batch_size = 64
コード例 #2
0
def loadEncoder(encoderF, buffer_size, max_tokens, max_sentences=None, cpu=False, stable=False):
    """Load a SentenceEncoder from the model file *encoderF*.

    *buffer_size* is clamped to at least 1 and must not be smaller than
    *max_sentences*; *stable* selects a stable sort for batching.
    """
    if buffer_size < 1:
        buffer_size = 1
    assert not max_sentences or max_sentences <= buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    logger.info(' - Encoder: loading {} - cpu={}'.format(encoderF, cpu))
    sort_kind = 'mergesort' if stable else 'quicksort'
    return SentenceEncoder(encoderF,
                           cpu=cpu,
                           max_tokens=max_tokens,
                           max_sentences=max_sentences,
                           sort_kind=sort_kind)
コード例 #3
0
def loadEncoder(encoderF,
                buffer_size,
                max_tokens,
                max_sentences=None,
                cpu=False,
                stable=False):
    """Load a SentenceEncoder from *encoderF*.

    Clamps *buffer_size* to >= 1 and rejects a *max_sentences* larger
    than the buffer; *stable* picks mergesort so batching is stable.
    """
    buffer_size = max(1, buffer_size)
    assert not max_sentences or max_sentences <= buffer_size, (
        "--max-sentences/--batch-size cannot be larger than --buffer-size")

    logger.info(" - Encoder: loading {} - cpu={}".format(encoderF, cpu))
    kind = "mergesort" if stable else "quicksort"
    return SentenceEncoder(encoderF, max_sentences=max_sentences,
                           max_tokens=max_tokens, sort_kind=kind, cpu=cpu)
コード例 #4
0
ファイル: laser_mine.py プロジェクト: hiropppe/abc
def mine(src, tgt, offset, slang, tlang, token_slang, token_tlang, encoder,
         bpe_codes, buffer_size, max_tokens, max_sentences, enc_cpu, encoding,
         mode, neighborhood, margin, retrieval, unify, knn_gpu, stable, dim,
         threshold, verbose, output):
    """Mine parallel sentence pairs between the *src* and *tgt* corpora.

    Parameters mirror the CLI flags of the surrounding tool.  *encoder*
    is the model file path on entry and is rebound to the loaded
    SentenceEncoder.  When *offset* is given, Embed/Mine are run once
    per document listed in the offset file; otherwise the corpora are
    embedded whole.
    """
    buffer_size = max(buffer_size, 1)
    assert not max_sentences or max_sentences <= buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    if verbose:
        print(' - Encoder: loading {}'.format(encoder))

    encoder = SentenceEncoder(encoder,
                              max_sentences=max_sentences,
                              max_tokens=max_tokens,
                              sort_kind='mergesort' if stable else 'quicksort',
                              cpu=enc_cpu)

    if offset:
        # Read both corpora and the offset table, closing the files
        # deterministically (the originals leaked the handles).
        with open(src) as f:
            src_sents = [s.strip() for s in f]
        with open(tgt) as f:
            tgt_sents = [t.strip() for t in f]
        # The doc-index starts at 1 for use with the -n option of sed.
        with open(offset) as f:
            doc_offset = [(int(d[0]), d[1], d[2], int(d[3]), d[4], d[5])
                          for d in (line.strip().split() for line in f)]

        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir_path = Path(tmpdir)

            src_tmpdir_path = tmpdir_path / slang
            tgt_tmpdir_path = tmpdir_path / tlang

            src_tmpdir_path.mkdir()
            tgt_tmpdir_path.mkdir()
            for s_ind, s_off, s_len, t_ind, t_off, t_len in doc_offset:
                # NOTE(review): s_off/s_len/t_off/t_len are never used and
                # the FULL corpora are rewritten on every iteration —
                # confirm whether per-document slicing was intended.
                src_txt = src_tmpdir_path / 'txt'
                tgt_txt = tgt_tmpdir_path / 'txt'

                with open(src_txt, "w") as fw:
                    print("\n".join(src_sents), file=fw)
                with open(tgt_txt, "w") as fw:
                    print("\n".join(tgt_sents), file=fw)

                src_embeddings = Embed(str(src_tmpdir_path),
                                       str(src_txt), encoder,
                                       slang if token_slang else "--",
                                       bpe_codes, buffer_size, verbose)
                tgt_embeddings = Embed(str(tgt_tmpdir_path),
                                       str(tgt_txt), encoder,
                                       tlang if token_tlang else "--",
                                       bpe_codes, buffer_size, verbose)

                Mine(
                    s_ind,
                    t_ind,
                    str(src_txt),
                    str(tgt_txt),
                    encoding,
                    src_embeddings,
                    tgt_embeddings,
                    output,
                    unify,
                    mode,
                    retrieval,
                    margin,
                    neighborhood,
                    knn_gpu,
                    dim,
                    threshold,
                    verbose)

    else:
        src_embeddings = Embed(src, slang if token_slang else "--")
        # BUG FIX: the original assigned this second result to
        # src_embeddings, clobbering the source embeddings.
        tgt_embeddings = Embed(tgt, tlang if token_tlang else "--")
        # NOTE(review): this branch calls Embed with a different arity
        # than the offset branch and never calls Mine — confirm intent.
コード例 #5
0
def generate_encoder(encoder_file):
    """Build a CPU-only SentenceEncoder from *encoder_file* with the
    default batching settings (batches bounded by 12000 tokens)."""
    return SentenceEncoder(
        encoder_file,
        cpu=True,
        max_tokens=12000,
        max_sentences=None,
        sort_kind='quicksort',
    )
コード例 #6
0
class LaserHelper:
    """Score sentence pairs with LASER embeddings.

    Loads the multilingual BiLSTM encoder once, then offers cosine and
    L2 scoring plus batch file-to-file scoring helpers.
    """

    def __init__(self):
        self.modelpath = os.path.join(LASER,
                                      "models/bilstm.93langs.2018-12-26.pt")
        # The encoder needs no batch_size: it sizes batches from max_tokens.
        self.encoder = SentenceEncoder(self.modelpath,
                                       max_sentences=None,
                                       max_tokens=12000,
                                       sort_kind='quicksort',
                                       cpu=False)
        # Batch size used by calL2andSave when streaming sentence pairs.
        self.batch_size = 64

    def _cosine(self, vectors_1, vectors_2):
        """Row-wise cosine similarity between two 2-D numpy arrays.

        :param vectors_1: 2-D numpy array, one vector per row
        :param vectors_2: 2-D numpy array of the same shape
        :return: 1-D array of per-row similarity scores
        """
        numerator = np.sum(vectors_1 * vectors_2, axis=1)
        denominator = (np.linalg.norm(vectors_1, axis=1) *
                       np.linalg.norm(vectors_2, axis=1))
        # NOTE(review): a zero-norm row produces a divide-by-zero (nan/inf);
        # callers are assumed to pass non-empty sentences.
        return numerator / denominator

    def _getSensvec(self, sens):
        """Encode a list of sentences; returns one embedding per sentence."""
        return self.encoder.encode_sentences(sens)

    def _calCosine(self, sens_1, sens_2):
        """Cosine scores between two aligned sentence lists."""
        senvecs_1 = self._getSensvec(sens_1)
        senvecs_2 = self._getSensvec(sens_2)
        return self._cosine(senvecs_1, senvecs_2)

    def _calL2(self, sens_1, sens_2):
        """Euclidean (L2) distances between two aligned sentence lists."""
        senvecs_1 = self._getSensvec(sens_1)
        senvecs_2 = self._getSensvec(sens_2)
        return np.sqrt(np.sum(np.square(senvecs_1 - senvecs_2), axis=1))

    def _write_scores(self, fw, sens_1, sens_2, scores):
        """Write one 'src---xhm---tgt---xhm---score' line per pair,
        stripping BPE markers ('@@') and spaces from the sentences."""
        for s1, s2, s in zip(sens_1, sens_2, scores):
            s1 = s1.replace("@@", "").replace(" ", "")
            s2 = s2.replace("@@", "").replace(" ", "")
            fw.write("---xhm---".join([s1, s2, str(s)]) + "\n")

    def calL2andSave(self,
                     src_tmp="../tmp/src.bpe",
                     tgt_tmp="../tmp/tgt.bpe",
                     outputfile="../src_tgt_scores_laser.txt"):
        """Score aligned BPE files line-by-line with L2 distance and
        write 'src---xhm---tgt---xhm---score' lines to *outputfile*.

        Uses context managers so the files are closed even on error
        (the original leaked handles on exception).
        """
        with open(src_tmp, mode="r", encoding="utf-8") as fr_src, \
                open(tgt_tmp, mode="r", encoding="utf-8") as fr_tgt, \
                open(outputfile, mode="w", encoding="utf-8") as fw:
            sens_1 = []
            sens_2 = []
            for src, tgt in zip(fr_src, fr_tgt):
                src = src.strip()
                tgt = tgt.strip()
                assert src != ""
                assert tgt != ""
                # Flush a full batch before appending, so batches are
                # exactly self.batch_size pairs (matches original order).
                if len(sens_1) == self.batch_size:
                    self._write_scores(fw, sens_1, sens_2,
                                       self._calL2(sens_1, sens_2))
                    sens_1 = []
                    sens_2 = []
                sens_1.append(src)
                sens_2.append(tgt)

            # Score the final partial batch.
            if len(sens_1) > 0:
                self._write_scores(fw, sens_1, sens_2,
                                   self._calL2(sens_1, sens_2))

    def extractParaphrasePair(self,
                              inputfile="../src_tgt_scores_laser.txt",
                              outputfile="../src_tgt_",
                              thresold=0.3):
        """Keep pairs whose score is below *thresold* (sic — the
        misspelled keyword is kept for caller compatibility) and write
        them to '<outputfile><thresold>.txt'."""
        outputfile = outputfile + str(thresold) + ".txt"

        with open(inputfile, mode="r", encoding="utf-8") as fr, \
                open(outputfile, mode="w", encoding="utf-8") as fw:
            for line in fr:
                line = line.strip()
                if line != "":
                    sen1, sen2, score = line.split("---xhm---")
                    if float(score) < thresold:
                        # BUG FIX: original wrote "---xhm--" (one dash
                        # short), inconsistent with the separator used
                        # everywhere else in this class.
                        fw.write("---xhm---".join([sen1, sen2]) + "\n")
コード例 #7
0
 def get_laser_encoder(encoder_path, max_tokens=12000):
     """Load a SentenceEncoder (cpu=False) from *encoder_path*, with
     batches capped at *max_tokens* tokens."""
     return SentenceEncoder(
         encoder_path,
         cpu=False,
         max_tokens=max_tokens,
         max_sentences=None,
     )
コード例 #8
0
from embed import SentenceEncoder
from scipy import spatial
from text_processing import TokenLine, BPEfastApplyLine

# Shared module-level LASER encoder: CPU only, stable (mergesort) batching,
# batches bounded by 12000 tokens.
_MODEL_PATH = "/LASER/models/bilstm.93langs.2018-12-26.pt"
encoder = SentenceEncoder(_MODEL_PATH,
                          sort_kind="mergesort",
                          max_tokens=12000,
                          max_sentences=None,
                          cpu=True)

# Demo input: one English and one Chinese sentence with roughly the
# same meaning, as {"lang", "content"} records.
sentences = [
    {
        "lang": "en",
        "content": "While the question of hamsters powering homes may seem a bit farcical, it should be noted that at one point humans did specifically breed a certain type of dog for the sole purpose of it just walking along at a steady pace on a giant wheel… (See our article The Curious Tale of Turnspit Dogs.)",
    },
    {
        "lang": "zh",
        "content": "靠仓鼠产电功能这个问题似乎有些可笑,但却前有古人,人类曾经靠驱使狗狗跑轮子来烧炉子...",
    },
]


def pre_process(sentence, lang):
    """Tokenize *sentence* for *lang* and apply the LASER BPE codes.

    :param sentence: raw input sentence
    :param lang: language code passed to the tokenizer
    :return: the BPE-encoded, tokenized sentence
    """
    # Idiom fix: `True if lang == 'el' else False` is just the comparison.
    # Romanization is enabled only for Greek ('el').
    tokenized = TokenLine(sentence,
                          lang=lang,
                          romanize=(lang == 'el'),
                          lower_case=True)
    return BPEfastApplyLine(tokenized, bpe_codes="/LASER/models/93langs.fcodes")

コード例 #9
0
parser.add_argument(
    '--lang', '-L', nargs='+', default=None,
    help="List of languages to test on")
parser.add_argument('--buffer-size', type=int, default=10000,
    help='Buffer size (sentences)')
parser.add_argument('--max-tokens', type=int, default=12000,
    help='Maximum number of tokens to process in a batch')
parser.add_argument('--max-sentences', type=int, default=None,
    help='Maximum number of sentences to process in a batch')
parser.add_argument('--cpu', action='store_true',
    help='Use CPU instead of GPU')

args = parser.parse_args()

print('LASER: embedding single sentence')
'''

if __name__ == '__main__':
    # Locate the LASER model and its BPE codes under $LASER/models.
    model_dir = "{}/models".format(LASER)
    encoder = "{}/bilstm.93langs.2018-12-26.pt".format(model_dir)
    bpe_codes = "{}/93langs.fcodes".format(model_dir)

    enc = SentenceEncoder(encoder, cpu=True)
    loaded_bpe = BPEfastLoad('', bpe_codes)

    # Smoke test: embed a single hard-coded line and report its length.
    line = 'Testing to encode line'
    print("Embedding line", line)

    embedded = embedLine(line, enc, loaded_bpe)
    print("Finished Embedding")
    print("Embedded line len=", len(embedded))