Code example #1
def __init__(self):
    self.modelpath = os.path.join(LASER,
                                  "models/bilstm.93langs.2018-12-26.pt")
    # No need to set batch_size here: batching is adjusted automatically
    # based on max_tokens.
    self.encoder = SentenceEncoder(self.modelpath,
                                   max_sentences=None,
                                   max_tokens=12000,
                                   sort_kind='quicksort',
                                   cpu=False)
    self.batch_size = 64
Code example #2
def loadEncoder(encoderF, buffer_size, max_tokens, max_sentences=None, cpu=False, stable=False):
    buffer_size = max(buffer_size, 1)
    assert not max_sentences or max_sentences <= buffer_size, '--max-sentences/--batch-size ' \
                                                              'cannot be larger than --buffer-size'

    logger.info(' - Encoder: loading {} - cpu={}'.format(encoderF, cpu))
    return SentenceEncoder(encoderF,
                           max_sentences=max_sentences,
                           max_tokens=max_tokens,
                           sort_kind='mergesort' if stable else 'quicksort',
                           cpu=cpu)
Code example #3
def loadEncoder(encoderF,
                buffer_size,
                max_tokens,
                max_sentences=None,
                cpu=False,
                stable=False):
    buffer_size = max(buffer_size, 1)
    assert not max_sentences or max_sentences <= buffer_size, (
        "--max-sentences/--batch-size "
        "cannot be larger than --buffer-size")

    logger.info(" - Encoder: loading {} - cpu={}".format(encoderF, cpu))
    return SentenceEncoder(
        encoderF,
        max_sentences=max_sentences,
        max_tokens=max_tokens,
        sort_kind="mergesort" if stable else "quicksort",
        cpu=cpu,
    )
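
For reference, a call to this helper might look like the sketch below; the model path and the buffer/token limits are taken from the other examples on this page rather than from the original project:

# Sketch only: load the model and let batching be governed by max_tokens.
encoder = loadEncoder("models/bilstm.93langs.2018-12-26.pt",
                      buffer_size=10000,
                      max_tokens=12000,
                      max_sentences=None,
                      cpu=True)
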
Code example #4
File: laser_mine.py  Project: hiropppe/abc
def mine(src, tgt, offset, slang, tlang, token_slang, token_tlang, encoder,
         bpe_codes, buffer_size, max_tokens, max_sentences, enc_cpu, encoding,
         mode, neighborhood, margin, retrieval, unify, knn_gpu, stable, dim,
         threshold, verbose, output):

    buffer_size = max(buffer_size, 1)
    assert not max_sentences or max_sentences <= buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    if verbose:
        print(' - Encoder: loading {}'.format(encoder))

    encoder = SentenceEncoder(encoder,
                              max_sentences=max_sentences,
                              max_tokens=max_tokens,
                              sort_kind='mergesort' if stable else 'quicksort',
                              cpu=enc_cpu)

    if offset:
        src_sents = [s.strip() for s in open(src)]
        tgt_sents = [t.strip() for t in open(tgt)]
        # The doc-index starts at 1 for use with the -n option of sed
        doc_offset = [(int(d[0]), d[1], d[2], int(d[3]), d[4], d[5])
                      for d in [line.strip().split() for line in open(offset)]]

        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir_path = Path(tmpdir)

            src_tmpdir_path = tmpdir_path / slang
            tgt_tmpdir_path = tmpdir_path / tlang

            src_tmpdir_path.mkdir()
            tgt_tmpdir_path.mkdir()
            for s_ind, s_off, s_len, t_ind, t_off, t_len in doc_offset:
                src_txt = src_tmpdir_path / 'txt'
                tgt_txt = tgt_tmpdir_path / 'txt'

                with open(src_txt, "w") as fw:
                    print("\n".join(src_sents), file=fw)
                with open(tgt_txt, "w") as fw:
                    print("\n".join(tgt_sents), file=fw)

                src_embeddings = Embed(str(src_tmpdir_path),
                                       str(src_txt), encoder,
                                       slang if token_slang else "--",
                                       bpe_codes, buffer_size, verbose)
                tgt_embeddings = Embed(str(tgt_tmpdir_path),
                                       str(tgt_txt), encoder,
                                       tlang if token_tlang else "--",
                                       bpe_codes, buffer_size, verbose)

                # mine_output = tmpdir_path / "mine"

                Mine(
                    s_ind,
                    t_ind,
                    str(src_txt),
                    str(tgt_txt),
                    encoding,
                    src_embeddings,
                    tgt_embeddings,
                    output,
                    # mine_output.__str__(),
                    unify,
                    mode,
                    retrieval,
                    margin,
                    neighborhood,
                    knn_gpu,
                    dim,
                    threshold,
                    verbose)

    else:
        src_embeddings = Embed(src, slang if token_slang else "--")
        tgt_embeddings = Embed(tgt, tlang if token_tlang else "--")
Code example #5
def generate_encoder(encoder_file):
    return SentenceEncoder(encoder_file,
                           max_sentences=None,
                           max_tokens=12000,
                           sort_kind='quicksort',
                           cpu=True)
Code example #6
def get_laser_encoder(encoder_path, max_tokens=12000):
    return SentenceEncoder(encoder_path,
                           max_sentences=None,
                           max_tokens=max_tokens,
                           cpu=False)
Code example #7
from embed import SentenceEncoder
from scipy import spatial
from text_processing import TokenLine, BPEfastApplyLine

encoder = SentenceEncoder("/LASER/models/bilstm.93langs.2018-12-26.pt",
                          cpu=True,
                          max_sentences=None,
                          max_tokens=12000,
                          sort_kind="mergesort")

sentences = [{
    "lang": "en",
    "content": "While the question of hamsters powering homes may seem a bit farcical, it should be noted that at one point humans did specifically breed a certain type of dog for the sole purpose of it just walking along at a steady pace on a giant wheel… (See our article The Curious Tale of Turnspit Dogs.)",
}, {
    "lang": "zh",
    "content": "靠仓鼠产电功能这个问题似乎有些可笑,但却前有古人,人类曾经靠驱使狗狗跑轮子来烧炉子..."
}]


def pre_process(sentence, lang):
    # tokenize
    sentence = TokenLine(sentence,
                         lang=lang,
                         romanize=True if lang == 'el' else False,
                         lower_case=True)
    # bpe
    return BPEfastApplyLine(sentence, bpe_codes="/LASER/models/93langs.fcodes")
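
The extract ends before the comparison step that the scipy.spatial import points to. A minimal continuation, assuming LASER's SentenceEncoder.encode_sentences accepts a list of preprocessed (tokenized and BPE-encoded) lines and returns one embedding vector per line, might look like this:

# Sketch only: preprocess both sentences, embed them, and compare the
# resulting vectors with cosine similarity.
processed = [pre_process(s["content"], s["lang"]) for s in sentences]
embeddings = encoder.encode_sentences(processed)

# Cosine similarity between the English and Chinese embeddings.
similarity = 1 - spatial.distance.cosine(embeddings[0], embeddings[1])
print("cosine similarity:", similarity)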

Code example #8
'''
parser.add_argument(
    '--lang', '-L', nargs='+', default=None,
    help="List of languages to test on")
parser.add_argument('--buffer-size', type=int, default=10000,
    help='Buffer size (sentences)')
parser.add_argument('--max-tokens', type=int, default=12000,
    help='Maximum number of tokens to process in a batch')
parser.add_argument('--max-sentences', type=int, default=None,
    help='Maximum number of sentences to process in a batch')
parser.add_argument('--cpu', action='store_true',
    help='Use CPU instead of GPU')

args = parser.parse_args()

print('LASER: embedding single sentence')
'''

if __name__ == '__main__':
    model_dir = LASER + "/models"
    encoder = model_dir + "/bilstm.93langs.2018-12-26.pt"
    bpe_codes = model_dir + "/93langs.fcodes"
    enc = SentenceEncoder(encoder, cpu=True)
    loaded_bpe = BPEfastLoad('', bpe_codes)

    line = 'Testing to encode line'
    print("Embedding line", line)
    
    embedded = embedLine(line, enc, loaded_bpe)
    print("Finished Embedding")
    print("Embedded line len=", len(embedded))