def __init__(self):
    self.modelpath = os.path.join(LASER, "models/bilstm.93langs.2018-12-26.pt")
    # No need to set batch_size; batching adjusts automatically based on max_tokens
    self.encoder = SentenceEncoder(self.modelpath,
                                   max_sentences=None,
                                   max_tokens=12000,
                                   sort_kind='quicksort',
                                   cpu=False)
    self.batch_size = 64
def loadEncoder(encoderF, buffer_size, max_tokens, max_sentences=None, cpu=False, stable=False):
    buffer_size = max(buffer_size, 1)
    assert not max_sentences or max_sentences <= buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'
    logger.info(' - Encoder: loading {} - cpu={}'.format(encoderF, cpu))
    return SentenceEncoder(encoderF,
                           max_sentences=max_sentences,
                           max_tokens=max_tokens,
                           sort_kind='mergesort' if stable else 'quicksort',
                           cpu=cpu)
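# Usage sketch (the model path is hypothetical; a logger must be configured
# for the logger.info call above to print anything):
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

encoder = loadEncoder("models/bilstm.93langs.2018-12-26.pt",
                      buffer_size=10000,
                      max_tokens=12000,
                      cpu=True,       # CPU is enough for a quick smoke test
                      stable=True)    # mergesort gives a deterministic batch order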
def mine(src, tgt, offset, slang, tlang, token_slang, token_tlang, encoder,
         bpe_codes, buffer_size, max_tokens, max_sentences, enc_cpu, encoding,
         mode, neighborhood, margin, retrieval, unify, knn_gpu, stable, dim,
         threshold, verbose, output):
    buffer_size = max(buffer_size, 1)
    assert not max_sentences or max_sentences <= buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'
    if verbose:
        print(' - Encoder: loading {}'.format(encoder))
    encoder = SentenceEncoder(encoder,
                              max_sentences=max_sentences,
                              max_tokens=max_tokens,
                              sort_kind='mergesort' if stable else 'quicksort',
                              cpu=enc_cpu)
    if offset:
        src_sents = [s.strip() for s in open(src)]
        tgt_sents = [t.strip() for t in open(tgt)]
        # The doc-index starts at 1 for use with the -n option of sed
        doc_offset = [(int(d[0]), d[1], d[2], int(d[3]), d[4], d[5])
                      for d in [line.strip().split() for line in open(offset)]]
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir_path = Path(tmpdir)
            src_tmpdir_path = tmpdir_path / slang
            tgt_tmpdir_path = tmpdir_path / tlang
            src_tmpdir_path.mkdir()
            tgt_tmpdir_path.mkdir()
            for s_ind, s_off, s_len, t_ind, t_off, t_len in doc_offset:
                src_txt = src_tmpdir_path / 'txt'
                tgt_txt = tgt_tmpdir_path / 'txt'
                with open(src_txt, "w") as fw:
                    print("\n".join(src_sents), file=fw)
                with open(tgt_txt, "w") as fw:
                    print("\n".join(tgt_sents), file=fw)
                src_embeddings = Embed(str(src_tmpdir_path), str(src_txt), encoder,
                                       slang if token_slang else "--",
                                       bpe_codes, buffer_size, verbose)
                tgt_embeddings = Embed(str(tgt_tmpdir_path), str(tgt_txt), encoder,
                                       tlang if token_tlang else "--",
                                       bpe_codes, buffer_size, verbose)
                # mine_output = tmpdir_path / "mine"
                Mine(s_ind, t_ind, str(src_txt), str(tgt_txt), encoding,
                     src_embeddings, tgt_embeddings,
                     output,  # mine_output.__str__(),
                     unify, mode, retrieval, margin, neighborhood,
                     knn_gpu, dim, threshold, verbose)
    else:
        src_embeddings = Embed(src, slang if token_slang else "--")
        tgt_embeddings = Embed(tgt, tlang if token_tlang else "--")
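# The offset file consumed above holds six whitespace-separated fields per
# line; only the two document indices are cast to int (values illustrative):
with open("offsets.txt", "w") as fw:
    fw.write("1 0 120 1 0 98\n")   # s_ind s_off s_len t_ind t_off t_len
    fw.write("2 120 80 2 98 77\n")

# Mirrors the parsing inside mine(); offsets and lengths remain strings.
doc_offset = [(int(d[0]), d[1], d[2], int(d[3]), d[4], d[5])
              for d in [line.strip().split() for line in open("offsets.txt")]]
print(doc_offset[0])  # (1, '0', '120', 1, '0', '98')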
def generate_encoder(encoder_file):
    return SentenceEncoder(encoder_file,
                           max_sentences=None,
                           max_tokens=12000,
                           sort_kind='quicksort',
                           cpu=True)
def get_laser_encoder(encoder_path, max_tokens=12000):
    return SentenceEncoder(encoder_path, max_sentences=None, max_tokens=max_tokens, cpu=False)
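# Rough usage, assuming LASER's SentenceEncoder.encode_sentences, which takes
# pre-tokenized, BPE-encoded lines and returns a NumPy array of 1024-dim
# embeddings (the model path and input strings are illustrative):
encoder = get_laser_encoder("models/bilstm.93langs.2018-12-26.pt")
bpe_lines = ["hel@@ lo world .", "good morning ."]  # already tokenized + BPE
embeddings = encoder.encode_sentences(bpe_lines)
print(embeddings.shape)  # (2, 1024) for the 93-language BiLSTM model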
from embed import SentenceEncoder
from scipy import spatial
from text_processing import TokenLine, BPEfastApplyLine

encoder = SentenceEncoder("/LASER/models/bilstm.93langs.2018-12-26.pt",
                          cpu=True,
                          max_sentences=None,
                          max_tokens=12000,
                          sort_kind="mergesort")

sentences = [{
    "lang": "en",
    "content": "While the question of hamsters powering homes may seem a bit farcical, it should be noted that at one point humans did specifically breed a certain type of dog for the sole purpose of it just walking along at a steady pace on a giant wheel… (See our article The Curious Tale of Turnspit Dogs.)",
}, {
    "lang": "zh",
    "content": "靠仓鼠产电功能这个问题似乎有些可笑,但却前有古人,人类曾经靠驱使狗狗跑轮子来烧炉子..."
}]

def pre_process(sentence, lang):
    # tokenize
    sentence = TokenLine(sentence, lang=lang,
                         romanize=True if lang == 'el' else False,
                         lower_case=True)
    # apply BPE
    return BPEfastApplyLine(sentence, bpe_codes="/LASER/models/93langs.fcodes")
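# scipy.spatial is imported above but never used; a plausible continuation,
# assuming SentenceEncoder.encode_sentences, scores the two sentences'
# cross-lingual similarity:
bpe_lines = [pre_process(s["content"], s["lang"]) for s in sentences]
embeddings = encoder.encode_sentences(bpe_lines)

# Cosine similarity = 1 - cosine distance; near 1 for parallel sentences.
similarity = 1 - spatial.distance.cosine(embeddings[0], embeddings[1])
print("cross-lingual similarity:", similarity)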
parser.add_argument('--lang', '-L', nargs='+', default=None,
                    help="List of languages to test on")
parser.add_argument('--buffer-size', type=int, default=10000,
                    help='Buffer size (sentences)')
parser.add_argument('--max-tokens', type=int, default=12000,
                    help='Maximum number of tokens to process in a batch')
parser.add_argument('--max-sentences', type=int, default=None,
                    help='Maximum number of sentences to process in a batch')
parser.add_argument('--cpu', action='store_true',
                    help='Use CPU instead of GPU')
args = parser.parse_args()
print('LASER: embedding single sentence')
'''  # closes a triple-quoted string that disables the CLI-parsing block above

if __name__ == '__main__':
    model_dir = LASER + "/models"
    encoder = model_dir + "/bilstm.93langs.2018-12-26.pt"
    bpe_codes = model_dir + "/93langs.fcodes"
    enc = SentenceEncoder(encoder, cpu=True)
    loaded_bpe = BPEfastLoad('', bpe_codes)
    line = 'Testing to encode line'
    print("Embedding line", line)
    embedded = embedLine(line, enc, loaded_bpe)
    print("Finished Embedding")
    print("Embedded line len=", len(embedded))