def __init__(self):
    """Load the LASER BiLSTM encoder checkpoint on GPU."""
    self.modelpath = os.path.join(LASER, "models/bilstm.93langs.2018-12-26.pt")
    # No batch_size needed by the encoder itself: it sizes batches from max_tokens.
    self.encoder = SentenceEncoder(
        self.modelpath,
        max_sentences=None,
        max_tokens=12000,
        sort_kind='quicksort',
        cpu=False,
    )
    self.batch_size = 64
def loadEncoder(encoderF, buffer_size, max_tokens, max_sentences=None, cpu=False, stable=False):
    """Load a LASER sentence encoder from *encoderF*.

    :param encoderF: path to the serialized encoder checkpoint
    :param buffer_size: sentence buffer size (clamped to at least 1)
    :param max_tokens: maximum tokens per encoding batch
    :param max_sentences: optional cap on sentences per batch
    :param cpu: run the encoder on CPU instead of GPU
    :param stable: use a stable sort (mergesort) when ordering sentences
    :return: a configured SentenceEncoder
    """
    buffer_size = max(buffer_size, 1)
    assert not max_sentences or max_sentences <= buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'
    logger.info(' - Encoder: loading {} - cpu={}'.format(encoderF, cpu))
    sort_kind = 'mergesort' if stable else 'quicksort'
    return SentenceEncoder(encoderF, max_sentences=max_sentences,
                           max_tokens=max_tokens, sort_kind=sort_kind, cpu=cpu)
def loadEncoder(encoderF, buffer_size, max_tokens, max_sentences=None, cpu=False, stable=False):
    """Instantiate the SentenceEncoder serialized at *encoderF*.

    Clamps *buffer_size* to at least one sentence and validates that
    *max_sentences*, if given, does not exceed the buffer size.
    """
    if buffer_size < 1:
        buffer_size = 1
    assert not max_sentences or max_sentences <= buffer_size, (
        "--max-sentences/--batch-size cannot be larger than --buffer-size"
    )
    logger.info(" - Encoder: loading {} - cpu={}".format(encoderF, cpu))
    kind = "mergesort" if stable else "quicksort"
    return SentenceEncoder(
        encoderF,
        max_sentences=max_sentences,
        max_tokens=max_tokens,
        sort_kind=kind,
        cpu=cpu,
    )
def mine(src, tgt, offset, slang, tlang, token_slang, token_tlang, encoder,
         bpe_codes, buffer_size, max_tokens, max_sentences, enc_cpu, encoding,
         mode, neighborhood, margin, retrieval, unify, knn_gpu, stable, dim,
         threshold, verbose, output):
    """Mine parallel sentence pairs between *src* and *tgt* with LASER.

    When *offset* is given, the corpora are embedded and mined per document
    using the index records in the offset file; otherwise the whole files
    are embedded directly.

    :param src: path to the source-language corpus
    :param tgt: path to the target-language corpus
    :param offset: optional path to a whitespace-separated offset/index file
    :param encoder: path to the encoder checkpoint (rebound to the loaded
        SentenceEncoder below)
    :param output: destination for the mined pairs
    """
    buffer_size = max(buffer_size, 1)
    assert not max_sentences or max_sentences <= buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'
    if verbose:
        print(' - Encoder: loading {}'.format(encoder))
    encoder = SentenceEncoder(encoder, max_sentences=max_sentences,
                              max_tokens=max_tokens,
                              sort_kind='mergesort' if stable else 'quicksort',
                              cpu=enc_cpu)
    if offset:
        # Use context managers so the corpus files are closed deterministically
        # (the original left all three handles open).
        with open(src) as f:
            src_sents = [s.strip() for s in f]
        with open(tgt) as f:
            tgt_sents = [t.strip() for t in f]
        # The doc-index starts at 1 for use with the -n option of sed
        with open(offset) as f:
            doc_offset = [(int(d[0]), d[1], d[2], int(d[3]), d[4], d[5])
                          for d in (line.strip().split() for line in f)]
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir_path = Path(tmpdir)
            src_tmpdir_path = tmpdir_path / slang
            tgt_tmpdir_path = tmpdir_path / tlang
            src_tmpdir_path.mkdir()
            tgt_tmpdir_path.mkdir()
            for s_ind, s_off, s_len, t_ind, t_off, t_len in doc_offset:
                # NOTE(review): s_off/s_len/t_off/t_len are unused and the full
                # corpora are rewritten on every iteration — confirm whether
                # per-document slicing was intended here.
                src_txt = src_tmpdir_path / 'txt'
                tgt_txt = tgt_tmpdir_path / 'txt'
                with open(src_txt, "w") as fw:
                    print("\n".join(src_sents), file=fw)
                with open(tgt_txt, "w") as fw:
                    print("\n".join(tgt_sents), file=fw)
                src_embeddings = Embed(str(src_tmpdir_path), str(src_txt),
                                       encoder, slang if token_slang else "--",
                                       bpe_codes, buffer_size, verbose)
                tgt_embeddings = Embed(str(tgt_tmpdir_path), str(tgt_txt),
                                       encoder, tlang if token_tlang else "--",
                                       bpe_codes, buffer_size, verbose)
                Mine(s_ind, t_ind, str(src_txt), str(tgt_txt), encoding,
                     src_embeddings, tgt_embeddings, output, unify, mode,
                     retrieval, margin, neighborhood, knn_gpu, dim, threshold,
                     verbose)
    else:
        src_embeddings = Embed(src, slang if token_slang else "--")
        # BUG FIX: this assignment previously rebound src_embeddings, silently
        # discarding the source embeddings; the target embeddings belong in
        # tgt_embeddings. NOTE(review): this branch still computes embeddings
        # without mining them — it looks truncated; verify against the caller.
        tgt_embeddings = Embed(tgt, tlang if token_tlang else "--")
def generate_encoder(encoder_file):
    """Build a CPU-only SentenceEncoder whose batches are bounded by 12000 tokens."""
    enc = SentenceEncoder(
        encoder_file,
        cpu=True,
        max_tokens=12000,
        max_sentences=None,
        sort_kind='quicksort',
    )
    return enc
class LaserHelper:
    """Score sentence pairs with LASER multilingual sentence embeddings.

    Loads the bilstm.93langs model once, then scores parallel sentence
    files batch by batch with either cosine similarity or L2 distance.
    """

    def __init__(self):
        self.modelpath = os.path.join(LASER, "models/bilstm.93langs.2018-12-26.pt")
        # The encoder sizes its own batches from max_tokens; no batch_size needed there.
        self.encoder = SentenceEncoder(self.modelpath, max_sentences=None,
                                       max_tokens=12000, sort_kind='quicksort',
                                       cpu=False)
        self.batch_size = 64  # batch size for the file-scoring loop below

    def _cosine(self, vectors_1, vectors_2):
        """Row-wise cosine similarity between two 2-D numpy arrays.

        :param vectors_1: 2-D numpy array, one vector per row
        :param vectors_2: 2-D numpy array of the same shape
        :return: 1-D numpy array of per-row cosine similarities
        """
        numerator = np.sum(vectors_1 * vectors_2, axis=1)
        denominator = np.linalg.norm(vectors_1, axis=1) * np.linalg.norm(vectors_2, axis=1)
        return numerator / denominator

    def _getSensvec(self, sens):
        """Encode a list of sentences into one embedding vector per sentence."""
        return self.encoder.encode_sentences(sens)

    def _calCosine(self, sens_1, sens_2):
        """Cosine similarity between corresponding sentences of two lists."""
        senvecs_1 = self._getSensvec(sens_1)
        senvecs_2 = self._getSensvec(sens_2)
        return self._cosine(senvecs_1, senvecs_2)

    def _calL2(self, sens_1, sens_2):
        """Euclidean (L2) distance between corresponding sentence embeddings."""
        senvecs_1 = self._getSensvec(sens_1)
        senvecs_2 = self._getSensvec(sens_2)
        return np.sqrt(np.sum(np.square(senvecs_1 - senvecs_2), axis=1))

    def _writeScores(self, fw, sens_1, sens_2):
        """Score one batch with L2 distance and write one delimited line per pair."""
        scores = self._calL2(sens_1, sens_2)
        for s1, s2, s in zip(sens_1, sens_2, scores):
            # Undo BPE ("@@" joiners) and drop spaces to recover raw text.
            s1 = s1.replace("@@", "").replace(" ", "")
            s2 = s2.replace("@@", "").replace(" ", "")
            fw.write("---xhm---".join([s1, s2, str(s)]) + "\n")

    def calL2andSave(self, src_tmp="../tmp/src.bpe", tgt_tmp="../tmp/tgt.bpe",
                     outputfile="../src_tgt_scores_laser.txt"):
        """Score two parallel BPE files line-by-line and write L2 scores.

        Each output line is ``sentence1---xhm---sentence2---xhm---score``.
        Uses context managers so all three files are closed even on error
        (the original left them open on exceptions).
        """
        with open(src_tmp, mode="r", encoding="utf-8") as fr_src, \
             open(tgt_tmp, mode="r", encoding="utf-8") as fr_tgt, \
             open(outputfile, mode="w", encoding="utf-8") as fw:
            sens_1, sens_2 = [], []
            for src, tgt in zip(fr_src, fr_tgt):
                src = src.strip()
                tgt = tgt.strip()
                assert src != ""
                assert tgt != ""
                # Flush a full batch before adding the next pair.
                if len(sens_1) == self.batch_size:
                    self._writeScores(fw, sens_1, sens_2)
                    sens_1, sens_2 = [], []
                sens_1.append(src)
                sens_2.append(tgt)
            if sens_1:  # remainder smaller than one batch
                self._writeScores(fw, sens_1, sens_2)

    def extractParaphrasePair(self, inputfile="../src_tgt_scores_laser.txt",
                              outputfile="../src_tgt_", thresold=0.3):
        """Keep pairs whose L2 score is below the threshold.

        The (misspelled) parameter name ``thresold`` is kept for caller
        compatibility.
        """
        outputfile = outputfile + str(thresold) + ".txt"
        with open(inputfile, mode="r", encoding="utf-8") as fr, \
             open(outputfile, mode="w", encoding="utf-8") as fw:
            for line in fr:
                line = line.strip()
                if line != "":
                    sen1, sen2, score = line.split("---xhm---")
                    if float(score) < thresold:
                        # BUG FIX: was "---xhm--" (one dash short), inconsistent
                        # with the "---xhm---" delimiter used everywhere else.
                        fw.write("---xhm---".join([sen1, sen2]) + "\n")
def get_laser_encoder(encoder_path, max_tokens=12000):
    """Load a LASER SentenceEncoder on GPU; batching is driven by *max_tokens*."""
    return SentenceEncoder(
        encoder_path,
        max_tokens=max_tokens,
        max_sentences=None,
        cpu=False,
    )
from embed import SentenceEncoder
from scipy import spatial
from text_processing import TokenLine, BPEfastApplyLine

# LASER multilingual encoder in CPU mode; batches are bounded by max_tokens.
encoder = SentenceEncoder(
    "/LASER/models/bilstm.93langs.2018-12-26.pt",
    cpu=True,
    max_sentences=None,
    max_tokens=12000,
    sort_kind="mergesort",
)

# Sample sentences in two languages for a cross-lingual similarity check.
sentences = [
    {
        "lang": "en",
        "content": "While the question of hamsters powering homes may seem a bit farcical, it should be noted that at one point humans did specifically breed a certain type of dog for the sole purpose of it just walking along at a steady pace on a giant wheel… (See our article The Curious Tale of Turnspit Dogs.)",
    },
    {
        "lang": "zh",
        "content": "靠仓鼠产电功能这个问题似乎有些可笑,但却前有古人,人类曾经靠驱使狗狗跑轮子来烧炉子..."
    },
]


def pre_process(sentence, lang):
    """Tokenize then BPE-encode one sentence so the LASER encoder can consume it.

    Greek ('el') input is additionally romanized before tokenization.
    """
    tokenized = TokenLine(sentence, lang=lang, romanize=(lang == 'el'), lower_case=True)
    return BPEfastApplyLine(tokenized, bpe_codes="/LASER/models/93langs.fcodes")
parser.add_argument( '--lang', '-L', nargs='+', default=None, help="List of languages to test on") parser.add_argument('--buffer-size', type=int, default=10000, help='Buffer size (sentences)') parser.add_argument('--max-tokens', type=int, default=12000, help='Maximum number of tokens to process in a batch') parser.add_argument('--max-sentences', type=int, default=None, help='Maximum number of sentences to process in a batch') parser.add_argument('--cpu', action='store_true', help='Use CPU instead of GPU') args = parser.parse_args() print('LASER: embedding single sentence') ''' if __name__ == '__main__': model_dir = LASER + "/models" encoder = model_dir + "/bilstm.93langs.2018-12-26.pt" bpe_codes = model_dir + "/93langs.fcodes" enc = SentenceEncoder(encoder, cpu=True) loaded_bpe=BPEfastLoad('',bpe_codes) line = 'Testing to encode line' print("Embedding line", line) embedded = embedLine(line,enc,loaded_bpe) print("Finished Embedding") print("Embedded line len=", len(embedded))