Exemple #1
0
def extract(encoder,
            token_lang,
            bpe_codes,
            ifname,
            output,
            remove=False,
            verbose=False):
    """Tokenize, BPE-encode and embed the sentences in *ifname*.

    Intermediate files live in a throw-away temporary directory; the final
    embeddings are written to *output* and returned via EmbedLoad.
    """
    with tempfile.TemporaryDirectory() as workdir:
        current = ifname
        # Optional tokenization step ('--' disables it).
        if token_lang != '--':
            tokenized = os.path.join(workdir, 'tok')
            Token(current,
                  tokenized,
                  lang=token_lang,
                  romanize=(token_lang == 'el'),
                  lower_case=True,
                  gzip=False,
                  verbose=verbose,
                  over_write=False)
            current = tokenized
        # Optional BPE step.
        if bpe_codes:
            bpe_out = os.path.join(workdir, 'bpe')
            BPEfastApply(current,
                         bpe_out,
                         bpe_codes,
                         verbose=verbose,
                         over_write=True)
            current = bpe_out
        EncodeFile(encoder,
                   current,
                   output,
                   verbose=verbose,
                   over_write=False,
                   buffer_size=10000)
        return EmbedLoad(output)
Exemple #2
0
def Embed(tmpdir, ifname, encoder, token_lang, bpe_codes, buffer_size,
          verbose):
    """Run the tokenize -> BPE -> encode pipeline inside *tmpdir*.

    Returns the path of the raw embedding file ('emb' in *tmpdir*).
    """
    current = ifname

    # Optional tokenization step ('--' skips it).
    if token_lang != '--':
        tok_path = os.path.join(tmpdir, 'tok')
        Token(current,
              tok_path,
              lang=token_lang,
              romanize=(token_lang == 'el'),
              lower_case=True,
              gzip=False,
              verbose=verbose,
              over_write=False)
        current = tok_path

    # Optional BPE step.
    if bpe_codes:
        bpe_path = os.path.join(tmpdir, 'bpe')
        BPEfastApply(current,
                     bpe_path,
                     bpe_codes,
                     verbose=verbose,
                     over_write=False)
        current = bpe_path

    emb_path = os.path.join(tmpdir, 'emb')
    EncodeFile(encoder,
               current,
               emb_path,
               verbose=verbose,
               over_write=False,
               buffer_size=buffer_size)

    return emb_path
Exemple #3
0
    def _bpe(self,
             src_input="../tmp/src.tok",
             tgt_input="../tmp/tgt.tok",
             src_tmp="../tmp/src.bpe",
             tgt_tmp="../tmp/tgt.bpe"):
        """Apply this model's BPE codes to the tokenized source and target files."""
        # Identical operation on both sides; iterate over the (in, out) pairs
        # in source-then-target order, matching the original call sequence.
        for in_path, out_path in ((src_input, src_tmp), (tgt_input, tgt_tmp)):
            BPEfastApply(in_path,
                         out_path,
                         self.bpe_codes,
                         verbose=True,
                         over_write=False)
Exemple #4
0
 def encode_file(input_filepath, output_filepath, language, bpe_codes_path):
     """Tokenize *input_filepath* and write its BPE encoding to *output_filepath*."""
     tok_path = get_temp_filepath()
     # Greek ('el') is romanized before tokenization; other languages are not.
     Token(str(input_filepath),
           str(tok_path),
           lang=language,
           romanize=(language == 'el'))
     BPEfastApply(str(tok_path),
                  str(output_filepath),
                  str(bpe_codes_path))
     # Drop the intermediate tokenized file.
     tok_path.unlink()
Exemple #5
0
    def launch(self, lang):
        """Run a tokenize -> BPE -> encode -> index pipeline for every
        language in *lang*, then perform an all-pairs similarity search
        across the resulting indexes.

        Intermediate .tok/.bpe/.enc files are written next to
        self.args.output; returns (distances, indexes, cosine).
        """
        self.args.lang = lang

        all_data = []   # embedding matrix per language
        all_index = []  # FAISS index per language
        for l in self.args.lang:
            # Tokenize <base_dir>/<data>.<l> into <output>.tok.<l>.
            Token(os.path.join(self.args.base_dir, self.args.data + '.' + l),
                  os.path.join(self.args.base_dir,
                               self.args.output + '.tok.' + l),
                  lang=l,
                  # Greek is the only language romanized here.
                  romanize=True if l == 'el' else False,
                  lower_case=True,
                  verbose=self.args.verbose,
                  over_write=False)
            # Apply the BPE codes: .tok.<l> -> .bpe.<l>.
            BPEfastApply(os.path.join(self.args.base_dir,
                                      self.args.output + '.tok.' + l),
                         os.path.join(self.args.base_dir,
                                      self.args.output + '.bpe.' + l),
                         self.args.bpe_codes,
                         verbose=self.args.verbose,
                         over_write=False)
            # Embed the BPE file with the loaded encoder: .bpe.<l> -> .enc.<l>.
            EncodeFile(self.enc,
                       os.path.join(self.args.base_dir,
                                    self.args.output + '.bpe.' + l),
                       os.path.join(self.args.base_dir,
                                    self.args.output + '.enc.' + l),
                       verbose=self.args.verbose,
                       over_write=False)
            # Exact L2 (FlatL2) index over the embeddings, kept in memory only.
            d, idx = IndexCreate(os.path.join(self.args.base_dir,
                                              self.args.output + '.enc.' + l),
                                 'FlatL2',
                                 verbose=self.args.verbose,
                                 save_index=False)
            all_data.append(d)
            all_index.append(idx)

        # NOTE(review): `all_texts` is not defined anywhere in this method;
        # unless it exists as a module-level global, this call raises
        # NameError — confirm where it is supposed to come from.
        distances, indexes, cosine = IndexSearchMultiple(all_data,
                                                         all_index,
                                                         texts=all_texts,
                                                         verbose=True,
                                                         print_errors=False)

        print('D', distances)
        print('I', indexes)
        print('cosine', cosine)

        return distances, indexes, cosine
Exemple #6
0
    # Tail of an out-of-view `if`: the output directory does not exist yet,
    # so report and create it.
    print(' - creating directory {}'.format(out_dir))
    os.mkdir(out_dir)

# Per-language pipeline: tokenize -> BPE -> encode -> FAISS index.
all_data = []   # embedding matrix per language
all_index = []  # FAISS index per language
for l in args.lang:
    # Tokenize <base_dir>/<data>.<l> into <output>.tok.<l>.
    Token(os.path.join(args.base_dir, args.data + '.' + l),
          os.path.join(args.base_dir, args.output + '.tok.' + l),
          lang=l,
          # Greek is the only language romanized here.
          romanize=True if l == 'el' else False,
          lower_case=True,
          verbose=args.verbose,
          over_write=False)
    # Apply the BPE codes: .tok.<l> -> .bpe.<l>.
    BPEfastApply(os.path.join(args.base_dir, args.output + '.tok.' + l),
                 os.path.join(args.base_dir, args.output + '.bpe.' + l),
                 args.bpe_codes,
                 verbose=args.verbose,
                 over_write=False)
    # Embed the BPE file with the sentence encoder: .bpe.<l> -> .enc.<l>.
    EncodeFile(enc,
               os.path.join(args.base_dir, args.output + '.bpe.' + l),
               os.path.join(args.base_dir, args.output + '.enc.' + l),
               verbose=args.verbose,
               over_write=False)
    # Exact L2 (FlatL2) FAISS index over the embeddings, not saved to disk.
    d, idx = IndexCreate(os.path.join(args.base_dir,
                                      args.output + '.enc.' + l),
                         'FlatL2',
                         verbose=args.verbose,
                         save_index=False)
    all_data.append(d)
    all_index.append(idx)
Exemple #7
0
    # Optional tokenization of args.input ('--' disables it).
    # NOTE(review): within this visible fragment `ifile` is only assigned
    # inside this branch; if token_lang == '--' and no earlier assignment
    # exists out of view, the uses below fail — confirm against the full file.
    if args.token_lang != '--':
        ifile = os.path.join(tmpdir, 'tok')
        Token(args.input,
              ifile,
              lang=args.token_lang,
              romanize=True if args.token_lang == 'el' else False,
              lower_case=True,
              gzip=False,
              verbose=args.verbose,
              over_write=False)

    # Optional BPE step; afterwards `ifile` points at the BPE output.
    if args.bpe_codes:
        bpe_file = os.path.join(tmpdir, 'bpe')
        BPEfastApply(ifile,
                     bpe_file,
                     args.bpe_codes,
                     verbose=args.verbose,
                     over_write=False)
        ifile = bpe_file

    print(' - processing (batch size is {:d})'.format(args.buffer_size))
    ifp = open(ifile, 'r')
    ofp = open(args.output, 'w')
    # NOTE(review): `stats` is a namedtuple *class* being used as a mutable
    # attribute bag, and the declared fields ('ns', 'np') do not match the
    # attributes actually set ('nbs', 'nbp'); a plain object or dataclass
    # would be clearer — left unchanged in this fragment.
    stats = namedtuple('stats', 'ns np')
    stats.nbs = 0
    stats.nbp = 0
    t = time.time()
    # Encode each buffered batch and L2-normalize the embeddings in place.
    for sentences in buffered_read(ifp, args.buffer_size):
        embed = params.enc.encode_sentences(sentences)
        faiss.normalize_L2(embed)
        # call function for selected margin method
Exemple #8
0
# Load the sentence encoder configured by the command-line args.
enc = EncodeLoad(args)

print('\nProcessing:')
# For every MLDoc split and language: tokenize, split documents into lines,
# BPE-encode, embed, then re-join per-line embeddings into per-document ones.
for part in ('train1000', 'dev', 'test'):
    # for lang in "en" if part == 'train1000' else args.lang:
    for lang in args.lang:
        cfname = os.path.join(args.data_dir, 'mldoc.' + part)
        # Tokenize mldoc.<part>.txt.<lang> -> .tok.<lang>
        # (Greek is the only language romanized here).
        Token(cfname + '.txt.' + lang,
              cfname + '.tok.' + lang,
              lang=lang,
              romanize=(True if lang == 'el' else False),
              lower_case=True,
              gzip=False,
              verbose=args.verbose,
              over_write=False)
        # Split documents into single lines, recording sentence ids (.sid).
        SplitLines(cfname + '.tok.' + lang, cfname + '.split.' + lang,
                   cfname + '.sid.' + lang)
        # Apply the BPE codes to the split file.
        BPEfastApply(cfname + '.split.' + lang,
                     cfname + '.split.bpe.' + lang,
                     args.bpe_codes,
                     verbose=args.verbose,
                     over_write=False)
        # Embed the BPE file with the loaded encoder.
        EncodeFile(enc,
                   cfname + '.split.bpe.' + lang,
                   cfname + '.split.enc.' + lang,
                   verbose=args.verbose,
                   over_write=False,
                   buffer_size=args.buffer_size)
        # Re-assemble per-document embeddings using the saved sentence ids.
        JoinEmbed(cfname + '.split.enc.' + lang, cfname + '.sid.' + lang,
                  cfname + '.enc.' + lang)
    def _vectorize(self, docs):
        """Encode *docs* with the LASER model and return the embedding matrix.

        Arguments:
            docs: the documents to encode, an iterable of strings
                  (written one per line to a temp file via np.savetxt).

        Returns:
            numpy.ndarray of shape (len(docs), 1024), dtype float32.
        """
        # Fall back to English when no language was configured.
        if self.lang is None or not self.lang:
            lang = "en"
            print("Warning: using default language English")
        else:
            lang = self.lang

        # Locate the pretrained encoder and BPE codes under $LASER/models.
        # os.path.join is robust to a missing or extra trailing slash in
        # $LASER (the previous plain concatenation with "models" was not).
        model_dir = os.path.join(os.environ.get('LASER'), "models")
        encoder_path = os.path.join(model_dir, "bilstm.93langs.2018-12-26.pt")
        bpe_codes_path = os.path.join(model_dir, "93langs.fcodes")
        print(f' - Encoder: loading {encoder_path}')
        encoder = SentenceEncoder(encoder_path,
                                  max_sentences=None,
                                  max_tokens=12000,
                                  sort_kind='mergesort',
                                  cpu=True)
        with tempfile.TemporaryDirectory() as tmp:
            tmpdir = Path(tmp)

            bpe_fname = tmpdir / 'bpe'
            bpe_oname = tmpdir / 'out.raw'

            # Dump the documents to a plain-text file, one per line.
            temp_infile = tmpdir / 'temp_in_docs.txt'
            np.savetxt(temp_infile, docs, fmt="%s")

            # Bug fix: `ifname` used to be assigned only inside the
            # tokenization branch, so lang == '--' raised UnboundLocalError.
            # Default to the raw input file and let tokenization override it.
            ifname = temp_infile
            if lang != '--':
                tok_fname = tmpdir / "tok"
                Token(str(temp_infile),
                      str(tok_fname),
                      lang=lang,
                      romanize=True if lang == 'el' else False,
                      lower_case=True,
                      gzip=False,
                      verbose=True,
                      over_write=False)
                ifname = tok_fname

            # Apply BPE, then embed the BPE file to a raw float32 buffer.
            BPEfastApply(str(ifname),
                         str(bpe_fname),
                         str(bpe_codes_path),
                         verbose=True,
                         over_write=False)
            EncodeFile(encoder,
                       str(bpe_fname),
                       str(bpe_oname),
                       verbose=True,
                       over_write=False,
                       buffer_size=10000)
            # The raw output is a flat float32 stream; reshape to
            # (n_docs, dim) with the fixed LASER embedding size of 1024.
            dim = 1024
            X = np.fromfile(str(bpe_oname), dtype=np.float32, count=-1)
            X.resize(X.shape[0] // dim, dim)
            return X
Exemple #10
0
def bpe(bpecodes, inputF, outputF, verbose):
    """Thin wrapper: apply BPE *bpecodes* to *inputF*, writing *outputF*.

    Never overwrites an existing output file (over_write=False).
    """
    BPEfastApply(inputF, outputF, bpecodes, verbose=verbose, over_write=False)
Exemple #11
0
# Demo: tokenize + BPE a test file, then exercise LASEREmbedderIV once.
LANGUAGE_CODE = 'en'
VERBOSE = True

input_file = 'data/test_sentences.txt'
tokenized_f = 'data/test_tokenized.txt'
bpe_f = 'data/test_bpe.txt'
# tokenize
Token(
    input_file,
    tokenized_f,
    lang=LANGUAGE_CODE,
    romanize=False,  #kept static for simplicity
    lower_case=True,
    gzip=False,
    verbose=VERBOSE,
    over_write=False)

# BPE
# NOTE(review): BPE_CODES and LASER are not defined in this fragment —
# presumably module-level constants set elsewhere in the file; confirm.
BPEfastApply(tokenized_f, bpe_f, BPE_CODES, verbose=VERBOSE, over_write=False)

############################################################
# Load + infer model
############################################################
model_path = LASER + '/models/bilstm.93langs.2018-12-26.pt'
model = LASEREmbedderIV(model_path, LASERHiddenExtractor, 300, 100, 10)
# The checkpoint also carries the BPE vocabulary mapping tokens to ids.
bpe_to_idx = torch.load(model_path)['dictionary']

# Dummy token-id batch (4 sequences of length 3) just to run a forward pass.
tokens = torch.LongTensor([[1, 2, 3], [4, 5, 6], [6, 7, 8], [7, 8, 9]])
embeddings = model(tokens)
print(embeddings.size())