Beispiel #1
0
def Embed(tmpdir, ifname, encoder, token_lang, bpe_codes, buffer_size,
          verbose):
    """Run the optional tokenize / BPE stages on a file, then encode it.

    Intermediate results are written inside ``tmpdir`` as ``tok`` and
    ``bpe``; the encoder output is written to ``tmpdir/emb``.

    Args:
        tmpdir: Directory that receives the intermediate and output files.
        ifname: Path of the input file.
        encoder: Encoder object handed to ``EncodeFile``.
        token_lang: Language code given to ``Token``. The value ``'--'``
            skips the tokenization stage; romanization is requested only
            for ``'el'``.
        bpe_codes: Passed to ``BPEfastApply``; a falsy value skips the BPE
            stage.
        buffer_size: Forwarded to ``EncodeFile``.
        verbose: Forwarded to every stage.

    Returns:
        The path ``tmpdir/emb``.
    """
    emb_path = os.path.join(tmpdir, 'emb')
    # Tracks the file that the next stage has to read.
    current = ifname

    if token_lang != '--':
        tokenized = os.path.join(tmpdir, 'tok')
        Token(current, tokenized,
              lang=token_lang,
              romanize=(token_lang == 'el'),
              lower_case=True,
              gzip=False,
              verbose=verbose,
              over_write=False)
        current = tokenized

    if bpe_codes:
        bpe_applied = os.path.join(tmpdir, 'bpe')
        BPEfastApply(current, bpe_applied, bpe_codes,
                     verbose=verbose,
                     over_write=False)
        current = bpe_applied

    EncodeFile(encoder, current, emb_path,
               verbose=verbose,
               over_write=False,
               buffer_size=buffer_size)

    return emb_path
Beispiel #2
0
def extract(encoder,
            token_lang,
            bpe_codes,
            ifname,
            output,
            remove=False,
            verbose=False,
            buffer_size=10000):
    """Tokenize / BPE-encode a file as requested, encode it and load the result.

    The intermediate ``tok`` and ``bpe`` files are created inside a
    ``tempfile.TemporaryDirectory`` that is cleaned up when the function
    exits; only ``output`` is written outside of it.

    Args:
        encoder: Encoder object handed to ``EncodeFile``.
        token_lang: Language code given to ``Token``. The value ``'--'``
            skips the tokenization stage; romanization is requested only
            for ``'el'``.
        bpe_codes: Passed to ``BPEfastApply``; a falsy value skips the BPE
            stage.
        ifname: Path of the input file.
        output: Path passed to ``EncodeFile`` as its output and then to
            ``EmbedLoad``.
        remove: Accepted for interface compatibility; this function never
            reads it.
        verbose: Forwarded to every stage.
        buffer_size: Forwarded to ``EncodeFile``. Defaults to 10000, the
            value that used to be hard-coded here.

    Returns:
        Whatever ``EmbedLoad(output)`` returns.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        if token_lang != '--':
            tok_fname = os.path.join(tmpdir, 'tok')
            Token(ifname,
                  tok_fname,
                  lang=token_lang,
                  romanize=(token_lang == 'el'),
                  lower_case=True,
                  gzip=False,
                  verbose=verbose,
                  over_write=False)
            ifname = tok_fname
        if bpe_codes:
            bpe_fname = os.path.join(tmpdir, 'bpe')
            BPEfastApply(ifname,
                         bpe_fname,
                         bpe_codes,
                         verbose=verbose,
                         over_write=True)
            ifname = bpe_fname
        # NOTE(review): over_write=False is used for the caller-supplied
        # `output`; presumably an already existing file is then loaded
        # as-is instead of being re-encoded -- confirm against EncodeFile.
        EncodeFile(encoder,
                   ifname,
                   output,
                   verbose=verbose,
                   over_write=False,
                   buffer_size=buffer_size)
        return EmbedLoad(output)
Beispiel #3
0
def emb(encoder, inputF, outputF, verbose, buffer_size):
    """Encode ``inputF`` into ``outputF`` by delegating to ``EncodeFile``.

    Args:
        encoder: Encoder object handed to ``EncodeFile``.
        inputF: Path of the input file.
        outputF: Path of the output file.
        verbose: Forwarded to ``EncodeFile``.
        buffer_size: Forwarded to ``EncodeFile``.

    Returns:
        None. ``over_write`` is always passed as ``False``.
    """
    encode_options = dict(verbose=verbose,
                          over_write=False,
                          buffer_size=buffer_size)
    EncodeFile(encoder, inputF, outputF, **encode_options)
Beispiel #4
0
    def launch(self, lang):
        """Process the data file of every language and search across the indexes.

        For each entry of ``lang`` the file ``<base_dir>/<data>.<code>`` is
        passed through ``Token``, ``BPEfastApply`` and ``EncodeFile``
        (writing ``<output>.tok.<code>``, ``<output>.bpe.<code>`` and
        ``<output>.enc.<code>`` under ``base_dir``), and ``IndexCreate`` is
        called on the encoded file with ``'FlatL2'``. The collected results
        are then handed to ``IndexSearchMultiple``.

        Args:
            lang: Iterable of language codes; it is also stored on
                ``self.args.lang``.

        Returns:
            The ``(distances, indexes, cosine)`` triple returned by
            ``IndexSearchMultiple``; the three values are printed as well.

        NOTE(review): ``all_texts`` is not defined inside this method; it
        has to be resolvable from an enclosing/module scope, otherwise the
        search call raises ``NameError`` -- confirm.
        """
        self.args.lang = lang
        args = self.args

        def stage_path(stage, code):
            # <base_dir>/<output>.<stage>.<code>
            return os.path.join(args.base_dir,
                                args.output + '.' + stage + '.' + code)

        all_data, all_index = [], []
        for code in self.args.lang:
            raw_path = os.path.join(args.base_dir, args.data + '.' + code)
            tok_path = stage_path('tok', code)
            bpe_path = stage_path('bpe', code)
            enc_path = stage_path('enc', code)

            Token(raw_path, tok_path,
                  lang=code,
                  romanize=(code == 'el'),
                  lower_case=True,
                  verbose=args.verbose,
                  over_write=False)
            BPEfastApply(tok_path, bpe_path, args.bpe_codes,
                         verbose=args.verbose,
                         over_write=False)
            EncodeFile(self.enc, bpe_path, enc_path,
                       verbose=args.verbose,
                       over_write=False)
            data, index = IndexCreate(enc_path, 'FlatL2',
                                      verbose=args.verbose,
                                      save_index=False)
            all_data.append(data)
            all_index.append(index)

        search_result = IndexSearchMultiple(all_data,
                                            all_index,
                                            texts=all_texts,
                                            verbose=True,
                                            print_errors=False)
        distances, indexes, cosine = search_result

        for label, value in (('D', distances),
                             ('I', indexes),
                             ('cosine', cosine)):
            print(label, value)

        return distances, indexes, cosine
Beispiel #5
0
# For every language: tokenize, apply BPE, encode, and create an index over
# the encoded file (IndexCreate is called with 'FlatL2'). The per-language
# results are then searched together and reported as a confusion matrix.
all_data = []
all_index = []
for l in args.lang:
    # All stage files live in args.base_dir and carry the language suffix.
    raw_path = os.path.join(args.base_dir, args.data + '.' + l)
    tok_path = os.path.join(args.base_dir, args.output + '.tok.' + l)
    bpe_path = os.path.join(args.base_dir, args.output + '.bpe.' + l)
    enc_path = os.path.join(args.base_dir, args.output + '.enc.' + l)

    Token(raw_path, tok_path,
          lang=l,
          romanize=(l == 'el'),
          lower_case=True,
          verbose=args.verbose,
          over_write=False)
    BPEfastApply(tok_path, bpe_path, args.bpe_codes,
                 verbose=args.verbose,
                 over_write=False)
    EncodeFile(enc, bpe_path, enc_path,
               verbose=args.verbose,
               over_write=False)
    d, idx = IndexCreate(enc_path, 'FlatL2',
                         verbose=args.verbose,
                         save_index=False)
    all_data.append(d)
    all_index.append(idx)

err = IndexSearchMultiple(all_data, all_index, verbose=False)
IndexPrintConfusionMatrix(err, args.lang)
Beispiel #6
0
enc = EncodeLoad(args)

print('\nProcessing:')
# Every split is processed for each language in args.lang. Per file the
# stages are: tokenize -> split lines -> BPE -> encode -> join embeddings.
for part in ('train1000', 'dev', 'test'):
    for lang in args.lang:
        cfname = os.path.join(args.data_dir, 'mldoc.' + part)
        txt_file = cfname + '.txt.' + lang
        tok_file = cfname + '.tok.' + lang
        split_file = cfname + '.split.' + lang
        sid_file = cfname + '.sid.' + lang
        split_bpe_file = cfname + '.split.bpe.' + lang
        split_enc_file = cfname + '.split.enc.' + lang
        enc_file = cfname + '.enc.' + lang

        Token(txt_file, tok_file,
              lang=lang,
              romanize=(lang == 'el'),
              lower_case=True,
              gzip=False,
              verbose=args.verbose,
              over_write=False)
        # SplitLines writes the '.split.' and '.sid.' files that the later
        # stages read back in.
        SplitLines(tok_file, split_file, sid_file)
        BPEfastApply(split_file, split_bpe_file, args.bpe_codes,
                     verbose=args.verbose,
                     over_write=False)
        EncodeFile(enc, split_bpe_file, split_enc_file,
                   verbose=args.verbose,
                   over_write=False,
                   buffer_size=args.buffer_size)
        JoinEmbed(split_enc_file, sid_file, enc_file)