def main(args):
    if os.path.exists(args.output_file):
        raise FileExistsError('File exists: {}'.format(args.output_file))
    csv_reader = pd.read_csv(
        args.input_file,
        chunksize=args.batch_size,
        usecols=[
            'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CATEGORY', 'DESCRIPTION',
            'ISERROR', 'TEXT'
        ],
        dtype={
            'SUBJECT_ID': 'int32',  # string dtype spec avoids needing a bare numpy int32
            'HADM_ID': 'str',
            'CATEGORY': 'str',
            'DESCRIPTION': 'str',
            'ISERROR': 'str',
            'TEXT': 'str'
        },
        keep_default_na=False,
        na_values='')
    with jsonlines.open(args.output_file, 'w') as notes_tokenized_file:
        tokenizer = Tokenizer(args.batch_size, args.n_cpus, args.n_threads, MODE)
        for i, notes_batch in enumerate(csv_reader):
            process_batch(notes_batch, i, tokenizer, notes_tokenized_file)
def load(cls, path, fields, tokenizer_lang, tokenizer_dir, use_gpu,
         verbose=True, max_sent_length=math.inf):
    tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir,
                          use_gpu=use_gpu, verbose=verbose)
    sentences = []
    fields = [field if field is not None else Field(str(i))
              for i, field in enumerate(fields)]
    with open(path, 'r') as f:
        lines = []
        for line in tokenizer.format(tokenizer.predict(f.read())):
            line = line.strip()
            if not line:
                if len(lines) > max_sent_length:
                    logger.info(
                        'Discarded sentence longer than max_sent_length: %d',
                        len(lines))
                    lines = []
                    continue
                sentences.append(Sentence(fields, lines))
                lines = []
            else:
                if not line.startswith('#'):
                    # append fake columns so every row has the full CoNLL field count
                    line = '{}\t{}'.format(
                        line,
                        '\t'.join('_' for _ in range(len(CoNLL._fields) - len(line.split('\t')))))
                    assert len(CoNLL._fields) == len(line.split('\t')), \
                        '{} - {} vs {}'.format(line, len(CoNLL._fields), len(line.split('\t')))
                lines.append(line)
    return cls(fields, sentences)
def __init__(self, NLG_param_dir, NLG_model_fname, tokenizer,
             NLU_param_dir=None, NLU_model_fname=None):
    self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
    self.tokenizer_mode = tokenizer
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    saved_data = torch.load(NLG_param_dir.rstrip('/') + '/' + NLG_model_fname)
    self.model_NLG = saved_data['model']
    with open(NLG_param_dir.rstrip('/') + '/dictionary.json', 'r',
              encoding='utf-8') as f:
        self.dictionary = json.load(f)
    # beam-search settings
    self.n_beam = 5
    # NLU
    if (NLU_param_dir is not None) and (NLU_model_fname is not None):
        self.NLU = NLU(NLU_param_dir, NLU_model_fname, tokenizer)
    else:
        self.NLU = None
def initialize_tokenizer(vocabulary_path, is_bpe=True):
    """Initialize vocabulary from file.

    We assume the vocabulary is stored one-item-per-line, so a file:
      dog
      cat
    will result in a vocabulary {"dog": 0, "cat": 1}, and the reversed
    vocabulary is ["dog", "cat"].

    Args:
      vocabulary_path: path to the file containing the vocabulary.
      is_bpe: if True, return the module-level BPE tokenizer instead of
        building a word-level tokenizer from the vocabulary file.

    Returns:
      a Tokenizer built from the vocabulary file (or the shared BPE tokenizer
      when is_bpe is True).

    Raises:
      ValueError: if the provided vocabulary_path does not exist.
    """
    if is_bpe:
        return bpe_tokenizer
    if os.path.isfile(vocabulary_path):
        rev_vocab = []
        with open(vocabulary_path, mode="r") as f:
            rev_vocab.extend(f.readlines())
        rev_vocab = [line.strip() for line in rev_vocab]
        return Tokenizer(_UNK, vocab_list=rev_vocab)
    else:
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)
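# Hedged usage sketch for initialize_tokenizer above: the vocabulary path is
# hypothetical, and the module-level `bpe_tokenizer` and `_UNK` objects are
# assumed to be defined as in the vocabulary-symbols snippet later in this
# section; this is illustration only, not part of the original code.
word_tokenizer = initialize_tokenizer("data/vocab.txt", is_bpe=False)  # word-level, built from the file
shared_bpe = initialize_tokenizer("data/vocab.txt", is_bpe=True)       # returns the shared BPE tokenizer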
def test_corpus_load(self):
    tokenizer = Tokenizer(**self.args)
    raw_text_file = '/project/piqasso/Collection/IWPT20/train-dev/UD_Italian-ISDT/it_isdt-ud-dev.txt'
    with open(raw_text_file) as fin:
        for line in tokenizer.format(tokenizer.predict(fin.read())):
            if line and not line.startswith('#'):
                assert len(line.split('\t')) == 2, line
def get_embedding_fn(self, max_length=12):
    self.max_length = max_length
    self.s = Simplifier('tokenizer/zh_mapping.txt')
    self.t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
                       'tokenizer/lg.all.voc',
                       max_length)
    return self.embedding
def test_corpus_load(self):
    tokenizer = Tokenizer(**self.args)
    sin = io.StringIO(
        "Un corazziere contro Scalfaro. L'attore le disse baciami o torno a riprendermelo.")
    for line in tokenizer.format(tokenizer.predict(sin.read())):
        if line and not line.startswith('#'):
            assert len(line.split('\t')) == 10, line
def predict(self, data, pred=None, buckets=8, batch_size=5000, prob=False,
            **kwargs):
    r"""
    Parses the data and produces a parse tree for each sentence.

    Args:
        data (str or list[list[str]]): input to be parsed: either
            - a str, which will be tokenized first with the tokenizer for the
              parser language,
            - a path to a file to be read, either in CoNLL-U format or in
              plain text if ``text`` is supplied,
            - a list of lists of tokens.
        text (str): optional, specifies that the input data is plain text in
            the given language code.
        pred (str or file): a path to a file where the parsed input is
            written in CoNLL-U format.
        buckets (int): the number of buckets used to group sentences to
            parallelize matrix computations.
        batch_size (int): group sentences in batches.
        prob (bool): whether to also return probabilities for each arc.

    Returns:
        A Dataset containing the parsed sentence trees.
    """
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.eval()
    if args.prob:
        self.transform.append(Field('probs'))
    if isinstance(data, str) and (not conll_format(data) or args.text):
        self.transform.reader = Tokenizer(args.text, dir=args.cache_dir,
                                          verbose=args.verbose).reader()

    logger.info("Loading the data")
    dataset = Dataset(self.transform, data)
    dataset.build(args.batch_size, args.buckets)
    logger.info(f"\n{dataset}")

    logger.info("Making predictions on the dataset")
    start = datetime.now()
    preds = self._predict(dataset.loader)
    elapsed = datetime.now() - start

    for name, value in preds.items():
        setattr(dataset, name, value)
    if pred is not None and is_master():
        logger.info(f"Saving predicted results to {pred}")
        self.transform.save(pred, dataset.sentences)
    logger.info(
        f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s")

    return dataset
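# Hedged usage sketch for the predict() API documented above: `parser` stands
# for an already-constructed instance of the class that owns this method (its
# construction is not shown in the snippet), and the paths and the 'it'
# language code are hypothetical.
dataset = parser.predict('data/input.txt', text='it', pred='out.conllu', prob=True)
for sentence in dataset.sentences:  # dataset.sentences is what the method itself saves
    print(sentence)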
def __init__(self, param_dir, model_fname, tokenizer):
    self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
    self.tokenizer_mode = tokenizer
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    saved_data = torch.load(param_dir.rstrip('/') + '/' + model_fname)
    self.model = saved_data['model']
    with open(param_dir.rstrip('/') + '/dictionary.json', 'r',
              encoding='utf-8') as f:
        self.dictionary = json.load(f)
    # beam-search settings
    self.n_beam = 5
def test_download_resources(self):
    self.assertTrue(not os.path.exists(self.MODEL_DIR))
    tokenizer = Tokenizer(**self.args)
    self.assertTrue(
        os.path.exists(self.args['dir'])
        and not os.path.isfile(self.args['dir']))
    self.assertTrue(
        os.path.exists(os.path.join(self.args['dir'], self.args['lang'])))
    self.assertTrue(
        os.path.exists(
            os.path.join(self.args['dir'], self.args['lang'], 'tokenize')))
def test_download_resources(self):
    tokenizer = Tokenizer(self.args['lang'])
    self.assertTrue(os.path.isdir(self.MODEL_DIR))
    self.assertTrue(
        os.path.exists(
            os.path.join(self.MODEL_DIR, 'tokenizer', self.args['lang'])))
    self.assertTrue(
        os.path.exists(
            os.path.join(self.MODEL_DIR, 'tokenizer', self.args['lang'],
                         'tokenize')))
def main(): dataset = data_loader.load_text_file("data_2.txt") tokenizer = Tokenizer() separated = tokenizer.tokenize([dataset]) morfeusz = MorfeuszWrapperLexeme() for sentence in separated: analysed = morfeusz.analyse([w for w, tag in sentence]) print(analysed) for word, analysis in analysed.items(): print("{}:".format(word)) print_analysis(analysis) print()
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()
    idents = []
    for token in tokens:
        if token.kind.name == "IDENTIFIER":
            name = token.spelling.lower()
            name = re.sub("_", "", name)
            idents.append(name)
    return "\n".join(idents)
def main():
    text = 'Charakteryzował się on ustawieniem zawodników w kształcie piramidy' \
           ' – bramkarz - 2 obrońców - 3 pomocników - 5 napastników (1-2-3-5).'
    morfeusz = MorfeuszWrapperLexeme()
    tokenizer = Tokenizer()
    text = tokenizer.tokenize([text])
    for sen in text:
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=False)
        print(analysed)
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=True)
        print(analysed)
        print()
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    if self.args.feat in ('char', 'bert'):
        self.WORD, self.FEAT = self.transform.FORM
    else:
        self.WORD, self.FEAT = self.transform.FORM, self.transform.CPOS
    self.ARC, self.REL = self.transform.HEAD, self.transform.DEPREL
    self.puncts = torch.tensor([
        i for s, i in self.WORD.vocab.stoi.items() if ispunct(s)
    ]).to(self.args.device)
    if getattr(self.args, 'text', None):
        self.transform.reader = Tokenizer(self.args.text,
                                          self.args.cache_dir).reader()
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print(postaged_sent)
    test_arr = []
    for i in range(len(postaged_sent[0])):
        test_arr.append((postaged_sent[0][i], postaged_sent[1][i]))
    print(test_arr)
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    return zip(test_arr, predict[0])
def __init__(self, normalized=True, classes=None, stemmed=True):
    if classes is None:
        classes = ["positive", "negative", "notr"]
    self.x = []
    self.y = []
    self.tokenizer = Tokenizer()
    self.stemmer = TurkishStemmer()
    self.word2vec = None
    self.cachefile = ("data/data"
                      + ("_normalized" if normalized else "")
                      + ("_stemmed" if stemmed else "")
                      + "_" + "_".join(classes) + ".pickle")
    if os.path.isfile(self.cachefile):
        with open(self.cachefile, 'rb') as cache:
            self.x, self.y = pickle.load(cache)
    else:
        for cls in classes:
            self._append_data(cls, normalized, stemmed)
        with open(self.cachefile, 'wb') as cache:
            pickle.dump((self.x, self.y), cache)
def first_stats():
    tokenizer = Tokenizer()
    tokenizer.run()
    question_vocabulary = Vocabulary()
    questions = load_questions()
    cc = 0
    for question in questions:
        # print(question)
        if cc % 10 == 0:
            print("\r%s" % cc, end="")
        cc += 1
        sen = tokenizer.predict(question)
        sen = sen.lower()
        tokens = question_vocabulary.get_sentence_token_ids(sen)
        question_list.append(tokens)
    print("\nSaving...")
    question_vocabulary.save(Q_VOCAB_NAME)
    utils.pickle_save(question_list, "question_tokens.dat")
    print("Done")
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()
    items = []
    for token in tokens:
        if token.kind.name == "LITERAL":
            text = token.spelling
            cursor_kind = clang.cindex.CursorKind
            kind = token.cursor.kind
            if kind == cursor_kind.STRING_LITERAL:
                # do extra processing on strings
                text = sha256(mangle_text(token.spelling)).hexdigest()[:10]
            items.append(text)
        if token.kind.name == "COMMENT":
            hashed = sha256(mangle_text(token.spelling[2:])).hexdigest()[:10]
            items.append(hashed)
    return "\n".join(items)
def load(cls, path, fields, tokenizer_lang, tokenizer_dir, verbose=True,
         max_sent_length=math.inf):
    tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir,
                          verbose=verbose)
    sentences = []
    fields = [
        field if field is not None else Field(str(i))
        for i, field in enumerate(fields)
    ]
    with open(path, 'r') as f:
        lines = []
        for line in tokenizer.format(tokenizer.predict(f.read())):
            line = line.strip()
            if not line:
                if len(lines) > max_sent_length:
                    logger.info(
                        'Discarded sentence longer than max_sent_length: %d',
                        len(lines))
                    lines = []
                    continue
                sentences.append(Sentence(fields, lines))
                lines = []
            else:
                if not line.startswith('#'):
                    # append empty columns up to the full CoNLL field count
                    line += '\t_' * (len(CoNLL._fields) - len(line.split('\t')))
                lines.append(line)
    return cls(fields, sentences)
def predict(self, X, part_of_speech=None, tagger_preprocessed=False,
            sentence_level=False):
    i = 0
    if sentence_level:
        results = []
        for text in X:
            tokenizer = Tokenizer()
            sentences = tokenizer.tokenize([text])
            sentences = [
                " ".join([token[0] for token in sentence])
                for sentence in sentences
            ]
            preprocessed_sentences = self.preprocess_texts(
                sentences,
                part_of_speech=part_of_speech,
                tagger_preprocessed=tagger_preprocessed)
            X = self.vectorizer.transform(preprocessed_sentences).toarray()
            pred = self.nb_model.predict(X)
            results.append(int(round(np.mean(pred))))
            print(i)
            i += 1
        return np.array(results)
    else:
        preprocessed = self.preprocess_texts(
            X,
            part_of_speech=part_of_speech,
            tagger_preprocessed=tagger_preprocessed)
        X = self.vectorizer.transform(preprocessed).toarray()
        return self.nb_model.predict(X)
print(' tokenizer algorithm : ' + str(args.tokenizer))
if args.v is True:
    print(' verbose (print debug) : ON')

# output directory
if not os.path.exists(args.p):
    os.mkdir(args.p)

# (0) torch settings
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# (1) tokenizer setting
tokenizer = Tokenizer(args.tokenizer, '../tokenizer/e2e.model')

# (2) corpus data
random.seed(args.seed)
lex_flag = True
dataset_train = MyDataset(args.corpus.rstrip('/') + '/e2e_train.json',
                          args.corpus.rstrip('/') + '/e2e_valid.json',
                          args.corpus.rstrip('/') + '/e2e_test.json',
                          args.corpus.rstrip('/') + '/e2e-augment/e2e_mr_lex_max_num_token.json',
                          'train', tokenizer, lex_flag, device)
dataset_valid = MyDataset(args.corpus.rstrip('/') + '/e2e_train.json',
                          args.corpus.rstrip('/') + '/e2e_valid.json',
                          args.corpus.rstrip('/') + '/e2e_test.json',
                          args.corpus.rstrip('/') + '/e2e-augment/e2e_mr_lex_max_num_token.json',
                          'valid', tokenizer, lex_flag, device)
dataset_test = MyDataset(args.corpus.rstrip('/') + '/e2e_train.json',
# -*- encoding: utf8 -*-
import re
import requests
import unicodedata
from tokenizer.tokenizer import Tokenizer
from sklearn.externals import joblib
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from pyvi.pyvi import ViTokenizer
from sklearn.metrics import confusion_matrix

tokenizer = Tokenizer()
tokenizer.run()


def load_model(model):
    print('loading model ...', model)
    if os.path.isfile(model):
        return joblib.load(model)
    else:
        return None


def list_words(mes):
    words = mes.lower().split()
    return " ".join(words)
def main(): dataset = data_loader.load_text_file("data_2.txt") tokenizer = Tokenizer() output = tokenizer.tokenize([dataset]) for sentence in output: print(sentence)
tf.import_graph_def(restored_graph_def,
                    input_map=None,
                    return_elements=None,
                    name="")

graph = tf.get_default_graph()
doc_ids = graph.get_tensor_by_name('doc_ids:0')
doc_mask = graph.get_tensor_by_name('doc_mask:0')
doc_type = graph.get_tensor_by_name('doc_type:0')
# content = graph.get_tensor_by_name('content:0')
doc_output = graph.get_tensor_by_name('doc/output:0')

doc_max_length = 12
s = Simplifier('tokenizer/zh_mapping.txt')
t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
              'tokenizer/lg.all.voc',
              doc_max_length)

count = 0
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# f = ['a real test', 'b false test']
with open(infile, 'r', encoding='utf-8') as f, open(outfile, 'w', encoding='utf-8') as fo:
    with tf.Session(config=config) as sess:
        time = datetime.datetime.now()
        for line in f:
            simple = s.simplify(line)
            tokens = t.tokenize(simple)
            accents = run_strip_accents(tokens)
            ids = t.token_to_id(accents)
def test_tokenize(self):
    tokenizer = Tokenizer(self.args['lang'])
    sentences = tokenizer.predict('Ha chiamato il dr. Rossi.Vuole salutarti.')
    self.assertEqual(len(sentences), 2)
def test_tokenize(self):
    tokenizer = Tokenizer(**self.args)
    sentences = tokenizer.predict('Domani vorrei andare al mare.Speriamo faccia bel tempo.')
    self.assertEqual(len(sentences), 2)
def genStats(path, helpers):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()

    # stats
    numLines = 0
    numWhitespace = 0
    numComments = 0
    avgIdentLength = 0
    numFunctions = 0  # ident followed by (, declarations and calls
    numDefines = 0
    numMathOps = 0
    lenLongestLine = 0
    numReturns = 0

    # other data
    idents = []
    text = io.readFile(path)
    lastWasIdent = False

    # get info from tokens
    for token in tokens:
        # look for a comment
        if token.kind.name == "COMMENT":
            numComments += 1

        # look for math ops
        if token.spelling in ["+", "-", "*", "/", "|", "&", "+=", "-=", "*=",
                              "/=", ">>=", "<<=", "++", "--", "~", ">>", "!"]:
            numMathOps += 1

        # look for function decs/calls
        if lastWasIdent and token.spelling == "(":
            numFunctions += 1

        # count the number of returns
        if token.spelling == "return":
            numReturns += 1

        # add the identifier to the list, set lastWasIdent
        if token.kind.name == "IDENTIFIER":
            idents.append(token.spelling)
            lastWasIdent = True
        else:
            lastWasIdent = False

    # get average ident length
    total = 0.0
    for ident in idents:
        total += float(len(ident))
    avgIdentLength = 0.0
    if len(idents) > 0:
        avgIdentLength = total / float(len(idents))

    # find the number of defines
    defines = re.findall(r"#\s*define ", text.lower())
    numDefines = len(defines)

    # find the number of lines
    lines = text.split("\n")
    if len(lines) == 1:
        # ugh, windows
        lines = text.split("\r")
    numLines = len(lines)

    # get the length of the longest line
    for line in lines:
        if len(line) > lenLongestLine:
            lenLongestLine = len(line)

    # find the total amount of whitespace
    for char in text:
        if char in [" ", "\n", "\t", "\r"]:
            numWhitespace += 1

    # create a dict of results and return
    results = {}
    results["numLines"] = numLines
    results["numWhitespace"] = numWhitespace
    results["numComments"] = numComments
    results["avgIdentLength"] = avgIdentLength
    results["numFunctions"] = numFunctions
    results["numDefines"] = numDefines
    results["numMathOps"] = numMathOps
    results["numReturns"] = numReturns
    results["lenLongestLine"] = lenLongestLine
    return results
print('startword: ' + startword)
print('** output **')

# MR(LEX) -> TXT_LEX
nlg_txt, attention_txt = NLG_model.convert_nlg(mr_obj, args.search, lex_flag, startword)
obj_txt = {'txt': nlg_txt}
print('TXT: ' + obj_txt['txt'])
# print(obj_txt)

f = open(args.o, 'w', encoding='utf-8')
json.dump(obj_txt, f, ensure_ascii=False, indent=4, sort_keys=False)
f.close()
print('** done **')

tokenizer = Tokenizer('nltk', '../../tokenizer/e2e.model')
txt = nlg_txt
mr = mr_obj
if read_mr_obj['name'] != '':
    txt = txt.replace(read_mr_obj['name'], 'NAME')
    mr['name'] = 'NAME'
if read_mr_obj['near'] != '':
    txt = txt.replace(read_mr_obj['near'], 'NEAR')
    mr['near'] = 'NEAR'
mr_token = tokenizer.mr(mr)
txt_token = tokenizer.txt(txt)
'''
print(mr_token)
print(str(len(mr_token)))
import pickle

from bm25 import BM25

# Special vocabulary symbols - we always put them at the start.
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

default_tokenizer = Tokenizer(_UNK)
bpe_tokenizer = BPETokenizer(
    open("/home/martin/projects/subword-nmt/vocab_bpe_merged"), _START_VOCAB)


def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      dataset_reader, tokenizer=default_tokenizer,
                      persist_counts=False):
    """Create vocabulary file (if it does not exist yet) from data file.

    Data file is assumed to contain one sentence per line. Each sentence is
    tokenized and digits are normalized (if normalize_digits is set).
    Vocabulary contains the most-frequent tokens up to max_vocabulary_size.