Example #1
    def __init__(self,
                 NLG_param_dir,
                 NLG_model_fname,
                 tokenizer,
                 NLU_param_dir=None,
                 NLU_model_fname=None):
        self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
        self.tokenizer_mode = tokenizer
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        saved_data = torch.load(
            NLG_param_dir.rstrip('/') + '/' + NLG_model_fname)
        self.model_NLG = saved_data['model']
        f = open(NLG_param_dir.rstrip('/') + '/dictionary.json',
                 'r',
                 encoding='utf-8')
        self.dictionary = json.load(f)
        f.close()

        # beam-search settings
        self.n_beam = 5

        # NLU
        if (NLU_param_dir is not None) and (NLU_model_fname is not None):
            self.NLU = NLU(NLU_param_dir, NLU_model_fname, tokenizer)
        else:
            self.NLU = None
Example #2
    def load(cls, path, fields, tokenizer_lang, tokenizer_dir, use_gpu, verbose=True, max_sent_length=math.inf):
        tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir, use_gpu=use_gpu, verbose=verbose)

        sentences = []
        fields = [field if field is not None else Field(str(i))
                  for i, field in enumerate(fields)]

        with open(path, 'r') as f:
            lines = []
            for line in tokenizer.format(tokenizer.predict(f.read())):
                line = line.strip()
                if not line:
                    if len(lines) > max_sent_length:
                        logger.info('Discarded sentence longer than '
                                    'max_sent_length: %d', len(lines))
                        lines = []
                        continue
                    sentences.append(Sentence(fields, lines))
                    lines = []
                else:
                    if not line.startswith('#'):
                        # append fake columns
                        line = '{}\t{}'.format(line, '\t'.join(['_' for i in range(len(CoNLL._fields) - len(line.split('\t')))]))
                        assert len(CoNLL._fields) == len(line.split('\t')), '{} - {} vs {}'.format(line, len(CoNLL._fields), len(line.split()))
                    lines.append(line)

        return cls(fields, sentences)
Example #3
    def test_corpus_load(self):
        tokenizer = Tokenizer(**self.args)
        raw_text_file = '/project/piqasso/Collection/IWPT20/train-dev/UD_Italian-ISDT/it_isdt-ud-dev.txt'

        with open(raw_text_file) as fin:
            for line in tokenizer.format(tokenizer.predict(fin.read())):
                if line and not line.startswith('#'):
                    assert len(line.split('\t')) == 2, line
Example #4
 def get_embedding_fn(self, max_length=12):
     self.max_length = max_length
     self.s = Simplifier('tokenizer/zh_mapping.txt')
     self.t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
         'tokenizer/lg.all.voc',
         max_length
     )
     self.max_length = max_length
     return self.embedding
Example #5
    def test_corpus_load(self):
        tokenizer = Tokenizer(**self.args)
        sin = io.StringIO(
            "Un corazziere contro Scalfaro. L'attore le disse baciami o torno a riprendermelo."
        )

        for line in tokenizer.format(tokenizer.predict(sin.read())):
            if line and not line.startswith('#'):
                assert len(line.split('\t')) == 10, line
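The examples above all consume tokenizer.format(tokenizer.predict(...)), which yields CoNLL-U-style records: '#' comment lines, one tab-separated 10-column line per token, and a blank line between sentences. The helper below is a minimal sketch of such a consumer, assuming the standard CoNLL-U column order (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC); it is illustrative, not part of the original examples.

def surface_forms(conllu_lines):
    """Yield (sentence_index, form) pairs from CoNLL-U style lines."""
    sent_idx = 0
    for line in conllu_lines:
        line = line.strip()
        if not line:                      # blank line ends a sentence
            sent_idx += 1
        elif not line.startswith('#'):    # skip comment/metadata lines
            columns = line.split('\t')
            yield sent_idx, columns[1]    # FORM is the second column

# e.g. forms = list(surface_forms(tokenizer.format(tokenizer.predict(text))))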
Example #6
 def __init__(self, param_dir, model_fname, tokenizer):
     self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
     self.tokenizer_mode = tokenizer
     self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     saved_data = torch.load(param_dir.rstrip('/')+'/'+model_fname)
     self.model = saved_data['model']
     f = open(param_dir.rstrip('/')+'/dictionary.json', 'r', encoding='utf-8')
     self.dictionary = json.load(f)
     f.close()
     # beam-search settings
     self.n_beam = 5
Example #7
def tokenize(path):
	t = Tokenizer(path)
	tokens = t.raw_tokenize()

	idents = []
	for token in tokens:
		if token.kind.name == "IDENTIFIER":
			name = token.spelling.lower()
			name = re.sub("_", "", name)
			idents.append(name)

	return "\n".join(idents)
Example #8
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()

    idents = []
    for token in tokens:
        if token.kind.name == "IDENTIFIER":
            name = token.spelling.lower()
            name = re.sub("_", "", name)
            idents.append(name)

    return "\n".join(idents)
Example #9
def main():
    dataset = data_loader.load_text_file("data_2.txt")
    tokenizer = Tokenizer()
    separated = tokenizer.tokenize([dataset])
    morfeusz = MorfeuszWrapperLexeme()
    for sentence in separated:
        analysed = morfeusz.analyse([w for w, tag in sentence])
        print(analysed)
        for word, analysis in analysed.items():
            print("{}:".format(word))
            print_analysis(analysis)
        print()
Example #10
def main():
    text = 'Charakteryzował się on ustawieniem zawodników w kształcie piramidy' \
          ' – bramkarz - 2 obrońców - 3 pomocników - 5 napastników (1-2-3-5).'
    morfeusz = MorfeuszWrapperLexeme()
    tokenizer = Tokenizer()
    text = tokenizer.tokenize([text])
    for sen in text:
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=False)
        print(analysed)

        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=True)
        print(analysed)
        print()
Example #11
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print(postaged_sent)
    test_arr = []
    for i in range(len(postaged_sent[0])):
        test_arr.append((postaged_sent[0][i], postaged_sent[1][i]))
    print(test_arr)
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    return list(zip(test_arr, predict[0]))
Example #12
def initialize_tokenizer(vocabulary_path, is_bpe=True):
    """Initialize vocabulary from file.

    We assume the vocabulary is stored one-item-per-line, so a file:
      dog
      cat
    will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
    also return the reversed-vocabulary ["dog", "cat"].

    Args:
      vocabulary_path: path to the file containing the vocabulary.
      is_bpe: if True, ignore vocabulary_path and return the module-level
        BPE tokenizer instead.
    Returns:
      a pair: the vocabulary (a dictionary mapping string to integers), and
      the reversed vocabulary (a list, which reverses the vocabulary mapping).

    Raises:
      ValueError: if the provided vocabulary_path does not exist.
    """
    if is_bpe:
        return bpe_tokenizer
    if os.path.isfile(vocabulary_path):
        rev_vocab = []
        with open(vocabulary_path, mode="r") as f:
            rev_vocab.extend(f.readlines())
        rev_vocab = [line.strip() for line in rev_vocab]
        return Tokenizer(_UNK, vocab_list=rev_vocab)
    else:
        raise ValueError("Vocabulary file %s not found.", vocabulary_path)
Example #13
    def on_button_parse(self):
        self.tz = None
        self.tokensTable.delete(0, END)
        self.variableTable.delete(0, END)
        self.constantTable.delete(0, END)
        self.errorsTable.delete(0, END)
        source_code = self.inputField.get('1.0', 'end').split('\n')

        for i in range(len(source_code)):
            source_code[i] += '\n'

        # analyzing of input file
        self.tz = Tokenizer(source_to_analyze=source_code)
        try:
            self.tz = self.tz.analyze()
            self.errorsTable.insert(END, "OK")
        except IndexError:
            pass
        except Exception as e:
            self.errorsTable.insert(END, e)

        for token in self.tz['tokens']:
            self.tokensTable.insert(END, token)

        for i in self.tz['variables']:
            self.variableTable.insert(END, i)

        for i in self.tz['constants']:
            self.constantTable.insert(END, i)
Example #14
def main(args):
    if os.path.exists(args.output_file):
        raise FileExistsError('File exists: {}'.format(args.output_file))
    csv_reader = pd.read_csv(args.input_file,
                             chunksize=args.batch_size,
                             usecols=[
                                 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE',
                                 'CATEGORY', 'DESCRIPTION', 'ISERROR', 'TEXT'
                             ],
                             dtype={
                                 'SUBJECT_ID': int32,
                                 'HADM_ID': 'str',
                                 'CATEGORY': 'str',
                                 'DESCRIPTION': 'str',
                                 'ISERROR': 'str',
                                 'TEXT': 'str'
                             },
                             keep_default_na=False,
                             na_values='')

    with jsonlines.open(args.output_file, 'w') as notes_tokenized_file:
        tokenizer = Tokenizer(args.batch_size, args.n_cpus, args.n_threads,
                              MODE)

        for i, notes_batch in enumerate(csv_reader):
            process_batch(notes_batch, i, tokenizer, notes_tokenized_file)
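process_batch is referenced above but not shown. The following is only a sketch of what it might look like given what main() reveals (each chunk is a pandas DataFrame with a TEXT column, and output rows go to the jsonlines writer); the tokenize_batch method name and the output schema are assumptions, not the original implementation.

def process_batch(notes_batch, batch_index, tokenizer, notes_tokenized_file):
    """Hypothetical sketch of the helper used in main() above."""
    texts = notes_batch['TEXT'].tolist()
    token_lists = tokenizer.tokenize_batch(texts)   # assumed method name
    for (_, row), tokens in zip(notes_batch.iterrows(), token_lists):
        notes_tokenized_file.write({
            'SUBJECT_ID': int(row['SUBJECT_ID']),
            'HADM_ID': row['HADM_ID'],
            'tokens': tokens,
        })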
Example #15
    def predict(self,
                data,
                pred=None,
                buckets=8,
                batch_size=5000,
                prob=False,
                **kwargs):
        r"""
        Parses the data and produces a parse tree for each sentence.
        Args:
            data (str or list[list[str]]): the input to be parsed, either:
                  - a str that will first be tokenized with the tokenizer for the parser language,
                  - a path to a file to be read, in CoNLL-U format or in plain text if :param text: is supplied, or
                  - a list of lists of tokens.
            text (str): optional; specifies that the input data is plain text in the given language code.
            pred (str or file): a path to a file where the parsed input is written in CoNLL-U format.
            buckets (int): the number of buckets used to group sentences to parallelize matrix computations.
            batch_size (int): the number of tokens in each batch.
            prob (bool): whether to also return probabilities for each arc.
        Returns:
            a Dataset containing the parsed sentence trees.
        """
        args = self.args.update(locals())
        init_logger(logger, verbose=args.verbose)

        self.transform.eval()
        if args.prob:
            self.transform.append(Field('probs'))

        if isinstance(data, str) and (not conll_format(data) or args.text):
            self.transform.reader = Tokenizer(args.text,
                                              dir=args.cache_dir,
                                              verbose=args.verbose).reader()

        logger.info("Loading the data")
        dataset = Dataset(self.transform, data)
        dataset.build(args.batch_size, args.buckets)
        logger.info(f"\n{dataset}")

        logger.info("Making predictions on the dataset")
        start = datetime.now()
        preds = self._predict(dataset.loader)
        elapsed = datetime.now() - start

        for name, value in preds.items():
            setattr(dataset, name, value)
        if pred is not None and is_master():
            logger.info(f"Saving predicted results to {pred}")
            self.transform.save(pred, dataset.sentences)
        logger.info(
            f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s"
        )

        return dataset
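A hedged usage sketch of the predict method above, assuming parser is an already-loaded instance of this class; the language code, output path, and sentence are illustrative, and both call forms follow directly from the signature and docstring.

# pre-tokenized input: a list of lists of tokens
dataset = parser.predict([['She', 'enjoys', 'playing', 'tennis', '.']], prob=True)

# raw text: the 'text' keyword gives the tokenizer language (per the docstring)
dataset = parser.predict('She enjoys playing tennis.', text='en',
                         pred='predictions.conllu', batch_size=5000)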
Example #16
    def test_download_resources(self):
        tokenizer = Tokenizer(self.args['lang'])

        self.assertTrue(os.path.isdir(self.MODEL_DIR))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.MODEL_DIR, 'tokenizer', self.args['lang'])))
        self.assertTrue(
            os.path.exists(
                os.path.join(self.MODEL_DIR, 'tokenizer', self.args['lang'],
                             'tokenize')))
Example #17
 def test_download_resources(self):
     self.assertTrue(not os.path.exists(self.MODEL_DIR))
     tokenizer = Tokenizer(**self.args)
     self.assertTrue(
         os.path.exists(self.args['dir'])
         and not os.path.isfile(self.args['dir']))
     self.assertTrue(
         os.path.exists(os.path.join(self.args['dir'], self.args['lang'])))
     self.assertTrue(
         os.path.exists(
             os.path.join(self.args['dir'], self.args['lang'], 'tokenize')))
Example #18
    def __init__(self, normalized=True, classes=None, stemmed=True):
        if classes is None:
            classes = ["positive", "negative", "notr"]

        self.x = []
        self.y = []
        self.tokenizer = Tokenizer()
        self.stemmer = TurkishStemmer()
        self.word2vec = None

        self.cachefile = "data/data" + ("_normalized" if normalized else "") + ("_stemmed" if stemmed else "") + "_" + ("_".join(classes)) + ".pickle"
        if os.path.isfile(self.cachefile):
            with open(self.cachefile, 'rb') as cache:
                self.x, self.y = pickle.load(cache)
        else:
            for cls in classes:
                self._append_data(cls, normalized, stemmed)

            with open(self.cachefile, 'wb') as cache:
                pickle.dump((self.x, self.y), cache)
Example #19
def first_stats():
    tokenizer = Tokenizer()
    tokenizer.run()
    question_vocabulary = Vocabulary()

    questions = load_questions()
    cc = 0
    for question in questions:
        #print question
        if cc % 10 == 0:
            print "\r%s" % cc,
        cc += 1
        sen = tokenizer.predict(question)
        sen = sen.lower()
        tokens = question_vocabulary.get_sentence_token_ids(sen)
        question_list.append(tokens)
    print "\n Saving..."
    question_vocabulary.save(Q_VOCAB_NAME)
    utils.pickle_save(question_list, "question_tokens.dat")

    print "Done"
Example #20
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()

    items = []
    for token in tokens:
        if token.kind.name == "LITERAL":
            text = token.spelling
            cursor_kind = clang.cindex.CursorKind
            kind = token.cursor.kind

            if kind == cursor_kind.STRING_LITERAL:
                # do extra processing on strings
                text = sha256(mangle_text(token.spelling)).hexdigest()[:10]

            items.append(text)

        if token.kind.name == "COMMENT":
            hashed = sha256(mangle_text(token.spelling[2:])).hexdigest()[:10]
            items.append(hashed)

    return "\n".join(items)
Example #21
    def load(cls,
             path,
             fields,
             tokenizer_lang,
             tokenizer_dir,
             verbose=True,
             max_sent_length=math.inf):
        tokenizer = Tokenizer(lang=tokenizer_lang,
                              dir=tokenizer_dir,
                              verbose=verbose)

        sentences = []
        fields = [
            field if field is not None else Field(str(i))
            for i, field in enumerate(fields)
        ]

        with open(path, 'r') as f:
            lines = []
            for line in tokenizer.format(tokenizer.predict(f.read())):
                line = line.strip()
                if not line:
                    if len(lines) > max_sent_length:
                        logger.info(
                            'Discarded sentence longer than '
                            'max_sent_length: %d', len(lines))
                        lines = []
                        continue
                    sentences.append(Sentence(fields, lines))
                    lines = []
                else:
                    if not line.startswith('#'):
                        # append empty columns
                        line += '\t_' * (len(CoNLL._fields) -
                                         len(line.split('\t')))
                    lines.append(line)

        return cls(fields, sentences)
Example #22
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()

    items = []
    for token in tokens:
        if token.kind.name == "LITERAL":
            text = token.spelling
            cursor_kind = clang.cindex.CursorKind
            kind = token.cursor.kind

            if kind == cursor_kind.STRING_LITERAL:
                # do extra processing on strings
                text = sha256(mangle_text(token.spelling)).hexdigest()[:10]

            items.append(text)

        if token.kind.name == "COMMENT":
            hashed = sha256(mangle_text(token.spelling[2:])).hexdigest()[:10]
            items.append(hashed)

    return "\n".join(items)
Example #23
class XlmEmbedding(TextEmbedding):
    def __init__(self):
        pass
    def get_embedding_fn(self, max_length=12):
        self.max_length = max_length
        self.s = Simplifier('tokenizer/zh_mapping.txt')
        self.t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
            'tokenizer/lg.all.voc',
            max_length
        )
        self.max_length = max_length
        return self.embedding
    def embedding(self, text):
        simple = self.s.simplify(text)
        tokens = self.t.tokenize(simple)
        accents = run_strip_accents(tokens)
        ids = self.t.token_to_id(accents)
        return ids
    def size(self):
        return self.t.dico.counts
    @classmethod
    def get_feeder(cls):
        return DenseDataFeeder
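A short usage sketch for XlmEmbedding above; the model, mapping, and vocabulary paths are the ones hard-coded in the class, so only the input text is illustrative.

embedder = XlmEmbedding()
embed = embedder.get_embedding_fn(max_length=12)   # loads Simplifier and Tokenizer
ids = embed('这是一个测试')    # simplify -> tokenize -> strip accents -> ids
print(len(ids), embedder.size())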
Example #24
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if self.args.feat in ('char', 'bert'):
            self.WORD, self.FEAT = self.transform.FORM
        else:
            self.WORD, self.FEAT = self.transform.FORM, self.transform.CPOS
        self.ARC, self.REL = self.transform.HEAD, self.transform.DEPREL
        self.puncts = torch.tensor([
            i for s, i in self.WORD.vocab.stoi.items() if ispunct(s)
        ]).to(self.args.device)
        if getattr(self.args, 'text', None):
            self.transform.reader = Tokenizer(self.args.text,
                                              self.args.cache_dir).reader()
Example #25
    def predict(self,
                X,
                part_of_speech=None,
                tagger_preprocessed=False,
                sentence_level=False):

        i = 0
        if sentence_level:
            results = []
            for text in X:
                tokenizer = Tokenizer()
                sentences = tokenizer.tokenize([text])
                sentences = [
                    " ".join([token[0] for token in sentence])
                    for sentence in sentences
                ]

                preprocessed_sentences = self.preprocess_texts(
                    sentences,
                    part_of_speech=part_of_speech,
                    tagger_preprocessed=tagger_preprocessed)

                X = self.vectorizer.transform(preprocessed_sentences).toarray()
                pred = self.nb_model.predict(X)
                results.append(int(round(np.mean(pred))))
                print(i)
                i += 1
            return np.array(results)

        else:
            preprocessed = self.preprocess_texts(
                X,
                part_of_speech=part_of_speech,
                tagger_preprocessed=tagger_preprocessed)
            X = self.vectorizer.transform(preprocessed).toarray()
            return self.nb_model.predict(X)
Example #26
def genStats(path, helpers):
	t = Tokenizer(path)
	tokens = t.raw_tokenize()

	# stats
	numLines = 0
	numWhitespace = 0
	numComments = 0
	avgIdentLength = 0
	numFunctions = 0 # ident followed by (, declarations and calls
	numDefines = 0
	numMathOps = 0
	lenLongestLine = 0
	numReturns = 0

	# other data
	idents = []
	text = io.readFile(path)
	lastWasIdent = False

	# get info from tokens
	for token in tokens:
		# look for a comment
		if token.kind.name == "COMMENT":
			numComments += 1

		# look for math ops
		if token.spelling in ["+", "-", "*", "/", "|", "&", "+=", "-=", "*=", "/=", ">>=", "<<=", "++", "--", "~", ">>", "!"]:
			numMathOps += 1

		# look for function decs/calls
		if lastWasIdent and token.spelling == "(":
			numFunctions += 1

		# count the number of returns
		if token.spelling == "return":
			numReturns += 1

		# add the identifier to the list, set lastWasIdent
		if token.kind.name == "IDENTIFIER":
			idents.append(token.spelling)
			lastWasIdent = True
		else:
			lastWasIdent = False

	# get average ident length
	total = 0.0
	for ident in idents:
		total += float(len(ident))
	avgIdentLength = 0.0
	if len(idents) > 0:
		avgIdentLength = total / float(len(idents))

	# find the number of defines
	defines = re.findall(r"#\s*define ", text.lower())
	numDefines = len(defines)

	# find the number of lines
	lines = text.split("\n")
	if len(lines) == 1:
		# ugh, windows
		lines = text.split("\r")
	numLines = len(lines)

	# get the length of the longest line
	for line in lines:
		if len(line) > lenLongestLine:
			lenLongestLine = len(line)

	# find the total amount of whitespace
	for char in text:
		if char in [" ", "\n", "\t", "\r"]:
			numWhitespace += 1

	# create a dict of results and return
	results = {}
	results["numLines"] = numLines
	results["numWhitespace"] = numWhitespace
	results["numComments"] = numComments
	results["avgIdentLength"] = avgIdentLength
	results["numFunctions"] = numFunctions
	results["numDefines"] = numDefines
	results["numMathOps"] = numMathOps
	results["numReturns"] = numReturns
	results["lenLongestLine"] = lenLongestLine
	return results
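A small hedged usage sketch of genStats above (the helpers argument is accepted but unused in the body shown; the input path is hypothetical).

stats = genStats("examples/hello.c", helpers=None)
for key in sorted(stats):
    print(key, stats[key])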
Example #27
# -*- encoding: utf8 -*-
import re
import requests
import unicodedata
from tokenizer.tokenizer import Tokenizer
from sklearn.externals import joblib
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from pyvi.pyvi import ViTokenizer
from sklearn.metrics import confusion_matrix

tokenizer = Tokenizer()
tokenizer.run()


def load_model(model):
    print('loading model ...', model)
    if os.path.isfile(model):
        return joblib.load(model)
    else:
        return None


def list_words(mes):
    words = mes.lower().split()
    return " ".join(words)

Example #28
    print(' tokenizer algorithm     : '+str(args.tokenizer))
    if args.v is True:
        print(' verbose (print debug)   : ON')

    # output directory
    if not os.path.exists(args.p):
        os.mkdir(args.p)

    # (0) torch settings
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # (1) tokenizer setting
    tokenizer = Tokenizer(args.tokenizer, '../tokenizer/e2e.model')

    # (2) corpus data
    random.seed(args.seed)
    lex_flag = True
    dataset_train = MyDataset(args.corpus.rstrip('/')+'/e2e_train.json',
                              args.corpus.rstrip('/')+'/e2e_valid.json',
                              args.corpus.rstrip('/')+'/e2e_test.json',
                              args.corpus.rstrip('/')+'/e2e-augment/e2e_mr_lex_max_num_token.json',
                              'train', tokenizer, lex_flag, device)
    dataset_valid = MyDataset(args.corpus.rstrip('/')+'/e2e_train.json',
                              args.corpus.rstrip('/')+'/e2e_valid.json',
                              args.corpus.rstrip('/')+'/e2e_test.json',
                              args.corpus.rstrip('/')+'/e2e-augment/e2e_mr_lex_max_num_token.json',
                              'valid', tokenizer, lex_flag, device)
    dataset_test = MyDataset(args.corpus.rstrip('/')+'/e2e_train.json',
Example #29
tf.import_graph_def(restored_graph_def,
                    input_map=None,
                    return_elements=None,
                    name="")

graph = tf.get_default_graph()
doc_ids = graph.get_tensor_by_name('doc_ids:0')
doc_mask = graph.get_tensor_by_name('doc_mask:0')
doc_type = graph.get_tensor_by_name('doc_type:0')
#content = graph.get_tensor_by_name('content:0')
doc_output = graph.get_tensor_by_name('doc/output:0')

doc_max_length = 12

s = Simplifier('tokenizer/zh_mapping.txt')
t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
              'tokenizer/lg.all.voc', doc_max_length)
count = 0
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
#f = ['a real test', 'b false test']
with open(infile, 'r', encoding='utf-8') as f, open(outfile,
                                                    'w',
                                                    encoding='utf-8') as fo:
    with tf.Session(config=config) as sess:
        time = datetime.datetime.now()
        for line in f:
            simple = s.simplify(line)
            tokens = t.tokenize(simple)
            accents = run_strip_accents(tokens)
            ids = t.token_to_id(accents)
Example #30
    parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=10)
    parser.add_option("-y", "--outputtype", dest="outputtype", type="string",
                      help="output type: csv/json", default="json")
    parser.add_option("-k", "--topk", dest="topk", type="int",
                      help="top n matches", default=3)
    parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string",
                        help="name for json element for matching candidates", default="candidates")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options
    inputFilename = args[0]
    configFilename = args[1]
    outputFilename = args[2]

    tokenizer = Tokenizer(configFilename, c_options)
    if c_options.inputformat == "text":
        rdd = tokenizer.tokenize_text_file(sc, inputFilename, c_options.data_type)
    else:
        rdd = tokenizer.tokenize_seq_file(sc, inputFilename, c_options.data_type)
    rdd.partitionBy(c_options.numPartitions)

    hasher = Hasher(c_options.numHashes, c_options.numItemsInBand, c_options.computeSimilarity)
    input_lsh_rdd = hasher.compute_hashes(rdd)

    clusterer = Clusterer(c_options.numPartitions,
                          c_options.computeSimilarity, c_options.threshold)

    if len(c_options.base) > 0:
        if len(c_options.baseConfig) > 0:
            tokenizer = Tokenizer(c_options.baseConfig, c_options)
Example #31
import os, sys
from io import open
from tokenizer.tokenizer import Tokenizer
import utils
import unicodedata
import regex
import my_map
from pyvi.pyvi import ViPosTagger

dataset = 'dataset/train'
tokenized_dataset = 'dataset/train_tokenized'
# dataset = 'dataset/test'
# tokenized_dataset = 'dataset/test_tokenized'

tokenizer = Tokenizer()
r = regex.regex()


def tokenizer_dataset():
    utils.mkdir(tokenized_dataset)
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % file_path, end='')
            sys.stdout.flush()
Example #32
class HobbitGUI(Tk):
    class Text2(Frame):
        def __init__(self, master, width=0, height=0, **kwargs):
            self.width = width
            self.height = height

            Frame.__init__(self, master, width=self.width, height=self.height)
            self.text_widget = Text(self, **kwargs)
            self.text_widget.pack(expand=YES, fill=BOTH)

        def pack(self, *args, **kwargs):
            Frame.pack(self, *args, **kwargs)
            self.pack_propagate(False)

        def grid(self, *args, **kwargs):
            Frame.grid(self, *args, **kwargs)
            self.grid_propagate(False)

    def __init__(self, parent=None):
        Tk.__init__(self, parent)
        self.parent = parent
        self.initialize()
        self.title("Hobbit IDE")
        self.tz = None
        self.log = []

    def initialize(self):
        self.grid()

        spaceTTIF = Label(self)
        spaceTTIF.grid(column=15)
        self.inputField = Text(self)
        self.inputField.grid(column=0, row=0,
                             columnspan=1, rowspan=8, sticky='EW')

        self.inputParamField = Text(self, width=20, height=10)
        self.inputParamField.grid(column=1, row=5,
                                  columnspan=2, rowspan=4, sticky='EW')

        self.tokensTable = Listbox(self, width=10, height=15, font='Courier')
        self.tokensTable.grid(column=3, row=0,
                              columnspan=10, rowspan=4, sticky='EW')
        self.variableTable = Listbox(self, width=10, height=5, font='Courier')
        self.variableTable.grid(column=3, row=5,
                                columnspan=10, rowspan=1, sticky='EW')
        self.constantTable = Listbox(self, width=10, height=5, font='Courier')
        self.constantTable.grid(column=3, row=7,
                                columnspan=10, rowspan=1, sticky='EW')

        self.errorsTable = Listbox(self, width=100, height=10, font='Courier')
        self.errorsTable.grid(column=0, row=9, columnspan=4, sticky="EW")
        self.errorsTable1 = Listbox(self, width=100, height=10, font='Courier')
        self.errorsTable1.grid(column=0, row=10, columnspan=4, sticky="EW")
        self.errorsTable2 = Listbox(self, width=100, height=10, font='Courier')
        self.errorsTable2.grid(column=0, row=11, columnspan=4, sticky="EW")
        self.inputField.focus_set()

        btnParse = Button(self, text="Parse",
                          command=self.on_button_parse)
        btnParse.grid(column=2, row=0)

        btnAnalyze = Button(self, text='Analyze',
                            command=self.analyze)
        btnAnalyze.grid(column=2, row=1)
        btnRun = Button(self, text='POLIZ',
                        command=self.on_button_translate)
        btnRun.grid(column=2, row=2)
        btnRun = Button(self, text='RUN',
                        command=self.run)
        btnRun.grid(column=2, row=3)

    def test(self):
        source_code = self.inputField.get('1.0', 'end').split('\n')
        for i in source_code:
            print(i)

    def analyze(self):
        self.errorsTable1.delete(0, END)
        if self.tz is None:
            self.errorsTable.insert(END, "No data to analyze.")
        try:
            a = OPGAnalyzer(self.tz['tokens'][:-2], grammar=grammar, grammar_elements=grammar_elements)
            self.log = [i for i in a.analyze()]
            for i in self.log:
                print(i)
                self.errorsTable1.insert(END, '| {iteration:3} | {stack:30.30} | {relation:1.1} | {input:20.20} | '
                                              '{rpn:30.30} |'
                                         .format(iteration=i['iteration'],
                                                 stack=str(i['stack']),
                                                 relation=i['relation'],
                                                 input=str(i['input']),
                                                 rpn=str(i['rpn'])))
            self.errorsTable.insert(END, 'OK')
        except Exception as e:
            self.errorsTable.insert(END, 'At line {!s} you have an error in symbol {}'.format(e.args[0].line_number + 1,
                                                                                              e.args[0].name))

    def on_button_parse(self):
        self.tz = None
        self.tokensTable.delete(0, END)
        self.variableTable.delete(0, END)
        self.constantTable.delete(0, END)
        self.errorsTable.delete(0, END)
        source_code = self.inputField.get('1.0', 'end').split('\n')

        for i in range(len(source_code)):
            source_code[i] += '\n'

        # analyzing of input file
        self.tz = Tokenizer(source_to_analyze=source_code)
        try:
            self.tz = self.tz.analyze()
            self.errorsTable.insert(END, "OK")
        except Exception as e:
            self.errorsTable.insert(END, e)

        for token in self.tz['tokens'][:-2]:
            self.tokensTable.insert(END, token)

        for i in self.tz['variables']:
            self.variableTable.insert(END, i)

        for i in self.tz['constants']:
            self.constantTable.insert(END, i)

    def on_button_translate(self):
        self.source = Translator().translate(self.tz['tokens'])
        self.errorsTable2.insert(END, self.source)

    def run(self):
        from hobbit_lib.rpn.executor import execute

        print(self.source)
        execute(self.source)
Example #33
 def test_tokenize(self):
     tokenizer = Tokenizer(**self.args)
     sentences = tokenizer.predict(
         'Domani vorrei andare al mare.Speriamo faccia bel tempo.')
     self.assertEqual(len(sentences), 2)
Example #34
    parser.add_argument('-model_NLU',
                        help='NLU model file',
                        default='model_038.dat')
    parser.add_argument('-search_NLG',
                        help='NLG search',
                        choices=['best', 'greedy'],
                        default='best')
    parser.add_argument('-tokenizer',
                        help='tokenizer ([nltk]|sentencepiece)',
                        choices=['nltk', 'sentencepiece'],
                        default='nltk')
    args = parser.parse_args()

    print('** generate augmented data **')
    # tokenizer
    tokenizer = Tokenizer(args.tokenizer, '../../tokenizer/e2e.model')

    ##
    ## generate MRaug data
    ##
    print('** generate MR augmented data **')
    # collect MR values
    mr_list = {
        'name': [],
        'eatType': [],
        'food': [],
        'priceRange': [],
        'customer rating': [],
        'area': [],
        'familyFriendly': [],
        'near': []
Example #35
class NLG():
    def __init__(self,
                 NLG_param_dir,
                 NLG_model_fname,
                 tokenizer,
                 NLU_param_dir=None,
                 NLU_model_fname=None):
        self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
        self.tokenizer_mode = tokenizer
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        saved_data = torch.load(
            NLG_param_dir.rstrip('/') + '/' + NLG_model_fname)
        self.model_NLG = saved_data['model']
        f = open(NLG_param_dir.rstrip('/') + '/dictionary.json',
                 'r',
                 encoding='utf-8')
        self.dictionary = json.load(f)
        f.close()

        # beam-search settings
        self.n_beam = 5

        # NLU
        if (NLU_param_dir is not None) and (NLU_model_fname is not None):
            self.NLU = NLU(NLU_param_dir, NLU_model_fname, tokenizer)
        else:
            self.NLU = None

    def convert_nlg(self, input_mr_obj, search, lex_flag, startword=''):
        def _shape_txt(input_mr_obj, output_token, lex_flag):
            if self.tokenizer_mode == 'sentencepiece':
                output_txt = ''.join(output_token).replace('▁', ' ')
                output_txt = output_txt.lstrip(' ')
            else:
                output_txt = ''
                for i in range(len(output_token)):
                    if (i > 0) and (output_token[i] != '.') and (
                            output_token[i] != ',') and (output_token[i][0] !=
                                                         '\''):
                        output_txt += ' '
                    output_txt += output_token[i]
            # Lexicalisation
            if lex_flag is True:
                output_txt = output_txt.replace('NAME', input_mr_obj['name'])
                output_txt = output_txt.replace('NEAR', input_mr_obj['near'])
            return output_txt

        input_mr_obj_org = copy.deepcopy(input_mr_obj)
        if lex_flag is True:
            if input_mr_obj['name'] != '':
                input_mr_obj['name'] = 'NAME'
            if input_mr_obj['near'] != '':
                input_mr_obj['near'] = 'NEAR'
        input_mr_token = self.tokenizer.mr(input_mr_obj)
        if search == 'greedy':
            output_txt_token, attention = self.translate_nlg_greedy_search(
                input_mr_token, startword)
        elif search == 'beam':
            output_txt_token, attention = self.translate_nlg_beam_search(
                input_mr_token, lex_flag, startword)
        else:
            output_txt_token, attention = self.translate_nlg(
                input_mr_token, lex_flag, startword)
        output_txt = _shape_txt(input_mr_obj_org, output_txt_token, lex_flag)
        return output_txt, attention

    def translate_nlg_encode(self, input_mr_token):
        mr_indexes = []
        for token in input_mr_token:
            if token in self.dictionary['mr_s2i']:
                mr_indexes.append(self.dictionary['mr_s2i'][token])
            else:
                mr_indexes.append(self.dictionary['mr_s2i']['<unk>'])
        mr_tensor = torch.LongTensor(mr_indexes).unsqueeze(0).to(self.device)
        mr_mask = self.model_NLG.make_mr_mask(mr_tensor)
        with torch.no_grad():
            enc_mr = self.model_NLG.encoder(mr_tensor, mr_mask)
        return enc_mr, mr_mask

    def translate_nlg_greedy_search(self, input_mr_token, startword=''):
        self.model_NLG.eval()

        ## encode
        enc_mr, mr_mask = self.translate_nlg_encode(input_mr_token)

        ## decode
        # startword
        token_startword = self.tokenizer.txt(startword)

        txt_indexes = [self.dictionary['txt_s2i']['<sos>']]
        for token in token_startword:
            if token in self.dictionary['txt_s2i']:
                txt_indexes.append(self.dictionary['txt_s2i'][token])
            else:
                txt_indexes.append(self.dictionary['txt_s2i']['<unk>'])

        num_token = len(txt_indexes)
        for i in range(self.dictionary['max_txt_length'] - num_token):
            txt_tensor = torch.LongTensor(txt_indexes).unsqueeze(0).to(
                self.device)
            txt_mask = self.model_NLG.make_txt_mask(txt_tensor)
            with torch.no_grad():
                output, attention = self.model_NLG.decoder(
                    txt_tensor, enc_mr, txt_mask, mr_mask)

            pred_token = output.argmax(2)[:, -1].item()
            txt_indexes.append(pred_token)

            if pred_token == self.dictionary['txt_s2i']['<eos>']:
                break
        txt_tokens = [self.dictionary['txt_i2s'][i] for i in txt_indexes]
        txt_tokens = txt_tokens[1:-1]

        return txt_tokens, attention

    def translate_nlg_beam_search(self,
                                  input_mr_token,
                                  lex_flag,
                                  startword=''):
        self.model_NLG.eval()

        ## encode
        enc_mr, mr_mask = self.translate_nlg_encode(input_mr_token)

        ## decode
        # startword
        token_startword = self.tokenizer.txt(startword)
        offset = len(token_startword)

        a_cand_prev = [{
            'idx': [self.dictionary['txt_s2i']['<sos>']],
            'val': 1.0
        }]
        for token in token_startword:
            if token in self.dictionary['txt_s2i']:
                a_cand_prev[0]['idx'].append(self.dictionary['txt_s2i'][token])
            else:
                a_cand_prev[0]['idx'].append(
                    self.dictionary['txt_s2i']['<unk>'])
        num_token = len(a_cand_prev[0]['idx'])
        a_out = []
        for i in range(self.dictionary['max_txt_length'] - num_token):
            a_cand = []
            for j in range(len(a_cand_prev)):
                txt_tensor = torch.LongTensor(
                    a_cand_prev[j]['idx']).unsqueeze(0).to(self.device)
                txt_mask = self.model_NLG.make_txt_mask(txt_tensor)
                with torch.no_grad():
                    output, attention = self.model_NLG.decoder(
                        txt_tensor, enc_mr, txt_mask, mr_mask)
                    output = torch.softmax(output, dim=-1)
                for n in range(self.n_beam):
                    a_cand.append(copy.deepcopy(a_cand_prev[j]))
                    idx = (torch.argsort(output, axis=2)[0, i + offset,
                                                         -(n + 1)]).item()
                    val = output[0, i + offset, idx].item()
                    a_cand[len(a_cand) - 1]['idx'].append(idx)
                    a_cand[len(a_cand) - 1]['val'] *= val

            a_cand_sort = sorted(a_cand, key=lambda x: x['val'], reverse=True)
            a_cand_prev = []
            nloop = min(len(a_cand_sort), self.n_beam)
            for j in range(nloop):
                if a_cand_sort[j]['idx'][
                        len(a_cand_sort[j]['idx']) -
                        1] == self.dictionary['txt_s2i']['<eos>']:
                    a_out.append(a_cand_sort[j])
                    if len(a_out) == self.n_beam:
                        break
                else:
                    a_cand_prev.append(a_cand_sort[j])
            if len(a_out) == self.n_beam:
                break

        if lex_flag is False:
            ref_mr_token = input_mr_token
        else:
            tmp_mr_text = ''
            for token in input_mr_token:
                tmp_mr_text += token
            tmp_mr_list = tmp_mr_text.split('|')
            if tmp_mr_list[0] != '':
                tmp_mr_list[0] = 'NAME'
            if tmp_mr_list[7] != '':
                tmp_mr_list[7] = 'NEAR'
            tmp_mr_obj = {
                'name': tmp_mr_list[0],
                'eatType': tmp_mr_list[1],
                'food': tmp_mr_list[2],
                'priceRange': tmp_mr_list[3],
                'customer rating': tmp_mr_list[4],
                'area': tmp_mr_list[5],
                'familyFriendly': tmp_mr_list[6],
                'near': tmp_mr_list[7]
            }
            ref_mr_token = self.tokenizer.mr(tmp_mr_obj)

        flag = False
        for n in range(len(a_out)):
            txt_tokens_tmp = [
                self.dictionary['txt_i2s'][idx] for idx in a_out[n]['idx']
            ]
            nlu_output_token, _ = self.NLU.translate_nlu_greedy_search(
                txt_tokens_tmp[1:-1])
            if nlu_output_token == ref_mr_token:
                txt_tokens = txt_tokens_tmp[1:-1]
                flag = True
                break
        if flag is False:
            if len(a_out) > 0:
                txt_tokens = [
                    self.dictionary['txt_i2s'][idx] for idx in a_out[0]['idx']
                ]
                txt_tokens = txt_tokens[1:-1]
            else:
                txt_tokens, attention = self.translate_nlg_greedy_search(
                    input_mr_token, startword)
        return txt_tokens, attention

    def translate_nlg(self, input_mr_token, lex_flag, startword=''):
        self.model_NLG.eval()

        ## encode
        enc_mr, mr_mask = self.translate_nlg_encode(input_mr_token)

        ## decode
        # startword
        token_startword = self.tokenizer.txt(startword)
        offset = len(token_startword)

        # greedy search
        txt_indexes = [self.dictionary['txt_s2i']['<sos>']]
        for token in token_startword:
            if token in self.dictionary['txt_s2i']:
                txt_indexes.append(self.dictionary['txt_s2i'][token])
            else:
                txt_indexes.append(self.dictionary['txt_s2i']['<unk>'])

        num_token = len(txt_indexes)
        for i in range(self.dictionary['max_txt_length'] - num_token):
            txt_tensor = torch.LongTensor(txt_indexes).unsqueeze(0).to(
                self.device)
            txt_mask = self.model_NLG.make_txt_mask(txt_tensor)

            with torch.no_grad():
                output, attention = self.model_NLG.decoder(
                    txt_tensor, enc_mr, txt_mask, mr_mask)

            pred_token = output.argmax(2)[:, -1].item()
            txt_indexes.append(pred_token)

            if pred_token == self.dictionary['txt_s2i']['<eos>']:
                break
        txt_tokens_greedy = [
            self.dictionary['txt_i2s'][i] for i in txt_indexes
        ]
        attention_greedy = attention

        nlu_output_token, _ = self.NLU.translate_nlu_greedy_search(
            txt_tokens_greedy[1:-1])
        if lex_flag is False:
            ref_mr_token = input_mr_token
        else:
            tmp_mr_text = ''
            for token in input_mr_token:
                tmp_mr_text += token
            tmp_mr_list = tmp_mr_text.split('|')
            if tmp_mr_list[0] != '':
                tmp_mr_list[0] = 'NAME'
            if tmp_mr_list[7] != '':
                tmp_mr_list[7] = 'NEAR'
            tmp_mr_obj = {
                'name': tmp_mr_list[0],
                'eatType': tmp_mr_list[1],
                'food': tmp_mr_list[2],
                'priceRange': tmp_mr_list[3],
                'customer rating': tmp_mr_list[4],
                'area': tmp_mr_list[5],
                'familyFriendly': tmp_mr_list[6],
                'near': tmp_mr_list[7]
            }
            ref_mr_token = self.tokenizer.mr(tmp_mr_obj)

        if nlu_output_token == ref_mr_token:
            txt_tokens = txt_tokens_greedy[1:-1]
            attention = attention_greedy
        else:
            a_cand_prev = [{
                'idx': [self.dictionary['txt_s2i']['<sos>']],
                'val': 1.0
            }]
            for token in token_startword:
                if token in self.dictionary['txt_s2i']:
                    a_cand_prev[0]['idx'].append(
                        self.dictionary['txt_s2i'][token])
                else:
                    a_cand_prev[0]['idx'].append(
                        self.dictionary['txt_s2i']['<unk>'])
            num_token = len(a_cand_prev[0]['idx'])
            a_out = []
            for i in range(self.dictionary['max_txt_length'] - num_token):
                a_cand = []
                for j in range(len(a_cand_prev)):
                    txt_tensor = torch.LongTensor(
                        a_cand_prev[j]['idx']).unsqueeze(0).to(self.device)
                    txt_mask = self.model_NLG.make_txt_mask(txt_tensor)
                    with torch.no_grad():
                        output, attention = self.model_NLG.decoder(
                            txt_tensor, enc_mr, txt_mask, mr_mask)
                        output = torch.softmax(output, dim=-1)
                    for n in range(self.n_beam):
                        a_cand.append(copy.deepcopy(a_cand_prev[j]))
                        idx = (torch.argsort(output, axis=2)[0, i + offset,
                                                             -(n + 1)]).item()
                        val = output[0, i + offset, idx].item()
                        a_cand[len(a_cand) - 1]['idx'].append(idx)
                        a_cand[len(a_cand) - 1]['val'] *= val

                a_cand_sort = sorted(a_cand,
                                     key=lambda x: x['val'],
                                     reverse=True)
                a_cand_prev = []
                nloop = min(len(a_cand_sort), self.n_beam)
                for j in range(nloop):
                    if a_cand_sort[j]['idx'][
                            len(a_cand_sort[j]['idx']) -
                            1] == self.dictionary['txt_s2i']['<eos>']:
                        a_out.append(a_cand_sort[j])
                        if len(a_out) == self.n_beam:
                            break
                    else:
                        a_cand_prev.append(a_cand_sort[j])
                if len(a_out) == self.n_beam:
                    break

            flag = False
            for n in range(len(a_out)):
                txt_tokens_tmp = [
                    self.dictionary['txt_i2s'][idx] for idx in a_out[n]['idx']
                ]
                nlu_output_token, _ = self.NLU.translate_nlu_greedy_search(
                    txt_tokens_tmp[1:-1])
                if nlu_output_token == ref_mr_token:
                    txt_tokens = txt_tokens_tmp[1:-1]
                    flag = True
                    break

            if flag is False:
                txt_tokens = txt_tokens_greedy[1:-1]
                attention = attention_greedy

        return txt_tokens, attention
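A hedged usage sketch for the NLG class above, based on its __init__ and convert_nlg signatures; the directories, model file names, and MR values are illustrative (beam search and the reranked greedy path both require the optional NLU model).

nlg = NLG('model/nlg', 'model_100.dat', 'nltk',
          NLU_param_dir='model/nlu', NLU_model_fname='model_038.dat')
mr = {'name': 'The Punter', 'eatType': 'pub', 'food': 'English',
      'priceRange': 'cheap', 'customer rating': 'average',
      'area': 'riverside', 'familyFriendly': 'yes', 'near': 'Raja Indian Cuisine'}
text, attention = nlg.convert_nlg(mr, search='beam', lex_flag=True)
print(text)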
Example #36
:licence GPLv2

Good luck and have fun with this simple lexical analyzer for the Hobbit programming language.
"""

import json
from os import getcwd
from sys import argv
from syntax_analizer.syntax_analyzer import SyntaxAnalyzer
from tokenizer.tokenizer import Tokenizer

try:
    input_file = open(getcwd()+'/'+argv[1])

    # analyzing of input file
    tz = Tokenizer(input_file)
    try:
        tz = tz.analyze()
    except Exception as e:
        print(e)
    input_file.close()

    # writing analyzed data to json file
    try:
        output_file = open(getcwd()+'/'+argv[2], 'w')
    except Exception:
        output_file = open(getcwd()+'/output.json', 'w')
    json.dump({'tokens':       [i.toDict() for i in tz['tokens']],
               'variables':    [i.toDict() for i in tz['variables']],
               'constants':    [i.toDict() for i in tz['constants']]},
              output_file, separators=(',', ':'), indent=4)
Example #37
class HobbitGUI(Tk):
    def __init__(self, parent=None):
        Tk.__init__(self, parent)
        self.parent = parent
        self.initialize()
        self.title("Hobbit IDE")
        self.tz = None

    def initialize(self):
        self.grid()

        spaceTTIF = Label(self)
        spaceTTIF.grid(column=5)
        self.inputField = Text(self)
        self.inputField.grid(column=0, row=0,
                             columnspan=5, rowspan=14, sticky='EW')

        self.tokensTable = Listbox(self, width=100, height=15, font='Courier')
        self.tokensTable.grid(column=6, row=0,
                              columnspan=10, rowspan=10, sticky='EW')
        self.variableTable = Listbox(self, width=100, height=10, font='Courier')
        self.variableTable.grid(column=6, row=10,
                              columnspan=10, rowspan=7, sticky='EW')
        self.constantTable = Listbox(self, width=100, height=10, font='Courier')
        self.constantTable.grid(column=6, row=17,
                              columnspan=10, rowspan=7, sticky='EW')

        self.errorsTable = Listbox(self, width=70, height=10, font='Courier')
        self.errorsTable.grid(column=0, row=18, columnspan=5, sticky="EW")
        self.inputField.focus_set()

        btnParse = Button(self, text="Parse",
                          command=self.on_button_parse)
        btnParse.grid(column=3, row=17)

        btnAnalyze = Button(self, text='Analyze',
                            command=self.analyze)
        btnAnalyze.grid(column=2, row=17)

        btnAnalyze2 = Button(self, text='Analyze2',
                             command=self.analyze2)
        btnAnalyze2.grid(column=1, row=17)

    def test(self):
        source_code = self.inputField.get('1.0', 'end').split('\n')
        for i in source_code:
            print(i)

    def analyze(self):
        if self.tz is None:
            self.errorsTable.insert(END, "No data to analyze.")
        try:
            a = SyntaxAnalyzer(self.tz)
            a.analyze()
            self.errorsTable.insert(END, 'OK')
        except Exception as e:
            for i in e.__str__().split('\n'):
                self.errorsTable.insert(END, i)

    def analyze2(self):
        if self.tz is None:
            self.errorsTable.insert(END, "No data to analyze.")
        else:
            __analyze_input = []
            for i in self.tz['tokens']:
                __analyze_input.append(i.toDict())
            __analyze_output = syntax_analyser.move(__analyze_input)
            for i in __analyze_output:
                self.errorsTable.insert(END, i)

    def on_button_parse(self):
        self.tz = None
        self.tokensTable.delete(0, END)
        self.variableTable.delete(0, END)
        self.constantTable.delete(0, END)
        self.errorsTable.delete(0, END)
        source_code = self.inputField.get('1.0', 'end').split('\n')

        for i in range(len(source_code)):
            source_code[i] += '\n'

        # analyzing of input file
        self.tz = Tokenizer(source_to_analyze=source_code)
        try:
            self.tz = self.tz.analyze()
            self.errorsTable.insert(END, "OK")
        except IndexError:
            pass
        except Exception as e:
            self.errorsTable.insert(END, e)

        for token in self.tz['tokens']:
            self.tokensTable.insert(END, token)

        for i in self.tz['variables']:
            self.variableTable.insert(END, i)

        for i in self.tz['constants']:
            self.constantTable.insert(END, i)
Example #38
import pickle
from bm25 import BM25

# Special vocabulary symbols - we always put them at the start.
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

default_tokenizer = Tokenizer(_UNK)
bpe_tokenizer = BPETokenizer(
    open("/home/martin/projects/subword-nmt/vocab_bpe_merged"), _START_VOCAB)


def create_vocabulary(vocabulary_path,
                      data_path,
                      max_vocabulary_size,
                      dataset_reader,
                      tokenizer=default_tokenizer,
                      persist_counts=False):
    """Create vocabulary file (if it does not exist yet) from data file.

    Data file is assumed to contain one sentence per line. Each sentence is
    tokenized and digits are normalized (if normalize_digits is set).
    Vocabulary contains the most-frequent tokens up to max_vocabulary_size.