def load(cls, path, fields, tokenizer_lang, tokenizer_dir, use_gpu,
         verbose=True, max_sent_length=math.inf):
    tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir,
                          use_gpu=use_gpu, verbose=verbose)
    sentences = []
    fields = [field if field is not None else Field(str(i))
              for i, field in enumerate(fields)]
    with open(path, 'r') as f:
        lines = []
        for line in tokenizer.format(tokenizer.predict(f.read())):
            line = line.strip()
            if not line:
                if len(lines) > max_sent_length:
                    logger.info('Discarded sentence longer than max_sent_length: %d',
                                len(lines))
                    lines = []
                    continue
                sentences.append(Sentence(fields, lines))
                lines = []
            else:
                if not line.startswith('#'):
                    # append fake columns so the line has the full CoNLL field count
                    line = '{}\t{}'.format(
                        line,
                        '\t'.join('_' for _ in range(len(CoNLL._fields) - len(line.split('\t')))))
                    assert len(CoNLL._fields) == len(line.split('\t')), \
                        '{} - {} vs {}'.format(line, len(CoNLL._fields), len(line.split()))
                lines.append(line)
    return cls(fields, sentences)
def test_corpus_load(self):
    tokenizer = Tokenizer(**self.args)
    raw_text_file = '/project/piqasso/Collection/IWPT20/train-dev/UD_Italian-ISDT/it_isdt-ud-dev.txt'
    with open(raw_text_file) as fin:
        for line in tokenizer.format(tokenizer.predict(fin.read())):
            if line and not line.startswith('#'):
                assert len(line.split('\t')) == 2, line
def test_corpus_load(self):
    tokenizer = Tokenizer(**self.args)
    sin = io.StringIO(
        "Un corazziere contro Scalfaro. L'attore le disse baciami o torno a riprendermelo."
    )
    for line in tokenizer.format(tokenizer.predict(sin.read())):
        if line and not line.startswith('#'):
            assert len(line.split('\t')) == 10, line
def __init__(self, param_dir, model_fname, tokenizer):
    self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
    self.tokenizer_mode = tokenizer
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    saved_data = torch.load(param_dir.rstrip('/') + '/' + model_fname)
    self.model = saved_data['model']

    with open(param_dir.rstrip('/') + '/dictionary.json', 'r', encoding='utf-8') as f:
        self.dictionary = json.load(f)

    # beam-search settings
    self.n_beam = 5
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()
    idents = []
    for token in tokens:
        if token.kind.name == "IDENTIFIER":
            name = token.spelling.lower()
            name = re.sub("_", "", name)
            idents.append(name)
    return "\n".join(idents)
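# Minimal usage sketch for tokenize() above, assuming the clang-based Tokenizer
# it relies on is importable from this project; the command-line invocation and
# the source-file argument are hypothetical.
if __name__ == "__main__":
    import sys

    # e.g. python idents.py some_file.c  -> prints one normalized identifier per line
    print(tokenize(sys.argv[1]))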
def main():
    dataset = data_loader.load_text_file("data_2.txt")
    tokenizer = Tokenizer()
    separated = tokenizer.tokenize([dataset])
    morfeusz = MorfeuszWrapperLexeme()
    for sentence in separated:
        analysed = morfeusz.analyse([w for w, tag in sentence])
        print(analysed)
        for word, analysis in analysed.items():
            print("{}:".format(word))
            print_analysis(analysis)
            print()
def main():
    text = 'Charakteryzował się on ustawieniem zawodników w kształcie piramidy' \
           ' – bramkarz - 2 obrońców - 3 pomocników - 5 napastników (1-2-3-5).'
    morfeusz = MorfeuszWrapperLexeme()
    tokenizer = Tokenizer()
    text = tokenizer.tokenize([text])
    for sen in text:
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=False)
        print(analysed)
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=True)
        print(analysed)
        print()
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print postaged_sent
    test_arr = []
    for i in xrange(len(postaged_sent[0])):
        test_arr.append((postaged_sent[0][i], postaged_sent[1][i]))
    print test_arr
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    return zip(test_arr, predict[0])
def initialize_tokenizer(vocabulary_path, is_bpe=True):
    """Initialize vocabulary from file.

    We assume the vocabulary is stored one-item-per-line, so a file:
      dog
      cat
    will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
    also return the reversed-vocabulary ["dog", "cat"].

    Args:
      vocabulary_path: path to the file containing the vocabulary.
      is_bpe: if True, return the module-level BPE tokenizer instead of
        building a word-level tokenizer from the file.

    Returns:
      a Tokenizer built from the vocabulary (or the shared BPE tokenizer
      when is_bpe is True).

    Raises:
      ValueError: if the provided vocabulary_path does not exist.
    """
    if is_bpe:
        return bpe_tokenizer
    if os.path.isfile(vocabulary_path):
        rev_vocab = []
        with open(vocabulary_path, mode="r") as f:
            rev_vocab.extend(f.readlines())
        rev_vocab = [line.strip() for line in rev_vocab]
        return Tokenizer(_UNK, vocab_list=rev_vocab)
    else:
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)
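# Minimal usage sketch for the word-level branch of initialize_tokenizer();
# the temporary vocabulary file written here is a stand-in, and no Tokenizer
# methods beyond construction are exercised because they are project-specific.
import os
import tempfile

def _demo_initialize_tokenizer():
    with tempfile.NamedTemporaryFile('w', suffix='.vocab', delete=False) as f:
        f.write("dog\ncat\n")
        path = f.name
    try:
        # word-level tokenizer built from the two-entry vocabulary above
        return initialize_tokenizer(path, is_bpe=False)
    finally:
        os.remove(path)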
def main(args):
    if os.path.exists(args.output_file):
        raise FileExistsError('File exists: {}'.format(args.output_file))

    csv_reader = pd.read_csv(args.input_file,
                             chunksize=args.batch_size,
                             usecols=['SUBJECT_ID', 'HADM_ID', 'CHARTDATE',
                                      'CATEGORY', 'DESCRIPTION', 'ISERROR', 'TEXT'],
                             dtype={'SUBJECT_ID': int32,
                                    'HADM_ID': 'str',
                                    'CATEGORY': 'str',
                                    'DESCRIPTION': 'str',
                                    'ISERROR': 'str',
                                    'TEXT': 'str'},
                             keep_default_na=False,
                             na_values='')

    with jsonlines.open(args.output_file, 'w') as notes_tokenized_file:
        tokenizer = Tokenizer(args.batch_size, args.n_cpus, args.n_threads, MODE)
        for i, notes_batch in enumerate(csv_reader):
            process_batch(notes_batch, i, tokenizer, notes_tokenized_file)
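# Minimal driver sketch for main() above. The flag names simply mirror the
# attributes the function reads (input_file, output_file, batch_size, n_cpus,
# n_threads); the actual CLI of the original script and the defaults shown
# here are assumptions.
if __name__ == '__main__':
    import argparse

    p = argparse.ArgumentParser(description='Tokenize clinical notes to JSON lines')
    p.add_argument('--input_file', required=True)
    p.add_argument('--output_file', required=True)
    p.add_argument('--batch_size', type=int, default=10000)
    p.add_argument('--n_cpus', type=int, default=4)
    p.add_argument('--n_threads', type=int, default=8)
    main(p.parse_args())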
def predict(self, data, pred=None, buckets=8, batch_size=5000, prob=False,
            **kwargs):
    r"""
    Parses the data and produces a parse tree for each sentence.

    Args:
        data (str or list[list[str]]):
            The input to be parsed: either
            - a str, which will be tokenized first with the tokenizer for the
              parser language,
            - a path to a file to be read, either in CoNLL-U format or in
              plain text if ``text`` is supplied,
            - a list of lists of tokens.
        text (str):
            Optional; specifies that the input data is plain text in the
            given language code.
        pred (str or file):
            A path to a file where the parsed input is written in CoNLL-U
            format.
        buckets (int):
            The number of buckets used to group sentences to parallelize
            matrix computations.
        batch_size (int):
            The number of tokens in each batch.
        prob (bool):
            Whether to also return probabilities for each arc.

    Returns:
        A Dataset containing the parsed sentence trees.
    """
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.eval()
    if args.prob:
        self.transform.append(Field('probs'))
    if isinstance(data, str) and (not conll_format(data) or args.text):
        self.transform.reader = Tokenizer(args.text, dir=args.cache_dir,
                                          verbose=args.verbose).reader()

    logger.info("Loading the data")
    dataset = Dataset(self.transform, data)
    dataset.build(args.batch_size, args.buckets)
    logger.info(f"\n{dataset}")

    logger.info("Making predictions on the dataset")
    start = datetime.now()
    preds = self._predict(dataset.loader)
    elapsed = datetime.now() - start

    for name, value in preds.items():
        setattr(dataset, name, value)
    if pred is not None and is_master():
        logger.info(f"Saving predicted results to {pred}")
        self.transform.save(pred, dataset.sentences)
    logger.info(
        f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s"
    )

    return dataset
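# Minimal usage sketch for predict(): the parser object and the language code
# are assumptions (any parser instance exposing this method would do), and the
# input/output paths are placeholders.
def parse_plain_text(parser, path, lang='it', out_path=None):
    # 'text' tells predict() the file is plain text in the given language,
    # so a Tokenizer-based reader is installed before the Dataset is built.
    dataset = parser.predict(path, text=lang, pred=out_path, prob=True)
    return dataset.sentences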
def test_download_resources(self):
    tokenizer = Tokenizer(self.args['lang'])
    self.assertTrue(os.path.isdir(self.MODEL_DIR))
    self.assertTrue(
        os.path.exists(
            os.path.join(self.MODEL_DIR, 'tokenizer', self.args['lang'])))
    self.assertTrue(
        os.path.exists(
            os.path.join(self.MODEL_DIR, 'tokenizer', self.args['lang'],
                         'tokenize')))
def test_download_resources(self):
    self.assertTrue(not os.path.exists(self.MODEL_DIR))
    tokenizer = Tokenizer(**self.args)
    self.assertTrue(
        os.path.exists(self.args['dir'])
        and not os.path.isfile(self.args['dir']))
    self.assertTrue(
        os.path.exists(os.path.join(self.args['dir'], self.args['lang'])))
    self.assertTrue(
        os.path.exists(
            os.path.join(self.args['dir'], self.args['lang'], 'tokenize')))
def __init__(self, normalized=True, classes=None, stemmed=True):
    if classes is None:
        classes = ["positive", "negative", "notr"]
    self.x = []
    self.y = []
    self.tokenizer = Tokenizer()
    self.stemmer = TurkishStemmer()
    self.word2vec = None
    self.cachefile = ("data/data"
                      + ("_normalized" if normalized else "")
                      + ("_stemmed" if stemmed else "")
                      + "_" + "_".join(classes) + ".pickle")
    if os.path.isfile(self.cachefile):
        with open(self.cachefile, 'rb') as cache:
            self.x, self.y = pickle.load(cache)
    else:
        for cls in classes:
            self._append_data(cls, normalized, stemmed)
        with open(self.cachefile, 'wb') as cache:
            pickle.dump((self.x, self.y), cache)
def first_stats():
    tokenizer = Tokenizer()
    tokenizer.run()
    question_vocabulary = Vocabulary()
    questions = load_questions()
    cc = 0
    for question in questions:
        # print question
        if cc % 10 == 0:
            print "\r%s" % cc,
        cc += 1
        sen = tokenizer.predict(question)
        sen = sen.lower()
        tokens = question_vocabulary.get_sentence_token_ids(sen)
        question_list.append(tokens)
    print "\n Saving..."
    question_vocabulary.save(Q_VOCAB_NAME)
    utils.pickle_save(question_list, "question_tokens.dat")
    print "Done"
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()
    items = []
    for token in tokens:
        if token.kind.name == "LITERAL":
            text = token.spelling
            cursor_kind = clang.cindex.CursorKind
            kind = token.cursor.kind
            if kind == cursor_kind.STRING_LITERAL:
                # do extra processing on strings
                text = sha256(mangle_text(token.spelling)).hexdigest()[:10]
            items.append(text)
        if token.kind.name == "COMMENT":
            hashed = sha256(mangle_text(token.spelling[2:])).hexdigest()[:10]
            items.append(hashed)
    return "\n".join(items)
def load(cls, path, fields, tokenizer_lang, tokenizer_dir, verbose=True,
         max_sent_length=math.inf):
    tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir,
                          verbose=verbose)
    sentences = []
    fields = [field if field is not None else Field(str(i))
              for i, field in enumerate(fields)]
    with open(path, 'r') as f:
        lines = []
        for line in tokenizer.format(tokenizer.predict(f.read())):
            line = line.strip()
            if not line:
                if len(lines) > max_sent_length:
                    logger.info('Discarded sentence longer than max_sent_length: %d',
                                len(lines))
                    lines = []
                    continue
                sentences.append(Sentence(fields, lines))
                lines = []
            else:
                if not line.startswith('#'):
                    # append empty columns up to the full CoNLL field count
                    line += '\t_' * (len(CoNLL._fields) - len(line.split('\t')))
                lines.append(line)
    return cls(fields, sentences)
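# Minimal usage sketch for the load() classmethod above. The owning class name
# (Corpus), the field list and the file path are assumptions, since only the
# method body is shown here.
def load_raw_corpus(fields, path='it_isdt-ud-dev.txt'):
    # tokenizes the raw text and wraps the result into Sentence objects
    return Corpus.load(path, fields,
                       tokenizer_lang='it',
                       tokenizer_dir='models/tokenizer',
                       max_sent_length=200)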
class XlmEmbedding(TextEmbedding):

    def __init__(self):
        pass

    def get_embedding_fn(self, max_length=12):
        self.max_length = max_length
        self.s = Simplifier('tokenizer/zh_mapping.txt')
        self.t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
                           'tokenizer/lg.all.voc',
                           max_length)
        return self.embedding

    def embedding(self, text):
        simple = self.s.simplify(text)
        tokens = self.t.tokenize(simple)
        accents = run_strip_accents(tokens)
        ids = self.t.token_to_id(accents)
        return ids

    def size(self):
        return self.t.dico.counts

    @classmethod
    def get_feeder(cls):
        return DenseDataFeeder
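# Minimal usage sketch: build the embedding function once, then map raw text to
# sub-word ids. It assumes the SentencePiece model and vocabulary files
# referenced above exist at those paths; the sample input is arbitrary.
def embed_texts(texts, max_length=12):
    embed = XlmEmbedding().get_embedding_fn(max_length=max_length)
    return [embed(t) for t in texts]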
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    if self.args.feat in ('char', 'bert'):
        self.WORD, self.FEAT = self.transform.FORM
    else:
        self.WORD, self.FEAT = self.transform.FORM, self.transform.CPOS
    self.ARC, self.REL = self.transform.HEAD, self.transform.DEPREL
    self.puncts = torch.tensor([
        i for s, i in self.WORD.vocab.stoi.items() if ispunct(s)
    ]).to(self.args.device)
    if getattr(self.args, 'text', None):
        self.transform.reader = Tokenizer(self.args.text,
                                          self.args.cache_dir).reader()
def predict(self, X, part_of_speech=None, tagger_preprocessed=False,
            sentence_level=False):
    i = 0
    if sentence_level:
        results = []
        for text in X:
            tokenizer = Tokenizer()
            sentences = tokenizer.tokenize([text])
            sentences = [" ".join([token[0] for token in sentence])
                         for sentence in sentences]
            preprocessed_sentences = self.preprocess_texts(
                sentences,
                part_of_speech=part_of_speech,
                tagger_preprocessed=tagger_preprocessed)
            X_sent = self.vectorizer.transform(preprocessed_sentences).toarray()
            pred = self.nb_model.predict(X_sent)
            results.append(int(round(np.mean(pred))))
            print(i)
            i += 1
        return np.array(results)
    else:
        preprocessed = self.preprocess_texts(
            X,
            part_of_speech=part_of_speech,
            tagger_preprocessed=tagger_preprocessed)
        X = self.vectorizer.transform(preprocessed).toarray()
        return self.nb_model.predict(X)
def genStats(path, helpers):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()

    # stats
    numLines = 0
    numWhitespace = 0
    numComments = 0
    avgIdentLength = 0
    numFunctions = 0  # ident followed by (, declarations and calls
    numDefines = 0
    numMathOps = 0
    lenLongestLine = 0
    numReturns = 0

    # other data
    idents = []
    text = io.readFile(path)
    lastWasIdent = False

    # get info from tokens
    for token in tokens:
        # look for a comment
        if token.kind.name == "COMMENT":
            numComments += 1

        # look for math ops
        if token.spelling in ["+", "-", "*", "/", "|", "&", "+=", "-=", "*=",
                              "/=", ">>=", "<<=", "++", "--", "~", ">>", "!"]:
            numMathOps += 1

        # look for function decs/calls
        if lastWasIdent and token.spelling == "(":
            numFunctions += 1

        # count the number of returns
        if token.spelling == "return":
            numReturns += 1

        # add the identifier to the list, set lastWasIdent
        if token.kind.name == "IDENTIFIER":
            idents.append(token.spelling)
            lastWasIdent = True
        else:
            lastWasIdent = False

    # get average ident length
    total = 0.0
    for ident in idents:
        total += float(len(ident))
    avgIdentLength = 0.0
    if len(idents) > 0:
        avgIdentLength = total / float(len(idents))

    # find the number of defines
    defines = re.findall(r"#\s*define ", text.lower())
    numDefines = len(defines)

    # find the number of lines
    lines = text.split("\n")
    if len(lines) == 1:
        # ugh, windows
        lines = text.split("\r")
    numLines = len(lines)

    # get the length of the longest line
    for line in lines:
        if len(line) > lenLongestLine:
            lenLongestLine = len(line)

    # find the total amount of whitespace
    for char in text:
        if char in [" ", "\n", "\t", "\r"]:
            numWhitespace += 1

    # create a dict of results and return
    results = {}
    results["numLines"] = numLines
    results["numWhitespace"] = numWhitespace
    results["numComments"] = numComments
    results["avgIdentLength"] = avgIdentLength
    results["numFunctions"] = numFunctions
    results["numDefines"] = numDefines
    results["numMathOps"] = numMathOps
    results["numReturns"] = numReturns
    results["lenLongestLine"] = lenLongestLine
    return results
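# Minimal usage sketch: run the token/text statistics over one source file and
# print them. The path is a placeholder, and None is passed for 'helpers'
# because the function body shown above never uses that argument.
def print_stats(path="example.c"):
    for key, value in sorted(genStats(path, None).items()):
        print("{}: {}".format(key, value))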
# -*- encoding: utf8 -*-
import re
import requests
import unicodedata
from tokenizer.tokenizer import Tokenizer
from sklearn.externals import joblib
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from pyvi.pyvi import ViTokenizer
from sklearn.metrics import confusion_matrix

tokenizer = Tokenizer()
tokenizer.run()


def load_model(model):
    print('loading model ...', model)
    if os.path.isfile(model):
        return joblib.load(model)
    else:
        return None


def list_words(mes):
    words = mes.lower().split()
    return " ".join(words)
print(' tokenizer algorithm : ' + str(args.tokenizer))
if args.v is True:
    print(' verbose (print debug) : ON')

# output directory
if not os.path.exists(args.p):
    os.mkdir(args.p)

# (0) torch settings
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# (1) tokenizer setting
tokenizer = Tokenizer(args.tokenizer, '../tokenizer/e2e.model')

# (2) corpus data
random.seed(args.seed)
lex_flag = True
dataset_train = MyDataset(args.corpus.rstrip('/') + '/e2e_train.json',
                          args.corpus.rstrip('/') + '/e2e_valid.json',
                          args.corpus.rstrip('/') + '/e2e_test.json',
                          args.corpus.rstrip('/') + '/e2e-augment/e2e_mr_lex_max_num_token.json',
                          'train', tokenizer, lex_flag, device)
dataset_valid = MyDataset(args.corpus.rstrip('/') + '/e2e_train.json',
                          args.corpus.rstrip('/') + '/e2e_valid.json',
                          args.corpus.rstrip('/') + '/e2e_test.json',
                          args.corpus.rstrip('/') + '/e2e-augment/e2e_mr_lex_max_num_token.json',
                          'valid', tokenizer, lex_flag, device)
dataset_test = MyDataset(args.corpus.rstrip('/') + '/e2e_train.json',
tf.import_graph_def(restored_graph_def,
                    input_map=None,
                    return_elements=None,
                    name="")

graph = tf.get_default_graph()
doc_ids = graph.get_tensor_by_name('doc_ids:0')
doc_mask = graph.get_tensor_by_name('doc_mask:0')
doc_type = graph.get_tensor_by_name('doc_type:0')
# content = graph.get_tensor_by_name('content:0')
doc_output = graph.get_tensor_by_name('doc/output:0')

doc_max_length = 12
s = Simplifier('tokenizer/zh_mapping.txt')
t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
              'tokenizer/lg.all.voc',
              doc_max_length)

count = 0
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# f = ['a real test', 'b false test']
with open(infile, 'r', encoding='utf-8') as f, open(outfile, 'w', encoding='utf-8') as fo:
    with tf.Session(config=config) as sess:
        time = datetime.datetime.now()
        for line in f:
            simple = s.simplify(line)
            tokens = t.tokenize(simple)
            accents = run_strip_accents(tokens)
            ids = t.token_to_id(accents)
parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int", help="number of partitions", default=10) parser.add_option("-y", "--outputtype", dest="outputtype", type="string", help="output type: csv/json", default="json") parser.add_option("-k", "--topk", dest="topk", type="int", help="top n matches", default=3) parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string", help="name for json element for matching candidates", default="candidates") (c_options, args) = parser.parse_args() print "Got options:", c_options inputFilename = args[0] configFilename = args[1] outputFilename = args[2] tokenizer = Tokenizer(configFilename, c_options) if c_options.inputformat == "text": rdd = tokenizer.tokenize_text_file(sc, inputFilename, c_options.data_type) else: rdd = tokenizer.tokenize_seq_file(sc, inputFilename, c_options.data_type) rdd.partitionBy(c_options.numPartitions) hasher = Hasher(c_options.numHashes, c_options.numItemsInBand, c_options.computeSimilarity) input_lsh_rdd = hasher.compute_hashes(rdd) clusterer = Clusterer(c_options.numPartitions, c_options.computeSimilarity, c_options.threshold) if len(c_options.base) > 0: if len(c_options.baseConfig) > 0: tokenizer = Tokenizer(c_options.baseConfig, c_options)
import os, sys
from io import open
from tokenizer.tokenizer import Tokenizer
import utils
import unicodedata
import regex
import my_map
from pyvi.pyvi import ViPosTagger

dataset = 'dataset/train'
tokenized_dataset = 'dataset/train_tokenized'
# dataset = 'dataset/test'
# tokenized_dataset = 'dataset/test_tokenized'

tokenizer = Tokenizer()
r = regex.regex()


def tokenizer_dataset():
    utils.mkdir(tokenized_dataset)
    stack = os.listdir(dataset)
    print 'loading data in ' + dataset
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % (file_path)),
            sys.stdout.flush()
class HobbitGUI(Tk):

    class Text2(Frame):
        def __init__(self, master, width=0, height=0, **kwargs):
            self.width = width
            self.height = height
            Frame.__init__(self, master, width=self.width, height=self.height)
            self.text_widget = Text(self, **kwargs)
            self.text_widget.pack(expand=YES, fill=BOTH)

        def pack(self, *args, **kwargs):
            Frame.pack(self, *args, **kwargs)
            self.pack_propagate(False)

        def grid(self, *args, **kwargs):
            Frame.grid(self, *args, **kwargs)
            self.grid_propagate(False)

    def __init__(self, parent=None):
        Tk.__init__(self, parent)
        self.parent = parent
        self.initialize()
        self.title("Hobbit IDE")
        self.tz = None
        self.log = []

    def initialize(self):
        self.grid()
        spaceTTIF = Label(self)
        spaceTTIF.grid(column=15)

        self.inputField = Text(self)
        self.inputField.grid(column=0, row=0, columnspan=1, rowspan=8, sticky='EW')
        self.inputParamField = Text(self, width=20, height=10)
        self.inputParamField.grid(column=1, row=5, columnspan=2, rowspan=4, sticky='EW')

        self.tokensTable = Listbox(self, width=10, height=15, font='Courier')
        self.tokensTable.grid(column=3, row=0, columnspan=10, rowspan=4, sticky='EW')
        self.variableTable = Listbox(self, width=10, height=5, font='Courier')
        self.variableTable.grid(column=3, row=5, columnspan=10, rowspan=1, sticky='EW')
        self.constantTable = Listbox(self, width=10, height=5, font='Courier')
        self.constantTable.grid(column=3, row=7, columnspan=10, rowspan=1, sticky='EW')

        self.errorsTable = Listbox(self, width=100, height=10, font='Courier')
        self.errorsTable.grid(column=0, row=9, columnspan=4, sticky="EW")
        self.errorsTable1 = Listbox(self, width=100, height=10, font='Courier')
        self.errorsTable1.grid(column=0, row=10, columnspan=4, sticky="EW")
        self.errorsTable2 = Listbox(self, width=100, height=10, font='Courier')
        self.errorsTable2.grid(column=0, row=11, columnspan=4, sticky="EW")

        self.inputField.focus_set()

        btnParse = Button(self, text="Parse", command=self.on_button_parse)
        btnParse.grid(column=2, row=0)
        btnAnalyze = Button(self, text='Analyze', command=self.analyze)
        btnAnalyze.grid(column=2, row=1)
        btnRun = Button(self, text='POLIZ', command=self.on_button_translate)
        btnRun.grid(column=2, row=2)
        btnRun = Button(self, text='RUN', command=self.run)
        btnRun.grid(column=2, row=3)

    def test(self):
        source_code = self.inputField.get('1.0', 'end').split('\n')
        for i in source_code:
            print(i)

    def analyze(self):
        self.errorsTable1.delete(0, END)
        if self.tz is None:
            self.errorsTable.insert(END, "No data to analyze.")
        try:
            a = OPGAnalyzer(self.tz['tokens'][:-2], grammar=grammar,
                            grammar_elements=grammar_elements)
            self.log = [i for i in a.analyze()]
            for i in self.log:
                print(i)
                self.errorsTable1.insert(
                    END,
                    '| {iteration:3} | {stack:30.30} | {relation:1.1} | {input:20.20} | '
                    '{rpn:30.30} |'.format(iteration=i['iteration'],
                                           stack=str(i['stack']),
                                           relation=i['relation'],
                                           input=str(i['input']),
                                           rpn=str(i['rpn'])))
            self.errorsTable.insert(END, 'OK')
        except Exception as e:
            self.errorsTable.insert(
                END,
                'At line {!s} you have an error in symbol {}'.format(
                    e.args[0].line_number + 1, e.args[0].name))

    def on_button_parse(self):
        self.tz = None
        self.tokensTable.delete(0, END)
        self.variableTable.delete(0, END)
        self.constantTable.delete(0, END)
        self.errorsTable.delete(0, END)

        source_code = self.inputField.get('1.0', 'end').split('\n')
        for i in range(len(source_code)):
            source_code[i] += '\n'

        # analyzing of input file
        self.tz = Tokenizer(source_to_analyze=source_code)
        try:
            self.tz = self.tz.analyze()
            self.errorsTable.insert(END, "OK")
        except Exception as e:
            self.errorsTable.insert(END, e)

        for token in self.tz['tokens'][:-2]:
            self.tokensTable.insert(END, token)
        for i in self.tz['variables']:
            self.variableTable.insert(END, i)
        for i in self.tz['constants']:
            self.constantTable.insert(END, i)

    def on_button_translate(self):
        self.source = Translator().translate(self.tz['tokens'])
        self.errorsTable2.insert(END, self.source)

    def run(self):
        from hobbit_lib.rpn.executor import execute
        print(self.source)
        execute(self.source)
def test_tokenize(self):
    tokenizer = Tokenizer(**self.args)
    sentences = tokenizer.predict(
        'Domani vorrei andare al mare.Speriamo faccia bel tempo.')
    self.assertEqual(len(sentences), 2)
parser.add_argument('-model_NLU', help='NLU model file', default='model_038.dat')
parser.add_argument('-search_NLG', help='NLG search',
                    choices=['best', 'greedy'], default='best')
parser.add_argument('-tokenizer', help='tokenizer ([nltk]|sentencepiece)',
                    choices=['nltk', 'sentencepiece'], default='nltk')
args = parser.parse_args()

print('** generate augmented data **')

# tokenizer
tokenizer = Tokenizer(args.tokenizer, '../../tokenizer/e2e.model')

##
## generate MRaug data
##
print('** generate MR augmented data **')

# collect MR values
mr_list = {
    'name': [],
    'eatType': [],
    'food': [],
    'priceRange': [],
    'customer rating': [],
    'area': [],
    'familyFriendly': [],
    'near': []
class NLG():
    def __init__(self, NLG_param_dir, NLG_model_fname, tokenizer,
                 NLU_param_dir=None, NLU_model_fname=None):
        self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
        self.tokenizer_mode = tokenizer
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        saved_data = torch.load(
            NLG_param_dir.rstrip('/') + '/' + NLG_model_fname)
        self.model_NLG = saved_data['model']

        with open(NLG_param_dir.rstrip('/') + '/dictionary.json', 'r',
                  encoding='utf-8') as f:
            self.dictionary = json.load(f)

        # beam-search settings
        self.n_beam = 5

        # NLU (optional, used to verify generated text against the input MR)
        if (NLU_param_dir is not None) and (NLU_model_fname is not None):
            self.NLU = NLU(NLU_param_dir, NLU_model_fname, tokenizer)
        else:
            self.NLU = None

    def convert_nlg(self, input_mr_obj, search, lex_flag, startword=''):
        def _shape_txt(input_mr_obj, output_token, lex_flag):
            # detokenize the generated token sequence
            if self.tokenizer_mode == 'sentencepiece':
                output_txt = ''.join(output_token).replace('▁', ' ')
                output_txt = output_txt.lstrip(' ')
            else:
                output_txt = ''
                for i in range(len(output_token)):
                    if (i > 0) and (output_token[i] != '.') and (
                            output_token[i] != ',') and (output_token[i][0] != '\''):
                        output_txt += ' '
                    output_txt += output_token[i]
            # Lexicalisation
            if lex_flag is True:
                output_txt = output_txt.replace('NAME', input_mr_obj['name'])
                output_txt = output_txt.replace('NEAR', input_mr_obj['near'])
            return output_txt

        input_mr_obj_org = copy.deepcopy(input_mr_obj)
        if lex_flag is True:
            if input_mr_obj['name'] != '':
                input_mr_obj['name'] = 'NAME'
            if input_mr_obj['near'] != '':
                input_mr_obj['near'] = 'NEAR'
        input_mr_token = self.tokenizer.mr(input_mr_obj)

        if search == 'greedy':
            output_txt_token, attention = self.translate_nlg_greedy_search(
                input_mr_token, startword)
        elif search == 'beam':
            output_txt_token, attention = self.translate_nlg_beam_search(
                input_mr_token, lex_flag, startword)
        else:
            output_txt_token, attention = self.translate_nlg(
                input_mr_token, lex_flag, startword)

        output_txt = _shape_txt(input_mr_obj_org, output_txt_token, lex_flag)
        return output_txt, attention

    def translate_nlg_encode(self, input_mr_token):
        mr_indexes = []
        for token in input_mr_token:
            if token in self.dictionary['mr_s2i']:
                mr_indexes.append(self.dictionary['mr_s2i'][token])
            else:
                mr_indexes.append(self.dictionary['mr_s2i']['<unk>'])
        mr_tensor = torch.LongTensor(mr_indexes).unsqueeze(0).to(self.device)
        mr_mask = self.model_NLG.make_mr_mask(mr_tensor)
        with torch.no_grad():
            enc_mr = self.model_NLG.encoder(mr_tensor, mr_mask)
        return enc_mr, mr_mask

    def translate_nlg_greedy_search(self, input_mr_token, startword=''):
        self.model_NLG.eval()

        ## encode
        enc_mr, mr_mask = self.translate_nlg_encode(input_mr_token)

        ## decode
        # startword
        token_startword = self.tokenizer.txt(startword)
        txt_indexes = [self.dictionary['txt_s2i']['<sos>']]
        for token in token_startword:
            if token in self.dictionary['txt_s2i']:
                txt_indexes.append(self.dictionary['txt_s2i'][token])
            else:
                txt_indexes.append(self.dictionary['txt_s2i']['<unk>'])
        num_token = len(txt_indexes)

        for i in range(self.dictionary['max_txt_length'] - num_token):
            txt_tensor = torch.LongTensor(txt_indexes).unsqueeze(0).to(self.device)
            txt_mask = self.model_NLG.make_txt_mask(txt_tensor)
            with torch.no_grad():
                output, attention = self.model_NLG.decoder(
                    txt_tensor, enc_mr, txt_mask, mr_mask)
            pred_token = output.argmax(2)[:, -1].item()
            txt_indexes.append(pred_token)
            if pred_token == self.dictionary['txt_s2i']['<eos>']:
                break

        txt_tokens = [self.dictionary['txt_i2s'][i] for i in txt_indexes]
        txt_tokens = txt_tokens[1:-1]
        return txt_tokens, attention

    def translate_nlg_beam_search(self, input_mr_token, lex_flag, startword=''):
        self.model_NLG.eval()

        ## encode
        enc_mr, mr_mask = self.translate_nlg_encode(input_mr_token)

        ## decode
        # startword
        token_startword = self.tokenizer.txt(startword)
        offset = len(token_startword)
        a_cand_prev = [{'idx': [self.dictionary['txt_s2i']['<sos>']], 'val': 1.0}]
        for token in token_startword:
            if token in self.dictionary['txt_s2i']:
                a_cand_prev[0]['idx'].append(self.dictionary['txt_s2i'][token])
            else:
                a_cand_prev[0]['idx'].append(self.dictionary['txt_s2i']['<unk>'])
        num_token = len(a_cand_prev[0]['idx'])

        # beam search
        a_out = []
        for i in range(self.dictionary['max_txt_length'] - num_token):
            a_cand = []
            for j in range(len(a_cand_prev)):
                txt_tensor = torch.LongTensor(
                    a_cand_prev[j]['idx']).unsqueeze(0).to(self.device)
                txt_mask = self.model_NLG.make_txt_mask(txt_tensor)
                with torch.no_grad():
                    output, attention = self.model_NLG.decoder(
                        txt_tensor, enc_mr, txt_mask, mr_mask)
                output = torch.softmax(output, dim=-1)
                for n in range(self.n_beam):
                    a_cand.append(copy.deepcopy(a_cand_prev[j]))
                    idx = (torch.argsort(output, axis=2)[0, i + offset, -(n + 1)]).item()
                    val = output[0, i + offset, idx].item()
                    a_cand[len(a_cand) - 1]['idx'].append(idx)
                    a_cand[len(a_cand) - 1]['val'] *= val
            a_cand_sort = sorted(a_cand, key=lambda x: x['val'], reverse=True)
            a_cand_prev = []
            nloop = min(len(a_cand_sort), self.n_beam)
            for j in range(nloop):
                if a_cand_sort[j]['idx'][len(a_cand_sort[j]['idx']) - 1] \
                        == self.dictionary['txt_s2i']['<eos>']:
                    a_out.append(a_cand_sort[j])
                    if len(a_out) == self.n_beam:
                        break
                else:
                    a_cand_prev.append(a_cand_sort[j])
            if len(a_out) == self.n_beam:
                break

        # build the reference MR tokens used to check the NLU round-trip
        if lex_flag is False:
            ref_mr_token = input_mr_token
        else:
            tmp_mr_text = ''
            for token in input_mr_token:
                tmp_mr_text += token
            tmp_mr_list = tmp_mr_text.split('|')
            if tmp_mr_list[0] != '':
                tmp_mr_list[0] = 'NAME'
            if tmp_mr_list[7] != '':
                tmp_mr_list[7] = 'NEAR'
            tmp_mr_obj = {
                'name': tmp_mr_list[0],
                'eatType': tmp_mr_list[1],
                'food': tmp_mr_list[2],
                'priceRange': tmp_mr_list[3],
                'customer rating': tmp_mr_list[4],
                'area': tmp_mr_list[5],
                'familyFriendly': tmp_mr_list[6],
                'near': tmp_mr_list[7]
            }
            ref_mr_token = self.tokenizer.mr(tmp_mr_obj)

        # pick the first hypothesis whose NLU parse reproduces the input MR
        flag = False
        for n in range(len(a_out)):
            txt_tokens_tmp = [self.dictionary['txt_i2s'][idx]
                              for idx in a_out[n]['idx']]
            nlu_output_token, _ = self.NLU.translate_nlu_greedy_search(
                txt_tokens_tmp[1:-1])
            if nlu_output_token == ref_mr_token:
                txt_tokens = txt_tokens_tmp[1:-1]
                flag = True
                break
        if flag is False:
            if len(a_out) > 0:
                txt_tokens = [self.dictionary['txt_i2s'][idx]
                              for idx in a_out[0]['idx']]
                txt_tokens = txt_tokens[1:-1]
            else:
                txt_tokens, attention = self.translate_nlg_greedy_search(
                    input_mr_token, startword)
        return txt_tokens, attention

    def translate_nlg(self, input_mr_token, lex_flag, startword=''):
        self.model_NLG.eval()

        ## encode
        enc_mr, mr_mask = self.translate_nlg_encode(input_mr_token)

        ## decode
        # startword
        token_startword = self.tokenizer.txt(startword)
        offset = len(token_startword)

        # greedy search
        txt_indexes = [self.dictionary['txt_s2i']['<sos>']]
        for token in token_startword:
            if token in self.dictionary['txt_s2i']:
                txt_indexes.append(self.dictionary['txt_s2i'][token])
            else:
                txt_indexes.append(self.dictionary['txt_s2i']['<unk>'])
        num_token = len(txt_indexes)
        for i in range(self.dictionary['max_txt_length'] - num_token):
            txt_tensor = torch.LongTensor(txt_indexes).unsqueeze(0).to(self.device)
            txt_mask = self.model_NLG.make_txt_mask(txt_tensor)
            with torch.no_grad():
                output, attention = self.model_NLG.decoder(
                    txt_tensor, enc_mr, txt_mask, mr_mask)
            pred_token = output.argmax(2)[:, -1].item()
            txt_indexes.append(pred_token)
            if pred_token == self.dictionary['txt_s2i']['<eos>']:
                break
        txt_tokens_greedy = [self.dictionary['txt_i2s'][i] for i in txt_indexes]
        attention_greedy = attention

        nlu_output_token, _ = self.NLU.translate_nlu_greedy_search(
            txt_tokens_greedy[1:-1])

        if lex_flag is False:
            ref_mr_token = input_mr_token
        else:
            tmp_mr_text = ''
            for token in input_mr_token:
                tmp_mr_text += token
            tmp_mr_list = tmp_mr_text.split('|')
            if tmp_mr_list[0] != '':
                tmp_mr_list[0] = 'NAME'
            if tmp_mr_list[7] != '':
                tmp_mr_list[7] = 'NEAR'
            tmp_mr_obj = {
                'name': tmp_mr_list[0],
                'eatType': tmp_mr_list[1],
                'food': tmp_mr_list[2],
                'priceRange': tmp_mr_list[3],
                'customer rating': tmp_mr_list[4],
                'area': tmp_mr_list[5],
                'familyFriendly': tmp_mr_list[6],
                'near': tmp_mr_list[7]
            }
            ref_mr_token = self.tokenizer.mr(tmp_mr_obj)

        if nlu_output_token == ref_mr_token:
            txt_tokens = txt_tokens_greedy[1:-1]
            attention = attention_greedy
        else:
            # greedy output failed the NLU check: fall back to beam search
            a_cand_prev = [{'idx': [self.dictionary['txt_s2i']['<sos>']], 'val': 1.0}]
            for token in token_startword:
                if token in self.dictionary['txt_s2i']:
                    a_cand_prev[0]['idx'].append(self.dictionary['txt_s2i'][token])
                else:
                    a_cand_prev[0]['idx'].append(self.dictionary['txt_s2i']['<unk>'])
            num_token = len(a_cand_prev[0]['idx'])

            a_out = []
            for i in range(self.dictionary['max_txt_length'] - num_token):
                a_cand = []
                for j in range(len(a_cand_prev)):
                    txt_tensor = torch.LongTensor(
                        a_cand_prev[j]['idx']).unsqueeze(0).to(self.device)
                    txt_mask = self.model_NLG.make_txt_mask(txt_tensor)
                    with torch.no_grad():
                        output, attention = self.model_NLG.decoder(
                            txt_tensor, enc_mr, txt_mask, mr_mask)
                    output = torch.softmax(output, dim=-1)
                    for n in range(self.n_beam):
                        a_cand.append(copy.deepcopy(a_cand_prev[j]))
                        idx = (torch.argsort(output, axis=2)[0, i + offset, -(n + 1)]).item()
                        val = output[0, i + offset, idx].item()
                        a_cand[len(a_cand) - 1]['idx'].append(idx)
                        a_cand[len(a_cand) - 1]['val'] *= val
                a_cand_sort = sorted(a_cand, key=lambda x: x['val'], reverse=True)
                a_cand_prev = []
                nloop = min(len(a_cand_sort), self.n_beam)
                for j in range(nloop):
                    if a_cand_sort[j]['idx'][len(a_cand_sort[j]['idx']) - 1] \
                            == self.dictionary['txt_s2i']['<eos>']:
                        a_out.append(a_cand_sort[j])
                        if len(a_out) == self.n_beam:
                            break
                    else:
                        a_cand_prev.append(a_cand_sort[j])
                if len(a_out) == self.n_beam:
                    break

            flag = False
            for n in range(len(a_out)):
                txt_tokens_tmp = [self.dictionary['txt_i2s'][idx]
                                  for idx in a_out[n]['idx']]
                nlu_output_token, _ = self.NLU.translate_nlu_greedy_search(
                    txt_tokens_tmp[1:-1])
                if nlu_output_token == ref_mr_token:
                    txt_tokens = txt_tokens_tmp[1:-1]
                    flag = True
                    break
            if flag is False:
                txt_tokens = txt_tokens_greedy[1:-1]
                attention = attention_greedy

        return txt_tokens, attention
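# Minimal usage sketch for the NLG class above. The parameter directories, the
# NLG model file name and the MR values are placeholders; beam and NLU-checked
# search additionally require the NLU checkpoint (here the 'model_038.dat'
# default from the argument parser shown earlier) so the generated text can be
# verified against the input MR.
def generate_example():
    nlg = NLG('param_NLG', 'model_100.dat', 'nltk',
              NLU_param_dir='param_NLU', NLU_model_fname='model_038.dat')
    mr = {'name': 'The Vaults', 'eatType': 'pub', 'food': 'English',
          'priceRange': 'cheap', 'customer rating': '5 out of 5',
          'area': 'riverside', 'familyFriendly': 'yes', 'near': 'Café Adriatic'}
    text, attention = nlg.convert_nlg(mr, search='beam', lex_flag=True)
    return text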
:licence GPLv2

Good luck and have fun with this simple lexical analyzer for hobbit
programming language.
"""
import json
from os import getcwd
from sys import argv

from syntax_analizer.syntax_analyzer import SyntaxAnalyzer
from tokenizer.tokenizer import Tokenizer

try:
    input_file = open(getcwd() + '/' + argv[1])

    # analyzing of input file
    tz = Tokenizer(input_file)
    try:
        tz = tz.analyze()
    except Exception as e:
        print(e)
    input_file.close()

    # writing analyzed data to json file
    try:
        output_file = open(getcwd() + '/' + argv[2], 'w')
    except Exception:
        output_file = open(getcwd() + '/output.json', 'w')
    json.dump({'tokens': [i.toDict() for i in tz['tokens']],
               'variables': [i.toDict() for i in tz['variables']],
               'constants': [i.toDict() for i in tz['constants']]},
              output_file,
              separators=(',', ':'),
              indent=4)
class HobbitGUI(Tk):

    def __init__(self, parent=None):
        Tk.__init__(self, parent)
        self.parent = parent
        self.initialize()
        self.title("Hobbit IDE")
        self.tz = None

    def initialize(self):
        self.grid()
        spaceTTIF = Label(self)
        spaceTTIF.grid(column=5)

        self.inputField = Text(self)
        self.inputField.grid(column=0, row=0, columnspan=5, rowspan=14, sticky='EW')

        self.tokensTable = Listbox(self, width=100, height=15, font='Courier')
        self.tokensTable.grid(column=6, row=0, columnspan=10, rowspan=10, sticky='EW')
        self.variableTable = Listbox(self, width=100, height=10, font='Courier')
        self.variableTable.grid(column=6, row=10, columnspan=10, rowspan=7, sticky='EW')
        self.constantTable = Listbox(self, width=100, height=10, font='Courier')
        self.constantTable.grid(column=6, row=17, columnspan=10, rowspan=7, sticky='EW')
        self.errorsTable = Listbox(self, width=70, height=10, font='Courier')
        self.errorsTable.grid(column=0, row=18, columnspan=5, sticky="EW")

        self.inputField.focus_set()

        btnParse = Button(self, text="Parse", command=self.on_button_parse)
        btnParse.grid(column=3, row=17)
        btnAnalyze = Button(self, text='Analyze', command=self.analyze)
        btnAnalyze.grid(column=2, row=17)
        btnAnalyze2 = Button(self, text='Analyze2', command=self.analyze2)
        btnAnalyze2.grid(column=1, row=17)

    def test(self):
        source_code = self.inputField.get('1.0', 'end').split('\n')
        for i in source_code:
            print(i)

    def analyze(self):
        if self.tz is None:
            self.errorsTable.insert(END, "No data to analyze.")
        try:
            a = SyntaxAnalyzer(self.tz)
            a.analyze()
            self.errorsTable.insert(END, 'OK')
        except Exception as e:
            for i in e.__str__().split('\n'):
                self.errorsTable.insert(END, i)

    def analyze2(self):
        if self.tz is None:
            self.errorsTable.insert(END, "No data to analyze.")
        else:
            __analyze_input = []
            for i in self.tz['tokens']:
                __analyze_input.append(i.toDict())
            __analyze_output = syntax_analyser.move(__analyze_input)
            for i in __analyze_output:
                self.errorsTable.insert(END, i)

    def on_button_parse(self):
        self.tz = None
        self.tokensTable.delete(0, END)
        self.variableTable.delete(0, END)
        self.constantTable.delete(0, END)
        self.errorsTable.delete(0, END)

        source_code = self.inputField.get('1.0', 'end').split('\n')
        for i in range(len(source_code)):
            source_code[i] += '\n'

        # analyzing of input file
        self.tz = Tokenizer(source_to_analyze=source_code)
        try:
            self.tz = self.tz.analyze()
            self.errorsTable.insert(END, "OK")
        except IndexError:
            pass
        except Exception as e:
            self.errorsTable.insert(END, e)

        for token in self.tz['tokens']:
            self.tokensTable.insert(END, token)
        for i in self.tz['variables']:
            self.variableTable.insert(END, i)
        for i in self.tz['constants']:
            self.constantTable.insert(END, i)
import pickle

from bm25 import BM25

# Special vocabulary symbols - we always put them at the start.
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

default_tokenizer = Tokenizer(_UNK)
bpe_tokenizer = BPETokenizer(
    open("/home/martin/projects/subword-nmt/vocab_bpe_merged"), _START_VOCAB)


def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      dataset_reader, tokenizer=default_tokenizer,
                      persist_counts=False):
    """Create vocabulary file (if it does not exist yet) from data file.

    Data file is assumed to contain one sentence per line. Each sentence is
    tokenized and digits are normalized (if normalize_digits is set).
    Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
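# Minimal usage sketch of create_vocabulary() as declared above; the vocabulary
# and data paths are hypothetical, and dataset_reader stands for whatever
# callable the project uses to iterate over the raw corpus.
def build_word_vocab(reader):
    create_vocabulary("data/vocab40000.txt", "data/train.txt", 40000,
                      dataset_reader=reader,
                      tokenizer=default_tokenizer,
                      persist_counts=True)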