def main(args):
    if os.path.exists(args.output_file):
        raise FileExistsError('File exists: {}'.format(args.output_file))
    csv_reader = pd.read_csv(
        args.input_file,
        chunksize=args.batch_size,
        usecols=[
            'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CATEGORY', 'DESCRIPTION',
            'ISERROR', 'TEXT'
        ],
        dtype={
            'SUBJECT_ID': 'int32',  # string dtype spec avoids needing a bare numpy int32
            'HADM_ID': 'str',
            'CATEGORY': 'str',
            'DESCRIPTION': 'str',
            'ISERROR': 'str',
            'TEXT': 'str'
        },
        keep_default_na=False,
        na_values='')
    with jsonlines.open(args.output_file, 'w') as notes_tokenized_file:
        tokenizer = Tokenizer(args.batch_size, args.n_cpus, args.n_threads, MODE)
        for i, notes_batch in enumerate(csv_reader):
            process_batch(notes_batch, i, tokenizer, notes_tokenized_file)
def load(cls, path, fields, tokenizer_lang, tokenizer_dir, use_gpu,
         verbose=True, max_sent_length=math.inf):
    tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir,
                          use_gpu=use_gpu, verbose=verbose)
    sentences = []
    fields = [field if field is not None else Field(str(i))
              for i, field in enumerate(fields)]
    with open(path, 'r') as f:
        lines = []
        for line in tokenizer.format(tokenizer.predict(f.read())):
            line = line.strip()
            if not line:
                if len(lines) > max_sent_length:
                    logger.info(
                        'Discarded sentence longer than max_sent_length: %d',
                        len(lines))
                    lines = []
                    continue
                sentences.append(Sentence(fields, lines))
                lines = []
            else:
                if not line.startswith('#'):
                    # append fake columns so every row has the full CoNLL field count
                    line = '{}\t{}'.format(
                        line,
                        '\t'.join('_' for _ in range(len(CoNLL._fields) - len(line.split('\t')))))
                    assert len(CoNLL._fields) == len(line.split('\t')), \
                        '{} - {} vs {}'.format(line, len(CoNLL._fields), len(line.split('\t')))
                lines.append(line)
    return cls(fields, sentences)
def __init__(self, NLG_param_dir, NLG_model_fname, tokenizer,
             NLU_param_dir=None, NLU_model_fname=None):
    self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
    self.tokenizer_mode = tokenizer
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    saved_data = torch.load(NLG_param_dir.rstrip('/') + '/' + NLG_model_fname)
    self.model_NLG = saved_data['model']
    with open(NLG_param_dir.rstrip('/') + '/dictionary.json', 'r',
              encoding='utf-8') as f:
        self.dictionary = json.load(f)
    # beam-search settings
    self.n_beam = 5
    # NLU
    if (NLU_param_dir is not None) and (NLU_model_fname is not None):
        self.NLU = NLU(NLU_param_dir, NLU_model_fname, tokenizer)
    else:
        self.NLU = None
def initialize_tokenizer(vocabulary_path, is_bpe=True):
    """Initialize vocabulary from file.

    We assume the vocabulary is stored one-item-per-line, so a file:
      dog
      cat
    will result in a vocabulary {"dog": 0, "cat": 1}, and the reversed
    vocabulary is ["dog", "cat"].

    Args:
      vocabulary_path: path to the file containing the vocabulary.
      is_bpe: if True, return the module-level BPE tokenizer instead of
        building a word-level tokenizer from the vocabulary file.

    Returns:
      a Tokenizer built from the vocabulary file (or the shared BPE tokenizer
      when is_bpe is True).

    Raises:
      ValueError: if the provided vocabulary_path does not exist.
    """
    if is_bpe:
        return bpe_tokenizer
    if os.path.isfile(vocabulary_path):
        rev_vocab = []
        with open(vocabulary_path, mode="r") as f:
            rev_vocab.extend(f.readlines())
        rev_vocab = [line.strip() for line in rev_vocab]
        return Tokenizer(_UNK, vocab_list=rev_vocab)
    else:
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)
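# Hedged usage sketch for initialize_tokenizer above: the vocabulary path is
# hypothetical, and the module-level `bpe_tokenizer` and `_UNK` objects are
# assumed to be defined as in the vocabulary-symbols snippet later in this
# section; this is illustration only, not part of the original code.
word_tokenizer = initialize_tokenizer("data/vocab.txt", is_bpe=False)  # word-level, built from the file
shared_bpe = initialize_tokenizer("data/vocab.txt", is_bpe=True)       # returns the shared BPE tokenizer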
def test_corpus_load(self):
    tokenizer = Tokenizer(**self.args)
    raw_text_file = '/project/piqasso/Collection/IWPT20/train-dev/UD_Italian-ISDT/it_isdt-ud-dev.txt'
    with open(raw_text_file) as fin:
        for line in tokenizer.format(tokenizer.predict(fin.read())):
            if line and not line.startswith('#'):
                assert len(line.split('\t')) == 2, line
def get_embedding_fn(self, max_length=12):
    self.max_length = max_length
    self.s = Simplifier('tokenizer/zh_mapping.txt')
    self.t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
                       'tokenizer/lg.all.voc',
                       max_length)
    return self.embedding
def test_corpus_load(self):
    tokenizer = Tokenizer(**self.args)
    sin = io.StringIO(
        "Un corazziere contro Scalfaro. L'attore le disse baciami o torno a riprendermelo.")
    for line in tokenizer.format(tokenizer.predict(sin.read())):
        if line and not line.startswith('#'):
            assert len(line.split('\t')) == 10, line
def predict(self, data, pred=None, buckets=8, batch_size=5000, prob=False,
            **kwargs):
    r"""
    Parses the data and produces a parse tree for each sentence.

    Args:
        data (str or list[list[str]]): input to be parsed: either
            - a str, which will be tokenized first with the tokenizer for the
              parser language,
            - a path to a file to be read, either in CoNLL-U format or in
              plain text if ``text`` is supplied,
            - a list of lists of tokens.
        text (str): optional, specifies that the input data is plain text in
            the given language code.
        pred (str or file): a path to a file where the parsed input is
            written in CoNLL-U format.
        buckets (int): the number of buckets used to group sentences to
            parallelize matrix computations.
        batch_size (int): group sentences in batches.
        prob (bool): whether to also return probabilities for each arc.

    Returns:
        A Dataset containing the parsed sentence trees.
    """
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.eval()
    if args.prob:
        self.transform.append(Field('probs'))
    if isinstance(data, str) and (not conll_format(data) or args.text):
        self.transform.reader = Tokenizer(args.text, dir=args.cache_dir,
                                          verbose=args.verbose).reader()

    logger.info("Loading the data")
    dataset = Dataset(self.transform, data)
    dataset.build(args.batch_size, args.buckets)
    logger.info(f"\n{dataset}")

    logger.info("Making predictions on the dataset")
    start = datetime.now()
    preds = self._predict(dataset.loader)
    elapsed = datetime.now() - start

    for name, value in preds.items():
        setattr(dataset, name, value)
    if pred is not None and is_master():
        logger.info(f"Saving predicted results to {pred}")
        self.transform.save(pred, dataset.sentences)
    logger.info(
        f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s")

    return dataset
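# Hedged usage sketch for the predict() API documented above: `parser` stands
# for an already-constructed instance of the class that owns this method (its
# construction is not shown in the snippet), and the paths and the 'it'
# language code are hypothetical.
dataset = parser.predict('data/input.txt', text='it', pred='out.conllu', prob=True)
for sentence in dataset.sentences:  # dataset.sentences is what the method itself saves
    print(sentence)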
def __init__(self, param_dir, model_fname, tokenizer):
    self.tokenizer = Tokenizer(tokenizer, '../tokenizer/e2e.model')
    self.tokenizer_mode = tokenizer
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    saved_data = torch.load(param_dir.rstrip('/') + '/' + model_fname)
    self.model = saved_data['model']
    with open(param_dir.rstrip('/') + '/dictionary.json', 'r',
              encoding='utf-8') as f:
        self.dictionary = json.load(f)
    # beam-search settings
    self.n_beam = 5
def test_download_resources(self):
    self.assertTrue(not os.path.exists(self.MODEL_DIR))
    tokenizer = Tokenizer(**self.args)
    self.assertTrue(
        os.path.exists(self.args['dir'])
        and not os.path.isfile(self.args['dir']))
    self.assertTrue(
        os.path.exists(os.path.join(self.args['dir'], self.args['lang'])))
    self.assertTrue(
        os.path.exists(
            os.path.join(self.args['dir'], self.args['lang'], 'tokenize')))
def test_download_resources(self):
    tokenizer = Tokenizer(self.args['lang'])
    self.assertTrue(os.path.isdir(self.MODEL_DIR))
    self.assertTrue(
        os.path.exists(
            os.path.join(self.MODEL_DIR, 'tokenizer', self.args['lang'])))
    self.assertTrue(
        os.path.exists(
            os.path.join(self.MODEL_DIR, 'tokenizer', self.args['lang'],
                         'tokenize')))
def main(): dataset = data_loader.load_text_file("data_2.txt") tokenizer = Tokenizer() separated = tokenizer.tokenize([dataset]) morfeusz = MorfeuszWrapperLexeme() for sentence in separated: analysed = morfeusz.analyse([w for w, tag in sentence]) print(analysed) for word, analysis in analysed.items(): print("{}:".format(word)) print_analysis(analysis) print()
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()
    idents = []
    for token in tokens:
        if token.kind.name == "IDENTIFIER":
            name = token.spelling.lower()
            name = re.sub("_", "", name)
            idents.append(name)
    return "\n".join(idents)
def main():
    text = 'Charakteryzował się on ustawieniem zawodników w kształcie piramidy' \
           ' – bramkarz - 2 obrońców - 3 pomocników - 5 napastników (1-2-3-5).'
    morfeusz = MorfeuszWrapperLexeme()
    tokenizer = Tokenizer()
    text = tokenizer.tokenize([text])
    for sen in text:
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=False)
        print(analysed)
        analysed = morfeusz.analyse([w for w, tag in sen], as_xml=True)
        print(analysed)
        print()
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    if self.args.feat in ('char', 'bert'):
        self.WORD, self.FEAT = self.transform.FORM
    else:
        self.WORD, self.FEAT = self.transform.FORM, self.transform.CPOS
    self.ARC, self.REL = self.transform.HEAD, self.transform.DEPREL
    self.puncts = torch.tensor([
        i for s, i in self.WORD.vocab.stoi.items() if ispunct(s)
    ]).to(self.args.device)
    if getattr(self.args, 'text', None):
        self.transform.reader = Tokenizer(self.args.text,
                                          self.args.cache_dir).reader()
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print(postaged_sent)
    test_arr = []
    for i in range(len(postaged_sent[0])):
        test_arr.append((postaged_sent[0][i], postaged_sent[1][i]))
    print(test_arr)
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    return zip(test_arr, predict[0])
def __init__(self, normalized=True, classes=None, stemmed=True):
    if classes is None:
        classes = ["positive", "negative", "notr"]
    self.x = []
    self.y = []
    self.tokenizer = Tokenizer()
    self.stemmer = TurkishStemmer()
    self.word2vec = None
    self.cachefile = ("data/data"
                      + ("_normalized" if normalized else "")
                      + ("_stemmed" if stemmed else "")
                      + "_" + "_".join(classes) + ".pickle")
    if os.path.isfile(self.cachefile):
        with open(self.cachefile, 'rb') as cache:
            self.x, self.y = pickle.load(cache)
    else:
        for cls in classes:
            self._append_data(cls, normalized, stemmed)
        with open(self.cachefile, 'wb') as cache:
            pickle.dump((self.x, self.y), cache)
def first_stats():
    tokenizer = Tokenizer()
    tokenizer.run()
    question_vocabulary = Vocabulary()
    questions = load_questions()
    cc = 0
    for question in questions:
        # print(question)
        if cc % 10 == 0:
            print("\r%s" % cc, end="")
        cc += 1
        sen = tokenizer.predict(question)
        sen = sen.lower()
        tokens = question_vocabulary.get_sentence_token_ids(sen)
        question_list.append(tokens)
    print("\nSaving...")
    question_vocabulary.save(Q_VOCAB_NAME)
    utils.pickle_save(question_list, "question_tokens.dat")
    print("Done")
def tokenize(path):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()
    items = []
    for token in tokens:
        if token.kind.name == "LITERAL":
            text = token.spelling
            cursor_kind = clang.cindex.CursorKind
            kind = token.cursor.kind
            if kind == cursor_kind.STRING_LITERAL:
                # do extra processing on strings
                text = sha256(mangle_text(token.spelling)).hexdigest()[:10]
            items.append(text)
        if token.kind.name == "COMMENT":
            hashed = sha256(mangle_text(token.spelling[2:])).hexdigest()[:10]
            items.append(hashed)
    return "\n".join(items)
def load(cls, path, fields, tokenizer_lang, tokenizer_dir, verbose=True,
         max_sent_length=math.inf):
    tokenizer = Tokenizer(lang=tokenizer_lang, dir=tokenizer_dir,
                          verbose=verbose)
    sentences = []
    fields = [
        field if field is not None else Field(str(i))
        for i, field in enumerate(fields)
    ]
    with open(path, 'r') as f:
        lines = []
        for line in tokenizer.format(tokenizer.predict(f.read())):
            line = line.strip()
            if not line:
                if len(lines) > max_sent_length:
                    logger.info(
                        'Discarded sentence longer than max_sent_length: %d',
                        len(lines))
                    lines = []
                    continue
                sentences.append(Sentence(fields, lines))
                lines = []
            else:
                if not line.startswith('#'):
                    # append empty columns up to the full CoNLL field count
                    line += '\t_' * (len(CoNLL._fields) - len(line.split('\t')))
                lines.append(line)
    return cls(fields, sentences)
def predict(self, X, part_of_speech=None, tagger_preprocessed=False,
            sentence_level=False):
    i = 0
    if sentence_level:
        results = []
        for text in X:
            tokenizer = Tokenizer()
            sentences = tokenizer.tokenize([text])
            sentences = [
                " ".join([token[0] for token in sentence])
                for sentence in sentences
            ]
            preprocessed_sentences = self.preprocess_texts(
                sentences,
                part_of_speech=part_of_speech,
                tagger_preprocessed=tagger_preprocessed)
            X = self.vectorizer.transform(preprocessed_sentences).toarray()
            pred = self.nb_model.predict(X)
            results.append(int(round(np.mean(pred))))
            print(i)
            i += 1
        return np.array(results)
    else:
        preprocessed = self.preprocess_texts(
            X,
            part_of_speech=part_of_speech,
            tagger_preprocessed=tagger_preprocessed)
        X = self.vectorizer.transform(preprocessed).toarray()
        return self.nb_model.predict(X)
print(' tokenizer algorithm : ' + str(args.tokenizer))
if args.v is True:
    print(' verbose (print debug) : ON')

# output directory
if not os.path.exists(args.p):
    os.mkdir(args.p)

# (0) torch settings
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# (1) tokenizer setting
tokenizer = Tokenizer(args.tokenizer, '../tokenizer/e2e.model')

# (2) corpus data
random.seed(args.seed)
lex_flag = True
dataset_train = MyDataset(args.corpus.rstrip('/') + '/e2e_train.json',
                          args.corpus.rstrip('/') + '/e2e_valid.json',
                          args.corpus.rstrip('/') + '/e2e_test.json',
                          args.corpus.rstrip('/') + '/e2e-augment/e2e_mr_lex_max_num_token.json',
                          'train', tokenizer, lex_flag, device)
dataset_valid = MyDataset(args.corpus.rstrip('/') + '/e2e_train.json',
                          args.corpus.rstrip('/') + '/e2e_valid.json',
                          args.corpus.rstrip('/') + '/e2e_test.json',
                          args.corpus.rstrip('/') + '/e2e-augment/e2e_mr_lex_max_num_token.json',
                          'valid', tokenizer, lex_flag, device)
dataset_test = MyDataset(args.corpus.rstrip('/') + '/e2e_train.json',
# -*- encoding: utf8 -*-
import re
import requests
import unicodedata
from tokenizer.tokenizer import Tokenizer
from sklearn.externals import joblib
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from pyvi.pyvi import ViTokenizer
from sklearn.metrics import confusion_matrix

tokenizer = Tokenizer()
tokenizer.run()


def load_model(model):
    print('loading model ...', model)
    if os.path.isfile(model):
        return joblib.load(model)
    else:
        return None


def list_words(mes):
    words = mes.lower().split()
    return " ".join(words)
def main(): dataset = data_loader.load_text_file("data_2.txt") tokenizer = Tokenizer() output = tokenizer.tokenize([dataset]) for sentence in output: print(sentence)
tf.import_graph_def(restored_graph_def,
                    input_map=None,
                    return_elements=None,
                    name="")

graph = tf.get_default_graph()
doc_ids = graph.get_tensor_by_name('doc_ids:0')
doc_mask = graph.get_tensor_by_name('doc_mask:0')
doc_type = graph.get_tensor_by_name('doc_type:0')
# content = graph.get_tensor_by_name('content:0')
doc_output = graph.get_tensor_by_name('doc/output:0')

doc_max_length = 12
s = Simplifier('tokenizer/zh_mapping.txt')
t = Tokenizer('tokenizer/spiece_all_bpe/spiece.all.bpe.130000.lower.model',
              'tokenizer/lg.all.voc',
              doc_max_length)

count = 0
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# f = ['a real test', 'b false test']
with open(infile, 'r', encoding='utf-8') as f, open(outfile, 'w', encoding='utf-8') as fo:
    with tf.Session(config=config) as sess:
        time = datetime.datetime.now()
        for line in f:
            simple = s.simplify(line)
            tokens = t.tokenize(simple)
            accents = run_strip_accents(tokens)
            ids = t.token_to_id(accents)
def test_tokenize(self):
    tokenizer = Tokenizer(self.args['lang'])
    sentences = tokenizer.predict('Ha chiamato il dr. Rossi.Vuole salutarti.')
    self.assertEqual(len(sentences), 2)
def test_tokenize(self):
    tokenizer = Tokenizer(**self.args)
    sentences = tokenizer.predict('Domani vorrei andare al mare.Speriamo faccia bel tempo.')
    self.assertEqual(len(sentences), 2)
def genStats(path, helpers):
    t = Tokenizer(path)
    tokens = t.raw_tokenize()

    # stats
    numLines = 0
    numWhitespace = 0
    numComments = 0
    avgIdentLength = 0
    numFunctions = 0  # ident followed by (, declarations and calls
    numDefines = 0
    numMathOps = 0
    lenLongestLine = 0
    numReturns = 0

    # other data
    idents = []
    text = io.readFile(path)
    lastWasIdent = False

    # get info from tokens
    for token in tokens:
        # look for a comment
        if token.kind.name == "COMMENT":
            numComments += 1

        # look for math ops
        if token.spelling in ["+", "-", "*", "/", "|", "&", "+=", "-=", "*=",
                              "/=", ">>=", "<<=", "++", "--", "~", ">>", "!"]:
            numMathOps += 1

        # look for function decs/calls
        if lastWasIdent and token.spelling == "(":
            numFunctions += 1

        # count the number of returns
        if token.spelling == "return":
            numReturns += 1

        # add the identifier to the list, set lastWasIdent
        if token.kind.name == "IDENTIFIER":
            idents.append(token.spelling)
            lastWasIdent = True
        else:
            lastWasIdent = False

    # get average ident length
    total = 0.0
    for ident in idents:
        total += float(len(ident))
    avgIdentLength = 0.0
    if len(idents) > 0:
        avgIdentLength = total / float(len(idents))

    # find the number of defines
    defines = re.findall(r"#\s*define ", text.lower())
    numDefines = len(defines)

    # find the number of lines
    lines = text.split("\n")
    if len(lines) == 1:
        # ugh, windows
        lines = text.split("\r")
    numLines = len(lines)

    # get the length of the longest line
    for line in lines:
        if len(line) > lenLongestLine:
            lenLongestLine = len(line)

    # find the total amount of whitespace
    for char in text:
        if char in [" ", "\n", "\t", "\r"]:
            numWhitespace += 1

    # create a dict of results and return
    results = {}
    results["numLines"] = numLines
    results["numWhitespace"] = numWhitespace
    results["numComments"] = numComments
    results["avgIdentLength"] = avgIdentLength
    results["numFunctions"] = numFunctions
    results["numDefines"] = numDefines
    results["numMathOps"] = numMathOps
    results["numReturns"] = numReturns
    results["lenLongestLine"] = lenLongestLine
    return results
print('startword: ' + startword)
print('** output **')

# MR(LEX) -> TXT_LEX
nlg_txt, attention_txt = NLG_model.convert_nlg(mr_obj, args.search, lex_flag, startword)
obj_txt = {'txt': nlg_txt}
print('TXT: ' + obj_txt['txt'])
# print(obj_txt)

f = open(args.o, 'w', encoding='utf-8')
json.dump(obj_txt, f, ensure_ascii=False, indent=4, sort_keys=False)
f.close()
print('** done **')

tokenizer = Tokenizer('nltk', '../../tokenizer/e2e.model')
txt = nlg_txt
mr = mr_obj
if read_mr_obj['name'] != '':
    txt = txt.replace(read_mr_obj['name'], 'NAME')
    mr['name'] = 'NAME'
if read_mr_obj['near'] != '':
    txt = txt.replace(read_mr_obj['near'], 'NEAR')
    mr['near'] = 'NEAR'
mr_token = tokenizer.mr(mr)
txt_token = tokenizer.txt(txt)
'''
print(mr_token)
print(str(len(mr_token)))
import pickle

from bm25 import BM25

# Special vocabulary symbols - we always put them at the start.
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

default_tokenizer = Tokenizer(_UNK)
bpe_tokenizer = BPETokenizer(
    open("/home/martin/projects/subword-nmt/vocab_bpe_merged"), _START_VOCAB)


def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      dataset_reader, tokenizer=default_tokenizer,
                      persist_counts=False):
    """Create vocabulary file (if it does not exist yet) from data file.

    Data file is assumed to contain one sentence per line. Each sentence is
    tokenized and digits are normalized (if normalize_digits is set).
    Vocabulary contains the most-frequent tokens up to max_vocabulary_size.