def endElement(self, tag):
    """
    Signals the end of an element in non-namespace mode.
    """
    if tag == "title":
        # initialize a new document with title == self.title
        self.tokenizer = Tokenizer(self.title)
        self.tokenizer.set_title(self.title)
    elif tag == "text":
        # By now the document title and id fields must have been extracted
        Helpers.docid_docname_map[
            self.tokenizer.get_doc_id()] = self.tokenizer.get_title()
        # add text body to that document
        # TODO: use append
        termid_freq_map = self.tokenizer.tokenize(self.text)
        # print("term_termid_map", Helpers.term_termid_map)
        for term in termid_freq_map:
            # accumulate (termid: docid) pairs
            Indexer.termid_docid_list.append(
                (term, self.tokenizer.get_doc_id()))
    elif tag == "id" and not self.insideRevision:
        # Do NOT set id if inside <revision> <id>XXX</id>
        self.tokenizer.set_doc_id(self.id)
    elif tag == "revision":
        self.insideRevision = False  # </revision> encountered
    self.tag = None
def __init__(self, text):
    """
    :param text: client input
    """
    self.text = text
    self.tokenizer = Tokenizer(self.text)
    self.parser = Parser(self.tokenizer.create_tokens())
    self.GLOBAL_VARS = dict()
def process(filename):
    with open(filename) as file:
        text = file.read()
    if not text:
        raise Exception("Cannot read text from file")
    tokenizer = Tokenizer(text)
    parser = Parser(tokenizer.create_tokens())
    tree = parser.parse()
    symbol_table_builder = SemanticAnalyzer()
    symbol_table_builder.visit(tree)
def test_returns_tokens_until_exhausted(self):
    expected = [
        Token('class', 'keyword', 1, 1),
        Token('CorrectSyntax', 'identifier', 1, 7),
        Token('{', 'symbol', 1, 21),
        Token('field', 'keyword', 2, 3),
        Token('String', 'identifier', 2, 9),
        Token('bar', 'identifier', 2, 16),
        Token(';', 'symbol', 2, 19),
        Token('constructor', 'keyword', 4, 3),
        Token('CorrectSyntax', 'identifier', 4, 15),
        Token('new', 'identifier', 4, 29),
        Token('(', 'symbol', 4, 32),
        Token(')', 'symbol', 4, 33),
        Token('{', 'symbol', 4, 35),
        Token('let', 'keyword', 5, 5),
        Token('bar', 'identifier', 5, 9),
        Token('=', 'symbol', 5, 13),
        Token('Hello world!', 'stringConst', 5, 15),
        Token(';', 'symbol', 5, 29),
        Token('return', 'keyword', 6, 5),
        Token('this', 'keyword', 6, 12),
        Token(';', 'symbol', 6, 16),
        Token('}', 'symbol', 7, 3),
        Token('method', 'keyword', 9, 3),
        Token('void', 'keyword', 9, 10),
        Token('greetings', 'identifier', 9, 15),
        Token('(', 'symbol', 9, 24),
        Token(')', 'symbol', 9, 25),
        Token('{', 'symbol', 9, 27),
        Token('do', 'keyword', 10, 5),
        Token('Output', 'identifier', 10, 8),
        Token('.', 'symbol', 10, 14),
        Token('printString', 'identifier', 10, 15),
        Token('(', 'symbol', 10, 26),
        Token('bar', 'identifier', 10, 27),
        Token(')', 'symbol', 10, 30),
        Token(';', 'symbol', 10, 31),
        Token('return', 'keyword', 11, 5),
        Token(';', 'symbol', 11, 11),
        Token('}', 'symbol', 12, 3),
        Token('}', 'symbol', 13, 1)
    ]
    filename = TEST_FILES / 'CorrectSyntax.jack'
    tokenizer = Tokenizer(filename)
    tokens = []
    while not tokenizer.finished():
        token = tokenizer.next()
        if not token:
            break
        tokens.append(token)
    self.assertEqual(tokens, expected)
def __init__(self, trec_dir, stopword_file):
    parser = Parser()
    for filename in os.listdir(trec_dir):
        try:
            with open(os.path.join(trec_dir, filename), 'r') as f:
                parser.load(f.read())
        except Exception:
            pass  # skip files that cannot be read or parsed
    self.docs = parser.docs
    with open(stopword_file, 'r') as f:
        stopwords = f.readlines()
    self.tokenizer = Tokenizer(stopwords)
class DictionaryBuilder(BaseBuilder):
    def __init__(self):
        super().__init__()
        self.dictionary = set()
        self.count = 0
        self.tokenizer = Tokenizer()

    def run(self, input_dir_path, output_path):
        self.load_files(input_dir_path)
        self.build()
        self.save(output_path)
        self.print_counts()

    def build(self):
        for file in self.files:
            with open(file, 'r') as f:
                lines = f.readlines()
            words = self.tokenizer.format_data(lines)
            self.count += len(words)
            self.dictionary.update(words)

    def save(self, output_path):
        with open(output_path, 'w') as f:
            f.write('\n'.join(sorted(self.dictionary)))

    def print_counts(self):
        print('Total count: ', self.count)
        print('Dictionary count: ', len(self.dictionary))
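A minimal usage sketch for the builder above, assuming BaseBuilder.load_files() populates self.files from the input directory; the corpus and output paths here are hypothetical.

# Hedged usage sketch; paths are hypothetical and load_files() is assumed
# to fill self.files with the text files found under the input directory.
builder = DictionaryBuilder()
builder.run('data/corpus', 'output/dictionary.txt')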
def __init__(self,
             tokenizer_path: str,
             model: UnifiedTransformer,
             max_turns: int = 64,
             device: Optional[str] = None):
    self.tokenizer = Tokenizer(tokenizer_path=tokenizer_path)
    self.batch_preparer = BatchPreparing(
        sep_index=self.tokenizer.sep_index,
        context_index=self.tokenizer.context_index,
        pad_index=self.tokenizer.pad_index)
    self.model = model
    self.max_turns = max_turns
    if device is None:
        self.device = next(self.model.parameters()).device
    else:
        self.device = torch.device(device)
    self.model.to(self.device)
    self.model.eval()
    self.model.set_seq2seq()
def _search_word(self, word) -> Set[int]:
    term_id = self.term_map.get(Tokenizer.normalize(word))
    if term_id is None:
        return set()
    term_info = self.term_dic[term_id]
    ri_offset = term_info[2]
    return set(self.r_indices[ri_offset])
def main(args):
    set_seed(args.seed)
    Path(args.model_dir).mkdir(parents=True, exist_ok=True)
    tk = Tokenizer(args.tokenizer)
    model = TransformerModel(d_model=32,
                             d_ff=64,
                             dropout=.0,
                             layers=3,
                             heads=4,
                             d_emb=-1,
                             pad_token_id=tk.pad_id,
                             vocab_size=tk.vocab_size)
    ds = Arithmetic(args.data)
    print(
        f'model size = {sum(p.numel() for p in model.parameters() if p.requires_grad)/1024/1024:.2f} M trainable parameters'
    )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train(model=model,
          dataset=ds,
          batch_size=args.batch_size,
          device=device,
          tokenizer=tk,
          epochs=args.epochs,
          model_dir=args.model_dir,
          save_epoch=args.save_epoch,
          summary_step=args.summary_step,
          lr=args.lr)
def compile_all_jack_files(jack_files):
    try:
        os.mkdir("xml-export")
    except FileExistsError:
        pass  # output directory already exists
    for jack_file in jack_files:
        with open(jack_file) as file:
            code = remove_all_comments(file.read())
        tokenizer = Tokenizer(code)
        tokenizer.export_xml(jack_file.replace(".jack", ""))
        try:
            parser = Parser(tokenizer)
            parser.export_xml(jack_file.replace(".jack", ""))
        except Exception as e:
            print(e)
            exit()
def parse(self):
    """
    Replace sentiment with 2: Positive, 1: Neutral, 0: Negative
    """
    tk = Tokenizer(preserve_case=False)
    with open(self.trainingPath) as training:
        tsvRead = csv.reader(training, delimiter="\t")
        enum = {'positive': 2, 'neutral': 1, 'negative': 0, 'unknown': 3}
        tweet_dict = {}
        for line in tsvRead:
            phrase = tk.tokenize(line[1])  # tokenize once and reuse
            if phrase:
                # truncate if a sentiment label token appears late in the tweet
                for i, word in enumerate(phrase):
                    if i > 50 and word in ["neutral", "positive", "negative", "unknown"]:
                        phrase = phrase[:i]
                        break
                self.data.append({'Sentiment': enum[line[0]], 'Tweet': phrase})
def test_skips_rest_of_line_after_line_comment(self):
    expected = [
        Token('let', 'keyword', 1, 1),
        Token('foo', 'identifier', 1, 5),
        Token('=', 'symbol', 1, 9),
        Token('5', 'intConst', 1, 11),
        Token(';', 'symbol', 1, 12)
    ]
    filename = TEST_FILES / 'IgnoreLineComment.jack'
    tokenizer = Tokenizer(filename)
    tokens = []
    while not tokenizer.finished():
        token = tokenizer.next()
        if not token:
            break
        tokens.append(token)
    self.assertEqual(tokens, expected)
def test_skips_everything_inbetween_multiline_comment(self):
    expected = [
        Token('let', 'keyword', 5, 5),
        Token('foo', 'identifier', 5, 9),
        Token('=', 'symbol', 5, 13),
        Token('5', 'intConst', 5, 15),
        Token(';', 'symbol', 5, 16)
    ]
    filename = TEST_FILES / 'IgnoreMultilineComment.jack'
    tokenizer = Tokenizer(filename)
    tokens = []
    while not tokenizer.finished():
        token = tokenizer.next()
        if not token:
            break
        tokens.append(token)
    self.assertEqual(tokens, expected)
def build_index():
    global Index, Header
    tokenizer = Tokenizer()
    for subdir in os.listdir(config.RAW_WEBPAGES):
        full_subdir = os.path.join(config.RAW_WEBPAGES, subdir)
        if os.path.isdir(full_subdir):
            to_parse = read_directory(full_subdir)
            print("Subdirectory: ", subdir)
            for _file in tqdm(to_parse):
                filename = "/".join(_file.split("/")[1:])
                header, txt = parse(_file)
                Header[filename] = header
                token_counter = tokenizer.counter_tokenize(txt)
                for tok in token_counter:
                    if tok not in Index:
                        Index[tok] = {filename: token_counter[tok]}
                    else:
                        Index[tok][filename] = token_counter[tok]
    save_index()
    save_header()
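For reference, the Index built above maps each token to a postings dictionary of the form {filename: term_count}, so a simple lookup can be sketched as follows; the query term is hypothetical.

# Hedged sketch: rank files for one query term by raw term frequency.
# "tokenizer" is just an example term, not taken from the original code.
postings = Index.get("tokenizer", {})
top_files = sorted(postings.items(), key=lambda kv: kv[1], reverse=True)[:10]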
class IdentityMatrixBuilder(BaseBuilder):
    def __init__(self, dict_path):
        super().__init__()
        self.dict_path = dict_path
        self.dictionary = set()
        self.matrix = {}
        self.tokenizer = Tokenizer()

    def run(self, input_dir_path, output_path):
        self.load_dictionary()
        self.load_files(input_dir_path)
        self.init_matrix()
        self.build()
        self.save(output_path)

    def load_dictionary(self):
        with open(self.dict_path, 'r') as f:
            lines = self.tokenizer.filter_new_lines(f.readlines())
        self.dictionary = sorted(lines)

    def init_matrix(self):
        self.matrix = {x: [0] * len(self.files) for x in self.dictionary}

    def build(self):
        for i, file in enumerate(self.files):
            with open(file, 'r') as f:
                lines = f.readlines()
            words = self.tokenizer.format_data(lines)
            for word in words:
                self.matrix[word][i] += 1

    def save(self, output_path):
        field_names = ['Token'] + list(
            map(lambda x: x.split('/')[-1], self.files))
        with open(output_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(field_names)
            for k, v in self.matrix.items():
                writer.writerow([k] + v)
def run(src: str) -> None:
    global env

    # tokenization
    tkz = Tokenizer()
    tokens, err = tkz.tokenize(src)
    if tok_debug:
        for i in tokens:
            print(i)
    if display_errors(err, "LOX: SYNTAX ERROR"):
        return

    # don't send single EOF token to parser;
    # this allows parser to make stricter assertions while generating the AST
    if tokens[0].type == TokenType.EOF:
        return

    # parsing
    prs = Parser()
    program, err = prs.parse(tokens)
    if parse_debug:
        for tree in program:
            print(tree)
    if display_errors(err, "LOX: GRAMMAR ERROR"):
        return

    # interpretation
    itr = Interpreter(env)
    exit_status, err, env = itr.interpret(program)
    display_errors(err, "LOX: RUNTIME ERROR")
    if env_debug:
        print(env.map)
def main():
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("prop", help="Propositional statement")
    arg_parser.add_argument("--tokens", "-t", action="store_true")
    arg_parser.add_argument("--ast", "-a", action="store_true")
    arg_parser.add_argument("--symbols", "-s", action="store_true")
    arg_parser.add_argument("--truthtable", "-tt", nargs='?', const=1, type=int)
    args = arg_parser.parse_args()
    if not args.prop:
        args.prop = " "
    error_collector = ErrorCollector(args.prop)
    symbol_table = SymbolTable()
    tokenizer = Tokenizer(error_collector, symbol_table)
    parser = Parser(tokenizer, error_collector)
    ast = parser.parse()
    # For debugging
    if args.symbols:
        symbol_table.show_symbols()
    if error_collector.has_errors():
        error_collector.show_errors()
        return
    if args.tokens:
        tokenizer.show_tokens()
    if args.ast:
        print(ast)
    if args.truthtable:
        tt = TruthTable(ast, symbol_table, args.truthtable)
        tt.show()
def test_throws_if_next_invoked_while_status_is_FINISHED(self):
    mock_func = Mock()
    tokenizer = Tokenizer('')
    tokenizer._generator = mock_func
    tokenizer._status = Tokenizer.EStatus.FINISHED
    with self.assertRaisesRegex(TokenizerError, '(f|F)inished'):
        tokenizer.next()
    self.assertFalse(mock_func.called)
class TermProvider(object):
    def __init__(self, trec_dir, stopword_file):
        parser = Parser()
        for filename in os.listdir(trec_dir):
            try:
                with open(os.path.join(trec_dir, filename), 'r') as f:
                    parser.load(f.read())
            except Exception:
                pass  # skip files that cannot be read or parsed
        self.docs = parser.docs
        with open(stopword_file, 'r') as f:
            stopwords = f.readlines()
        self.tokenizer = Tokenizer(stopwords)

    def __iter__(self):
        return self.tokenizer.iter_terms(self.docs)
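A hedged usage sketch for TermProvider: since __iter__ delegates to Tokenizer.iter_terms, the provider can be consumed directly in a loop; the directory and stopword paths below are hypothetical.

# Hedged usage sketch; paths are hypothetical and iter_terms() is assumed
# to yield one term at a time over the loaded documents.
provider = TermProvider('data/trec', 'data/stopwords.txt')
for term in provider:
    print(term)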
def test_throws_if_string_constant_has_missing_end_double_quote(self):
    filename = TEST_FILES / 'MalformedString.jack'
    tokenizer = Tokenizer(filename)
    line = 5
    character = 15
    for i in range(16):
        tokenizer.next()
    with self.assertRaisesRegex(
            TokenizerError,
            f'(L|l)ine.*{line}.*(C|c)haracter.*{character}'):
        tokenizer.next()
def test_throws_if_string_constant_has_unescaped_double_quote_within(self):
    filename = TEST_FILES / 'UnescapedDoubleQuotesInString.jack'
    tokenizer = Tokenizer(filename)
    line = 5
    character = 15
    for i in range(16):
        tokenizer.next()
    with self.assertRaisesRegex(
            TokenizerError,
            f'(L|l)ine.*{line}.*(C|c)haracter.*{character}'):
        tokenizer.next()
def run(self):
    if self.config.download:
        logger.info('Download')
        collector.download()
    if self.config.train_bpe:
        logger.info('Train BPE')
        collector.train_bpe()
    self.tokenizer = Tokenizer(tokenizer_path=self.bpe_model_path,
                               need_bos=True,
                               need_eos=True,
                               sep_token=self.config.sep_token,
                               context_token=self.config.context_token)
    if self.config.collect_data:
        logger.info('Parse data')
        self.make_dir(self.train_dir, override=True)
        self.make_dir(self.validation_dir, override=True)
        collector.collect()
def main(args):
    set_seed(args.seed)
    Path(args.model_dir).mkdir(parents=True, exist_ok=True)
    tk = Tokenizer(args.tokenizer)
    model = TransformerModel(
        d_model=768,
        d_ff=1024,
        dropout=args.dropout,
        layers=args.layer,
        heads=args.heads,
        d_emb=-1,
        pad_token_id=tk.pad_id,
        vocab_size=tk.vocab_size)
    ds = NewsDataset(args.data,
                     args.alpha,
                     args.beta,
                     inplace=args.inplace,
                     sample=args.sample,
                     seed=args.seed)
    print(
        f'model size = {sum(p.numel() for p in model.parameters() if p.requires_grad)/1024/1024:.2f} M trainable parameters')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train(
        model=model,
        dataset=ds,
        batch_size=args.batch_size,
        device=device,
        tokenizer=tk,
        epochs=args.epochs,
        model_dir=args.model_dir,
        save_epoch=args.save_epoch,
        summary_step=args.summary_step,
        lr=args.lr,
        warnup=args.warnup)
    torch.cuda.empty_cache()
def main(args):
    set_seed(args.seed)
    tk = Tokenizer(args.tokenizer)
    ds = Arithmetic(args.data)
    dl = torch.utils.data.DataLoader(dataset=ds,
                                     batch_size=args.batch_size,
                                     shuffle=False,
                                     collate_fn=ds.get_collate_fn(tk))
    model = TransformerModel(d_model=32,
                             d_ff=64,
                             dropout=.0,
                             layers=3,
                             heads=4,
                             d_emb=-1,
                             pad_token_id=tk.pad_id,
                             vocab_size=tk.vocab_size)
    device = torch.device(args.device)
    model.load_state_dict(torch.load(args.ckpt, map_location=device)['model'])
    model.to(device)
    start = timeit.default_timer()
    total = 0
    tp = 0
    for x, y in tqdm(dl):
        p = beam_search_v2(model, x, tk,
                           lambda b, nx, ny: (nx + ny) * b > 4096 * 6 * 64,
                           1, device, 10)
        r = list(map(lambda i: i[0] == i[1], zip(p, y)))
        total += len(r)
        tp += np.count_nonzero(r)
    print(timeit.default_timer() - start)
    print(tp / total)
class TestTokenizer(unittest.TestCase):
    """Tests the Tokenizer utility class methods"""

    def setUp(self):
        self.text_input = [
            'Hi! , How are you?',
            'How everything is going?',
            'how old are you'
        ]
        self.tokenizer = Tokenizer()

    def test_tokenizer_dictionary_contains_special_tokens(self):
        """tests that, after creation, the dictionary contains the <sos>,
        <eos>, <pad> and <unk> tokens"""
        expected_num_dictionary_items = len(
            ['<sos>', '<eos>', '<pad>', '<unk>'])
        self.assertEqual(expected_num_dictionary_items,
                         self.tokenizer.dictionary_size)

    def test_text_to_number_works_without_trimming(self):
        """tests that fitting without trimming keeps every token in the
        dictionary."""
        self.tokenizer.fit_on_text(self.text_input, min_keep_frequency=0)
        expected_dictionary_size = 14
        self.assertEqual(expected_dictionary_size,
                         self.tokenizer.dictionary_size)

    def test_text_to_number_works_with_trimming(self):
        """tests that fitting with trimming drops infrequent tokens from the
        dictionary."""
        self.tokenizer.fit_on_text(self.text_input, min_keep_frequency=3)
        expected_dictionary_size = 5
        self.assertEqual(expected_dictionary_size,
                         self.tokenizer.dictionary_size)

    def test_text_to_numbers(self):
        """tests the tokenizer converts text into numbers"""
        input_text = ['how are you?']
        self.tokenizer.fit_on_text(input_text)
        text_indexes = self.tokenizer.convert_text_to_number(input_text)
        expected = [[4, 5, 6, 7, 2]]
        comparison = expected == text_indexes
        self.assertTrue(all(comparison[0]))

    def test_numbers_to_text(self):
        """tests the tokenizer converts numbers back into text"""
        input_text = ['pytorch is awesome']
        self.tokenizer.fit_on_text(input_text)
        text = self.tokenizer.convert_number_to_text([4, 5, 6, 2])
        expected = input_text[0]
        self.assertEqual(expected, text)

    def test_filter_is_filtering_long_sentences(self):
        """tests that the filter function removes overlong sequences jointly
        from both sources and targets"""
        source_numbers = [[1, 4], [4, 5, 6], [9]]
        target_numbers = [[11, 22, 33, 44], [44, 55], [88, 99, 100, 110]]
        filtered_sources, filtered_targets = self.tokenizer.filter(
            source_numbers,
            target_numbers,
            max_token_size=3,
            remove_unknown=False)
        expected_source = [[4, 5, 6]]
        expected_targets = [[44, 55]]
        self.assertListEqual(expected_source[0], filtered_sources[0])
        self.assertEqual(expected_targets[0], filtered_targets[0])

    def test_filter_removes_token_containing_unknown_token_index(self):
        """tests that the filter function removes pairs containing the unknown
        token index"""
        unknown_index = self.tokenizer.unknown_index
        source_numbers = [[1, unknown_index], [4, 5], [9]]
        target_numbers = [[11, 22, 33], [44, unknown_index], [88, 99, 100]]
        filtered_sources, filtered_targets = self.tokenizer.filter(
            source_numbers,
            target_numbers,
            max_token_size=3,
            remove_unknown=True)
        expected_source = [[9]]
        expected_targets = [[88, 99, 100]]
        self.assertListEqual(expected_source[0], filtered_sources[0])
        self.assertEqual(expected_targets[0], filtered_targets[0])
def __init__(self):
    super().__init__()
    self.tokenizer = Tokenizer()
    self.index = defaultdict(list)
d = list(
    map(lambda x: (x[0], float(x[1])),
        map(lambda x: x.strip().split('\t'),
            filter(len, open('data/t2.1.vocab').readlines()))))
w, f = zip(*d)
f = np.array(f)
print(np.exp(f[39:]).sum())
l = list(map(len, w[39:]))
freq = sorted(dict(Counter(l)).items(), key=lambda x: x[0])
print('\n'.join(f'|{i}|{j}|' for i, j in freq))

# In[]
ds = NewsDataset('data/news_dataset_tag10_v2.1.db')
# ds = NewsDataset('data/wiki.db')
tk = Tokenizer('data/t2.1_c1')

# In[]
from src.utils import peek

d = peek(ds.data, 1)
print(d[0][2])
print(tk.detokenize(tk.tokenize(d[0][2])))

# In[]
ll = list(map(lambda x: len(x[2]), ds.data))
sl = sorted(ll)
print(sl[0])
print(sl[int(len(sl) * 0.25)])
print(sl[int(len(sl) * 0.5)])
print(sl[int(len(sl) * 0.75)])
print(sl[-1])
def parse(code):
    return Parser(Tokenizer(code).run()).run()
arg_parser.add_argument('--prefix', type=str, default='char_level_gru')
arg_parser.add_argument('--metric_name', type=str, default='val_ppl')
arg_parser.add_argument('--device', choices=['cuda', 'cpu'], default='cpu')
arg_parser.add_argument('--seed', type=int, default=42)
args = arg_parser.parse_args()

seed_all(args.seed)

print('########################################')
print('Load data')
train_data = WordsDataset._read_data(args.train_path)
valid_data = WordsDataset._read_data(args.valid_path)
print(f'Train size: {len(train_data)}\nValid size: {len(valid_data)}')

print('########################################')
tokenizer = Tokenizer(train_data)

print('########################################')
print('Build datasets')
train_dataset = WordsDataset(train_data, tokenizer, max_length=args.max_length)
valid_dataset = WordsDataset(valid_data, tokenizer, max_length=args.max_length)

print('########################################')
print('Build dataloaders')
train_loader = DataLoader(train_dataset,
                          batch_size=args.batch_size,
                          shuffle=True)
def setUp(self):
    self.text_input = [
        'Hi! , How are you?',
        'How everything is going?',
        'how old are you'
    ]
    self.tokenizer = Tokenizer()
def tokenizeUserInput(filmReviewList):
    print("[2] Tokenization of the film reviews started")
    tokenList = Tokenizer.doTokenization(filmReviewList)
    print("[2] Tokenization of the review comments completed. Tokens: " + str(tokenList))
    return tokenList