def worker_process(i, jobs_queue, output_queue, args):
    if not args.disable_lm_filter:
        lm_filter = load_lm_filter(args.source_lang, args.target_lang,
                                   args.metadata_yaml,
                                   args.source_tokenizer_command,
                                   args.target_tokenizer_command)
    else:
        lm_filter = None

    if not args.disable_porn_removal:
        porn_removal = args.porn_removal
        if args.metadata_yaml['porn_removal_side'] == 'tl':
            porn_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
        else:
            porn_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang)
    else:
        porn_removal = None
        porn_tokenizer = None

    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {0}".format(job.__repr__()))
            nblock, filein_name = job
            ojob = None
            with open(filein_name, 'r') as filein, \
                    NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                # "line" instead of "i" so the worker index parameter is not shadowed
                for line in filein:
                    parts = line.strip().split("\t")
                    left = ""
                    right = ""
                    if len(parts) >= args.scol and len(parts) >= args.tcol:
                        left = parts[args.scol - 1]
                        right = parts[args.tcol - 1]
                    else:
                        logging.error(
                            "WARNING: scol ({}) or tcol ({}) indexes above column number ({})"
                            .format(args.scol, args.tcol, len(parts)))
                        continue

                    wrong_tu_results = wrong_tu(left, right, args, lm_filter,
                                                porn_removal, porn_tokenizer)
                    if wrong_tu_results is not False:
                        fileout.write("\t".join(parts) + "\t0")
                        if args.annotated_output:
                            fileout.write("\t{}\n".format(wrong_tu_results))
                        else:
                            fileout.write("\n")
                    else:
                        fileout.write("\t".join(parts) + "\t1")
                        if args.annotated_output:
                            fileout.write("\tkeep\n")
                        else:
                            fileout.write("\n")

                ojob = (nblock, fileout.name)

            if ojob:
                output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            break
import json
import time

from tokenizer import Tokenizer

if __name__ == "__main__":
    tokens = Tokenizer()
    tokens.read_data('./WEBPAGES_RAW/bookkeeping.json')

    start = time.time()
    tokens.find_files()
    tokens.find_single_file("39/373", "mondego.ics.uci.edu/datasets/maven-contents.txt")
    tokens.compute_tf_idf_and_insert_db()
    end = time.time()

    elapsed = end - start  # avoid shadowing the time module
    full_time = "Hours: " + str(elapsed / 60 / 60) + ", Minutes: " + str(elapsed / 60)
    print("TOTAL TOKENS: ", tokens.database.total_documents())
    print(full_time)
from typing import List

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Batch and Tokenizer are project-local classes assumed to be imported from
# the surrounding package.


def collate(data: List[str], tokenizer: Tokenizer, block_size: int) -> Batch:
    ids = tokenizer.encode(data, block_size)
    mask = tokenizer.mask(ids)
    return Batch(ids=ids, attention_mask=mask)


def build_data_iterator(tokenizer, dataset, batch_size, block_size,
                        random_sampler=False) -> DataLoader:
    sampler = RandomSampler(dataset) if random_sampler else SequentialSampler(dataset)
    iterator = DataLoader(
        dataset,
        sampler=sampler,
        batch_size=batch_size,
        collate_fn=lambda data: collate(data, tokenizer, block_size),
    )
    return iterator


if __name__ == "__main__":
    tokenizer = Tokenizer("tokenizer.model")
    with open("corpus.txt", encoding="utf-8") as f:
        dataset = f.readlines()
    iterator = build_data_iterator(tokenizer, dataset, 8, 128)
    batch = next(iter(iterator))
    print(tokenizer.decode(batch[0]))
def prefix_parser(str_statement):
    tokenizer = Tokenizer(str_statement)
    return prefix_parser_recursive(tokenizer)
    print('joke:\n', joke)
    print()
    print('generated explanation:\n', output)
    print()
    if true_output_text is not None:
        print('true explanation:\n', true_output_text)
        print()


if __name__ == '__main__':
    # load dataset
    print('loading dataset')
    tokenizer = Tokenizer(dataset_path)
    vocab_size = tokenizer.vocab_size

    model_vars = MyModel(vocab_size=vocab_size,
                         embedding_dim=embedding_dim,
                         rnn_units=rnn_units)
    model = tf.keras.Model(inputs=[model_vars.encoder_input, model_vars.decoder_input],
                           outputs=model_vars.decoder_output)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                  loss=model_vars.sparse_cross_entropy,
                  target_tensors=[model_vars.decoder_target])

    checkpoint_dir = os.path.dirname(checkpoint_path)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
]
UseDetails = "-d" in params

print "STARTED:", str(datetime.now())
start = time.time()

morph = get_morph(
    os.path.join(os.path.dirname(sys.argv[0]), "pydicts").decode("UTF8"))  # Load the Russian dictionary
morph_simple = get_morph(
    os.path.join(os.path.dirname(sys.argv[0]), "pydicts").decode("UTF8"),
    check_prefixes=False)  # Load the Russian dictionary (no prefix analysis)
tok = Tokenizer()  # Load the tokenizer
tagger = Tagger(morph, morph_simple)  # Load the tagger
syner = Synonymizer(morph_simple, params)  # Load the synonymizer
print "Synonymizer statistics loaded! It took", time.time() - start

# Read the file (two attempts: cp1251 and utf-8)
try:
    text = read_file(filename)
except Exception as e:
    error("Encoding detection failed! Windows-1251 or UTF-8 without BOM expected.",
          syner.UseDetails, str(e))
    sys.exit()

tokens = tok.tokenize(text)
import pickle
import sys

from neural_network import create_nn
from tokenizer import Tokenizer

with open('model.bin', 'rb') as model_file:
    data = pickle.load(model_file)

layout = data["layout"]
weights = data["weights"]
tokenizer = Tokenizer(data["dictionary"])
sentence = ' '.join(sys.argv[1:])

call_nn = create_nn(layout, weights)


def wrap_nn(call_nn, tokenizer: Tokenizer):
    def call(sentence: str):
        input_layer = tokenizer.input_layer_of_sentence(sentence)
        output_layer = call_nn(input_layer)
        return output_layer[0]
    return call


sentence_nn = wrap_nn(call_nn, tokenizer)
output = sentence_nn(sentence)
bzh = output > 0
def tokenize_raw_text(raw_text: str) -> list:
    tokenizer = Tokenizer(clean_empty_lines(clean_html(raw_text)))
    return tokenizer.tokenize()
def test_get_parsed_data(self):
    token = Tokenizer('../data/input.txt')
    lines = token.get_parsed_data()
    self.assertEqual(len(lines), 7)
def __init__(self):
    self._tokenizer = Tokenizer()
def addx():
    x = get_value("_1") + get_value("_2")
    VARIABLES["_r"] = x
    return x

VARIABLES["add"] = addx


def ifx():
    if get_value("_1"):
        exe_func(get_func("_2"))

VARIABLES["if"] = ifx


def loopx():
    func = get_func("_1")
    continu = True
    while continu:
        exe_func(func)
        continu = get_value("_r")

VARIABLES["loop"] = loopx


exe_func(ParseMethod(Tokenizer(), False).stmts)
def convertToLaTeX(string):
    tokenizer = Tokenizer(scanner=Scanner(string))
    parser = Parser(tokenizer=tokenizer)
    return str(parser.parseCode())
from tokenizer import Tokenizer

tk = Tokenizer()
print(tk.getToken())
tk.changeId()
print(tk.getToken())
while i < len(args):
    arg = args[i]
    if arg.startswith('-'):
        if arg == '-o':
            i += 1
        i += 1
        continue
    try:
        in_file = open(arg)
        break
    except IOError:
        print_error_msg_and_exit(f'Cannot open input file {arg}')

if in_file is sys.stdin:
    print_error_msg_and_exit('No input file')

tokenizer = Tokenizer(in_file.read())
try:
    tokens = tokenizer.all_tokens()
    analyser = Analyser(tokens)
    # analyser.c0_ast.draw()
    elf = analyser.generate()
    if '-s' in args:
        out_file.write(elf.generate_s0())
    elif '-c' in args:
        out_file.write(elf.generate_o0())
    if '-A' in args:
        analyser.c0_ast.draw(draw_full_ast=True)
    elif '-a' in args:
        analyser.c0_ast.draw(draw_full_ast=False)
except (TokenizerException, ParserException, AnalyserException) as e:
def read_text(self, filename):
    tokenizer = Tokenizer()
    words = []
    sentences = []
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.read().lower()
    for line in lines.split('\n'):
        sentences += tokenizer.split_into_sentences(line)
    for line in sentences:
        candidate_words = tokenizer.split_into_words(line)
        words += self.clean_words(candidate_words)

    counts = Counter(words)
    for key in counts:
        if counts[key] < self.minimal_frequency:
            self.low_frequency_words.append(key)
    for key in self.low_frequency_words:
        del counts[key]
    print(counts)

    word_pair_frequencies = {}
    for sent in sentences:
        words = tokenizer.split_into_words(sent)
        words = self.clean_words(words)
        for word_position in range(len(words)):
            word = words[word_position]
            if word not in word_pair_frequencies:
                word_pair_frequencies[word] = []
            start_pos = max(0, word_position - self.window_size)
            end_pos = min(len(words) - 1, word_position + self.window_size)
            for second_word_position in range(start_pos, end_pos):
                second_word = words[second_word_position]
                distance = abs(second_word_position - word_position)
                if distance == 0:
                    continue
                inverse_distance = self.window_size - distance + 1
                if second_word != word:
                    word_pair_frequencies[word].append((second_word, inverse_distance))

    for key in word_pair_frequencies.keys():
        words = []
        word_distances = word_pair_frequencies[key]
        word_distances_sums = {}
        for word_distance in word_distances:
            word = word_distance[0]
            distance = word_distance[1]
            if word in word_distances_sums:
                word_distances_sums[word] += distance  # accumulate, not overwrite
            else:
                word_distances_sums[word] = distance
            words.append(word)
        counter = Counter(words)
        for word in counter:
            counter[word] *= math.sqrt(word_distances_sums[word])
        word_pair_frequencies[key] = counter

    self.vocab_size = len(counts.keys())
    number = 0
    for key in counts.keys():
        self.vocabulary_encoded[key] = number
        self.vocabulary.append(key)
        number += 1

    self.target_probabilities = []
    for i in range(self.vocab_size):
        word = self.vocabulary[i]
        frequencies = word_pair_frequencies[word]
        probabilities = []
        for j in range(self.vocab_size):
            target_word = self.vocabulary[j]
            frequency = 0
            if target_word in frequencies:
                frequency = frequencies[target_word]
            probabilities.append(frequency)
        self.target_probabilities.append(self.softmax(probabilities))
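# The read_text() method above ends by calling self.softmax(probabilities),
# which is not shown in this snippet. A minimal, numerically stable sketch of
# what such a helper is assumed to look like (standard softmax definition;
# the name and placement are hypothetical):
import numpy as np


def softmax(values):
    # Subtract the maximum before exponentiating to avoid overflow.
    shifted = np.asarray(values, dtype=float) - np.max(values)
    exps = np.exp(shifted)
    return exps / exps.sum()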
__author__ = 'Levon'

from tree import tree, parseTree, ExpressionError
from tokenizer import token, Tokenizer

pT = parseTree()
tok = Tokenizer()

assert pT.buildParseTree(tok.tokenize("1+2")) == tree('+', '1', '2')
assert pT.buildParseTree(tok.tokenize("(x+(y*z+2))-3*((5+x)/2-4)")) == tree(
    '-',
    tree('+', 'x', tree('+', tree('*', 'y', 'z'), '2')),
    tree('*', '3', tree('-', tree('/', tree('+', '5', 'x'), '2'), '4')))
assert pT.buildParseTree(tok.tokenize("sin(x)+ln(y)*3")) == tree(
    '+', tree('sin', 'x'), tree('*', tree('ln', 'y'), '3'))
assert pT.buildParseTree(tok.tokenize('x^y*2-3')) == tree(
    '-', tree('*', tree('^', 'x', 'y'), '2'), '3')
assert pT.buildParseTree(tok.tokenize('x=y=5*3-20*sin(x+y)')) == tree(
    '=', 'x',
    tree('=', 'y', tree('-', tree('*', '5', '3'),
                        tree('*', '20', tree('sin', tree('+', 'x', 'y'))))))

# error tests
try:
    Tree = pT.buildParseTree(tok.tokenize('x***y'))
    assert False
except ExpressionError:
    assert True

try:
    Tree = pT.buildParseTree(tok.tokenize('x===y'))
    assert False
except ExpressionError:
    assert True

try:
    Tree = pT.buildParseTree(tok.tokenize('x+++y'))
    assert False
except ExpressionError:
    assert True

try:
    Tree = pT.buildParseTree(tok.tokenize('+x*3'))
    assert False
except ExpressionError:
    assert True
def tokenizer(self):
    if not self.expression:
        raise ValueError("Empty expression! Cannot process!")
    expressionLen = len(self.expression)
    i = 0
    number = ''
    current = ''
    while i < expressionLen:
        current = self.expression[i]
        if current == ' ' or current == '\n' or current == '\t':
            i = i + 1
            continue
        elif current.isnumeric():
            while current.isnumeric():
                number = number + current
                i = i + 1
                if i >= expressionLen:
                    break
                current = self.expression[i]
            if current == '.':
                number = number + current
                i = i + 1
                current = self.expression[i]
                if current.isnumeric():
                    while current.isnumeric():
                        number = number + current
                        i = i + 1
                        if i >= expressionLen:
                            break
                        current = self.expression[i]
                else:
                    raise ValueError("FLOAT USAGE: [0-9].[0-9][0-9]* ")
            token = Tokenizer('Number', float(number))
            i = i - 1
            number = ''
        elif current == '+':
            token = Tokenizer('+')
        elif current == '-':
            token = Tokenizer('-')
        elif current == '*':
            token = Tokenizer('*')
        elif current == '/':
            token = Tokenizer('/')
        elif current == '^':
            token = Tokenizer('^')
        elif current == '(':
            token = Tokenizer('(')
        elif current == ')':
            token = Tokenizer(')')
        elif current == '@':
            variable = ''
            i = i + 1
            current = self.expression[i]
            while current.isalpha():
                variable = variable + current
                i = i + 1
                if i >= expressionLen:
                    break
                current = self.expression[i]
            var = Var.getInstance()
            val = var.getVar(variable)
            if val is None:
                raise ValueError('Variable => ' + variable + ' not declared')
            token = Tokenizer('Number', val[1])
            i = i - 1
        elif current.isalpha():
            variable = ''
            while current.isalpha():
                variable = variable + current
                i = i + 1
                current = self.expression[i]
            token = Tokenizer('Variable', variable)
            i = i - 1
        elif current == '=':
            token = Tokenizer('=')
        else:
            raise ValueError("INVALID TOKEN => %s" % current)
        i = i + 1
        if token:
            self.tokens.append(token)
def test_parser(self):
    for src_filename in os.listdir(TEST_CASES_DIR):
        logger.debug("tokenizing {}".format(src_filename))
        lst_lines = Tokenizer(os.path.join(TEST_CASES_DIR, src_filename)).tokenize()
        Parser().parse_tokens(lst_lines)
def classify(self, msg):
    # read
    my_freq = Tokenizer(msg, self.tagger, self.common).get_freq()
    cls = self.__cos_sim(my_freq)
    return cls
def test_init(self):
    for src_filename in os.listdir(TEST_CASES_DIR):
        Tokenizer(os.path.join(TEST_CASES_DIR, src_filename))
from density_calculator import DensityCalculator
from tokenizer import Tokenizer
from filters import StopwordsFilter

calculator = DensityCalculator(Tokenizer(), StopwordsFilter('en'))
densities = calculator(
    '''
    To follow along with future lessons it is important that you have the right
    files and programs in your programming-historian directory. At the end of
    each lesson in this series you can download the programming-historian zip
    file to make sure you have the correct code.'''
)
print(densities)
def test_tokenize(self):
    for src_filename in os.listdir(TEST_CASES_DIR):
        logger.debug("tokenizing {}".format(src_filename))
        Tokenizer(os.path.join(TEST_CASES_DIR, src_filename)).tokenize()
def parse(self, source):
    t = Tokenizer()
    return self._parse_statements(peekable(t.tokenize(source)))
def findDeviceDeclarations(self):
    t = Tokenizer(self.deprocessedFile)
    for token in t.tokenize():
        self.allTokens.append(token)
    m = Match()
    self.deviceDclLines = m.match_device_function(self.allTokens)
    # Preprocessing of the text; left untouched for now
    return content


def load_sentences(filepath, shuffle=True):
    readList = []
    with open(filepath, 'rb') as f:
        readList = pickle.load(f)
    samples = []
    for item in readList:
        samples.append("".join(item[0]))
    if shuffle:
        random.shuffle(samples)
    return samples


file = "./static_model/vocabs.json"
tokenizer = Tokenizer(mintf, processes)
if os.path.exists(file):
    # X = load_sentences('./train_all_1209.pkl')
    tokenizer.load_vocab_from_file(file)
    # tokenizer.load(file, X)
# else:
#     X = load_sentences('./train_all_1209.pkl')
#     print("tokenize...")
#     tokenizer.fit_in_parallel(X)
#     tokenizer.save(file)

words = tokenizer.words
word2id = tokenizer.word2id
id2word = {j: i for i, j in word2id.items()}
vocab_size = len(word2id)
def __init__(self, use_stemming=True, remove_stopwords=True):
    self._tokenizer = Tokenizer(use_stemming, remove_stopwords)
import sys

from tokenizer import Tokenizer
from syntax_parser import Syntax
from grammar_parser import Grammar

with open("grammar.txt", "r") as grammar_file:
    grammar_str = grammar_file.read()
with open(sys.argv[1]) as code_file:
    code_str = code_file.read()

grammar = Grammar(grammar_str)
tokenizer = Tokenizer(grammar, code_str)
syntax = Syntax(grammar, tokenizer)
print("Compiled")
def __init__(self, use_compression=True, use_stemming=True, remove_stopwords=True):
    self.use_compression = use_compression
    self._tokenizer = Tokenizer(use_stemming, remove_stopwords)
def reverse(self, class_id):
    return


def __bool__(self):
    return True


# test
from tokenizer import Tokenizer

raw_docs = [
    " Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.",
    " Schwan-STABILO is a German maker of pens for writing colouring and cosmetics as well as markers and highlighters for office use. It is the world's largest manufacturer of highlighter pens Stabilo Boss.",
    " Q-workshop is a Polish company located in Poznań that specializes in design and production of polyhedral dice and dice accessories for use in various games (role-playing games, board games and tabletop wargames). They also run an online retail store and maintain an active forum community. Q-workshop was established in 2001 by Patryk Strzelewicz – a student from Poznań. Initially the company sold its products via online auction services but in 2005 a website and online store were established."
]

if __name__ == '__main__':
    tokenizer = Tokenizer()
    emb_vocab = EmbVocabulary(50, tokenizer.tokenizer0)

    # fit
    emb_vocab.fit()

    # get ids
    for vec in emb_vocab.get_vec(raw_docs):
        print(vec)
        print(vec.shape)

    vec_list = list(emb_vocab.get_vec(raw_docs))
    for v in vec_list:
        print(v)
    return vocabulary


sentences = np.genfromtxt('./tickets_QIT.txt', delimiter='\n', dtype=str)

language = 'italian'
max_words = None
max_length = 30

# Text preprocessor with no functionalities whatsoever
prep = TextPreprocessor(sentences)
# Add decorator to clean email bodies
prep = QITEmailBodyCleaner(prep)
# Add tokenizer decorator
prep = Tokenizer(prep, language)

# Get intermediate results
tokens = prep.preprocess()

# Build vocabulary
vocabulary = build_vocabulary(tokens, max_words=max_words)

# Add integer encoding decorator
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)

# Add padding decorator
padding_token_id = max(vocabulary.values()) + 2
prep = Padder(prep, padding_token_id, max_length)
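# The chain above wraps a base TextPreprocessor in successive decorators, each
# adding one preprocessing step and delegating the rest to the object it wraps.
# A minimal, self-contained sketch of that pattern (hypothetical class and
# method names, not the project's actual API):
class BasePreprocessor:
    def __init__(self, texts):
        self.texts = texts

    def preprocess(self):
        return list(self.texts)


class LowercaseDecorator:
    def __init__(self, wrapped):
        self.wrapped = wrapped

    def preprocess(self):
        # Delegate to the wrapped preprocessor, then apply this step.
        return [t.lower() for t in self.wrapped.preprocess()]


prep_sketch = LowercaseDecorator(BasePreprocessor(["Hello World", "Ciao"]))
print(prep_sketch.preprocess())  # ['hello world', 'ciao']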