def __init__(self, input_dir, output_dir, cui_vocab_size, code_vocab_size):
    """Construct it"""
    self.input_dir = input_dir
    self.output_dir = output_dir

    if os.path.isdir(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)

    # encounter ids -> icd code sets
    self.enc2codes = {}

    diag_code_path = os.path.join(self.output_dir, diag_icd_file)
    proc_code_path = os.path.join(self.output_dir, proc_icd_file)
    cpt_code_path = os.path.join(self.output_dir, cpt_code_file)

    self.index_codes(diag_code_path, 'ICD9_CODE', 'diag', 3)
    self.index_codes(proc_code_path, 'ICD9_CODE', 'proc', 2)
    self.index_codes(cpt_code_path, 'CPT_NUMBER', 'cpt', 5)

    # index inputs (cuis)
    self.input_tokenizer = tokenizer.Tokenizer(
        n_words=None if cui_vocab_size == 'all' else int(cui_vocab_size),
        lower=False)
    self.tokenize_input()

    # index outputs (codes)
    self.output_tokenizer = tokenizer.Tokenizer(
        n_words=None if code_vocab_size == 'all' else int(code_vocab_size),
        lower=False)
    self.tokenize_output()
def test_api(self):
    response = tokenizer.Tokenizer('@ Wikipedia summary: term: Ford limit: 3 key: z')
    self.assertEquals(response.api, 'wikipedia')

    response = tokenizer.Tokenizer('@Yelp search: longlat: true near: 40.74503998,-73.99879607 category: Pizza key: z limit: 1')
    self.assertEquals(response.api, 'yelp')

    response = tokenizer.Tokenizer('@ Maps directions: from: chapel hill, NC to: Durham, NC key: z mode: walking')
    self.assertEquals(response.api, 'maps')
def test_arguments_dict(self):
    response = tokenizer.Tokenizer('@ Wikipedia : term: Ford limit: 3 key: z')
    self.assertEquals(len(response.arguments_dict), 4)
    self.check_arguments(response.arguments_dict, ['term', 'limit', 'key'])

    response = tokenizer.Tokenizer('@Yelp search: longlat: true near: 40.74503998,-73.99879607 category: Pizza key: z limit: 1')
    self.assertEquals(len(response.arguments_dict), 6)
    self.check_arguments(response.arguments_dict, ['longlat', 'near', 'category', 'key', 'limit'])

    response = tokenizer.Tokenizer('@ yelp search : key: f near: Durham, NC category: Mexican')
    self.assertEquals(len(response.arguments_dict), 4)
    self.check_arguments(response.arguments_dict, ['near', 'category', 'key'])
def test_program(self):
    response = tokenizer.Tokenizer('@ Wikipedia : term: Ford limit: 3 key: z')
    self.assertEquals(response.program, 'default')

    response = tokenizer.Tokenizer('@ Wikipedia summary: term: Ford limit: 3 key: z')
    self.assertEquals(response.program, 'summary')

    response = tokenizer.Tokenizer('@ Wikipedia search: term: Ford limit: 3 key: z')
    self.assertEquals(response.program, 'search')

    response = tokenizer.Tokenizer('@ Maps directions: from: chapel hill, NC to: Durham, NC key: z mode: walking')
    self.assertEquals(response.program, 'directions')

    response = tokenizer.Tokenizer('@ Maps 342dfs: from: chapel hill, NC to: Durham, NC key: z mode: walking')
    self.assertEquals(response.program, '342dfs')
def identifier_token(self, parser_list, var_list):
    code = ""
    identifier_cons = tokenizer.Tokenizer(parser_list[0])
    identifier_bool, length = identifier_cons.identifier(parser_list[0])
    if (identifier_bool):
        current_identifier = parser_list.pop(0)
        # known local variable: return its slot index
        if (current_identifier in var_list):
            found_variable = var_list.index(current_identifier)
            var_ret = "local[" + str(found_variable) + ']'
            code += current_identifier
            return parser_list, code, var_list, var_ret
        # known global variable (stored with a trailing '*' marker)
        current_identifier_glb_form = current_identifier + '*'
        if (current_identifier_glb_form in var_list):
            found_variable = var_list.index(current_identifier_glb_form)
            var_ret = "global[" + str(found_variable) + ']'
            code += current_identifier
            return parser_list, code, var_list, var_ret
        # unseen identifier: allocate a new variable slot
        new_variable = self.new_var(var_list)
        var_list.append(current_identifier)
        id_left = new_variable
        code += current_identifier
        return parser_list, code, var_list, id_left
    raise Exception('Fail')
def input_fn(seq_len=100, batch_size=512):
    """Return a dataset of source and target sequences for training."""
    with io.open(SHAKESPEARE_TXT, encoding='utf-8', errors='ignore') as f:
        txt = f.read().lower()
    # txt = txt.replace('. ', ' ')
    txt = re.sub(r'\(cid:[0-9]{1,3}\)', '', txt)
    txt = re.sub(r'\\x[a-z0-9]{2}', '', txt)

    my_tokenizer = tokenizer.Tokenizer(exceptions=[])
    tokens = my_tokenizer.split(txt, False, '. ')
    # txt = [c for c in txt if 0 < ord(c) < 255 or 894 <= ord(c) <= 974]
    # with tf.io.gfile.GFile(SHAKESPEARE_TXT, 'r', encoding='ISO-8859-1utf-8') as f:
    #     txt = f.read()

    global words_indices
    global indices_words
    words_indices, indices_words = get_all_words(tokens)
    print(len(words_indices))

    source = tf.constant(transform(tokens, words_indices), dtype=tf.int32)
    ds = tf.data.Dataset.from_tensor_slices(source).batch(seq_len + 1, drop_remainder=True)
    BUFFER_SIZE = 10000
    ds = ds.map(split_input_target).shuffle(BUFFER_SIZE).batch(
        batch_size, drop_remainder=True)
    return ds.repeat()
def number(self, parser_list):
    number_cons = tokenizer.Tokenizer(parser_list[0])
    bool_number, length = number_cons.number(parser_list[0])
    if (bool_number):
        parser_list.pop(0)
        return parser_list
    raise Exception('Fail')
def create_list_object(liststr, verbose=None):
    namespacelist = list()  # The default.
    if '::' in liststr:  # Do this for good measure. Allows this:
        liststr = liststr.replace('::', ' :: ')  # all[1 2::x y]ok!
    import tokenizer
    Tsub = tokenizer.Tokenizer()
    if verbose:
        Tsub.diagnostic_on()
    Tsub.tokenize_input(liststr)
    tokenQ = Tsub.get_token_queue()
    if '::' in tokenQ:
        if tokenQ.count('::') > 1:
            raise IOError("Only one namespace section (::) allowed!")
        else:  # Must be the correctly specified one.
            c = 0  # Counter
            for t in tokenQ:  # Loop through tokens of this list content.
                c += 1  # Increment counter
                if '::' == t:  # Found a namespace separator?
                    namespacelist = tokenQ[c:]
                    del tokenQ[c - 1:]
    object_sublist = list()
    for token in tokenQ:
        if not token is None:  # Avoid the `None` that pops up with [], etc.
            object_sublist.append(objectify(token))  # Possibly recursive!
    L = StackOB_LST(object_sublist)
    if '!' in namespacelist:
        L.dead = False
        # L.names = ['!']  # These are important now. Keep them.
    else:
        L.dead = True
    L.names = namespacelist
    return L
def parse(self, text):
    tokens = tokenizer.Tokenizer(text, self.ui)
    while (tokens.hasTokens()):
        if (Callback.peek(tokens)):
            self.constructs.append(Callback(tokens, parser=self))
        elif (Interface.peek(tokens)):
            self.constructs.append(Interface(tokens, parser=self))
        elif (Mixin.peek(tokens)):
            self.constructs.append(Mixin(tokens, parser=self))
        elif (Namespace.peek(tokens)):
            self.constructs.append(Namespace(tokens, parser=self))
        elif (Dictionary.peek(tokens)):
            self.constructs.append(Dictionary(tokens, parser=self))
        elif (Enum.peek(tokens)):
            self.constructs.append(Enum(tokens, parser=self))
        elif (Typedef.peek(tokens)):
            self.constructs.append(Typedef(tokens, parser=self))
        elif (Const.peek(tokens)):  # Legacy support (SVG spec)
            self.constructs.append(Const(tokens, parser=self))
        elif (ImplementsStatement.peek(tokens)):
            self.constructs.append(ImplementsStatement(tokens, parser=self))
        elif (IncludesStatement.peek(tokens)):
            self.constructs.append(IncludesStatement(tokens, parser=self))
        else:
            self.constructs.append(SyntaxError(tokens, None, parser=self))
def main():
    new_tweet_generator = tweet_generator("../Hillary_Trump_24_8_2016.dat")
    keep_going = True
    count = 0
    tok = tokenizer.Tokenizer(preserve_case=False)
    # order: Hillary, Trump, both, neither
    names_count = [0, 0, 0, 0]
    while keep_going:
        try:
            count += 1
            whois = hillary_or_trump(next(new_tweet_generator), tok)
            print type(whois)
            if whois == 'h':
                names_count[0] += 1
            elif whois == 't':
                names_count[1] += 1
            elif whois == 'b':
                names_count[2] += 1
            else:
                names_count[3] += 1
            print names_count
        except StopIteration:
            print 'looks like the end of file'
            keep_going = False
        except Exception, e:
            print str(e)
def searchQuery(self, query):
    """
    Method for searching a query in the database
    Returns dictionary with filenames and positions of all tokens from the query
    @param query: a string of tokens to be found
    @return: dictionary of filenames as keys and positions of tokens as values
    """
    if not (isinstance(query, str)):
        raise ValueError
    rawTokens = tokenizer.Tokenizer().genclasstokenize(query)
    tokens = indexator.Indexator.relevancyFilter(rawTokens)
    responds = []  # list of responds of search function for each token in the query
    for token in tokens:
        responds.append(self.search(token.string))
    files = set(responds[0].keys())  # set of filenames from first respond
    for d in responds[1:]:  # intersection of all filenames in all responds
        files.intersection_update(set(d.keys()))
    resultDict = {}
    for file in files:  # for every file in intersection of files
        resultDict[file] = []
        for d in responds:  # write as values all positions from intersection of files
            resultDict[file] += d.get(file, [])
    return resultDict
def run(self):
    # parse command line arguments
    parser, arguments = self.parse_arguments()
    if len(sys.argv) == 2 and (sys.argv[1] == '-h' or sys.argv[1] == '--help'):
        parser.print_help()
        return
    elif len(sys.argv) < 3:
        raise RuntimeError("ERROR: URL and HTML_TAG are required")
    config = self.config(arguments)
    url, tag = sys.argv[1], sys.argv[2]
    print("Web Scraping with url={} tag={} next={} max={}"
          .format(url, tag, config.next, config.max if config.max > 0 else 'infinite'))

    # tokenize HTML_TAG
    t = tokenizer.Tokenizer()
    tag_info = t.tokenize(tag)

    # scrape HTML_TAG in URL
    s = scraper.Scraper()
    crawled_data = s.crawl(url, tag_info, config.next, config.max)

    # print out data
    for data in crawled_data:
        print(data)
def normalizedMethodName(self, methodText, interfaceName=None):
    match = re.match(r'(.*)\((.*)\)(.*)', methodText)
    if (match):
        tokens = tokenizer.Tokenizer(match.group(2))
        if (ArgumentList.peek(tokens)):
            arguments = ArgumentList(tokens, None)
            return match.group(1) + '(' + ', '.join(
                [argument.name for argument in arguments]) + ')'
        name = match.group(1) + match.group(3)
        arguments = match.group(2)
    else:
        name = methodText
        arguments = ''
    if (interfaceName):
        interface = self.find(interfaceName)
        if (interface):
            method = interface.findMethod(name)
            if (method):
                return method.methodName
        return name + '(' + arguments + ')'
    for construct in self.constructs:
        method = construct.findMethod(name)
        if (method):
            return method.methodName
    construct = self.find(name)
    if (construct and ('method' == construct.idlType)):
        return construct.methodName
    return name + '(' + arguments + ')'
def tweet_tokenizer(to_tokenize):
    """
    uses the tokenizer.py tokenize stuff, returns a tuple of the text
    of the thing, with case not preserved
    """
    tok = tokenizer.Tokenizer(preserve_case=False)
    return tuple(tok.tokenize(to_tokenize))
def string_token(self, parser_list):
    string_cons = tokenizer.Tokenizer(parser_list[0])
    string_bool = string_cons.string(parser_list[0])
    if (string_bool):
        parser_list.pop(0)
        return parser_list
    raise Exception('Fail')
def __init__(self, lang_code, rev):
    tk_file_path = 'dataset/{}/{}.tokens'.format(lang_code, lang_code)

    # Tokenizer
    self.tokenizer = tokenizer.Tokenizer(tk_file_path, rev)

    try:
        train_dataset_path = 'dataset/{}/{}-train.csv'.format(lang_code, lang_code)
        self.train_dataset = pd.read_csv(train_dataset_path, header=None).values.tolist()
    except:
        print('Cannot open file: ', train_dataset_path)
        exit()

    try:
        val_dataset_path = 'dataset/{}/{}-val.csv'.format(lang_code, lang_code)
        self.val_dataset = pd.read_csv(val_dataset_path, header=None).values.tolist()
    except:
        print('Cannot open file: ', val_dataset_path)
        exit()

    try:
        test_dataset_path = 'dataset/{}/{}-test.csv'.format(lang_code, lang_code)
        self.test_dataset = pd.read_csv(test_dataset_path, header=None).values.tolist()
    except:
        print('Cannot open file: ', test_dataset_path)
        exit()

    self.inp_vocab_size = self.tokenizer.inp_vocab_size
    self.tar_vocab_size = self.tokenizer.tar_vocab_size
def mted(tokenPath, sources, compress):
    tok = tokenizer.Tokenizer(tokenPath)
    functions = tok.split_functions(False)

    # sort them appropriately
    def comp(a, b):
        lena = len(a[1])
        lenb = len(b[1])
        if lena == lenb:
            # if lengths are tied, sort alphabetically based on function name
            if a[0] < b[0]:
                return -1
            else:
                return 1
        else:
            return lena - lenb
    functions.sort(comp)

    # compress and output
    results = ""
    for funct in functions:
        if funct[2] in sources:
            if compress:
                results += tokenizer.compress_tokens(funct[1])
            else:
                results += " ".join(funct[1])
            if compress == False:
                results += " "

    # return results
    return results.strip()
def normalizedMethodNames(self, methodText, interfaceName=None):
    match = re.match(r'(.*)\((.*)\)(.*)', methodText)
    if (match):
        tokens = tokenizer.Tokenizer(match.group(2))
        if (ArgumentList.peek(tokens)):
            arguments = ArgumentList(tokens, None)
            return [match.group(1).strip() + '(' + argumentName + ')'
                    for argumentName in arguments.argumentNames]
        name = match.group(1).strip() + match.group(3)
        argumentNames = [argument.strip() for argument in match.group(2).split(',')]
    else:
        name = methodText
        argumentNames = None
    if (interfaceName):
        interface = self.find(interfaceName)
        if (interface):
            methods = interface.findMethods(name, argumentNames)
            if (methods):
                return list(itertools.chain(*[method.methodNames for method in methods]))
        return [name + '(' + ', '.join(argumentNames or []) + ')']
    for construct in self.constructs:
        methods = construct.findMethods(name, argumentNames)
        if (methods):
            return list(itertools.chain(*[method.methodNames for method in methods]))
    construct = self.find(name)
    if (construct and ('method' == construct.idlType)):
        return construct.methodNames
    return [name + '(' + ', '.join(argumentNames or []) + ')']
def identifier_token(self, parser_list):
    identifier_cons = tokenizer.Tokenizer(parser_list[0])
    identifier_bool, length = identifier_cons.identifier(parser_list[0])
    if (identifier_bool):
        parser_list.pop(0)
        return parser_list
    raise Exception('Fail')
def __init__(self, fileName):
    self.bookKeeperName = fileName
    self.book = {}
    self.index = {}
    self.tk = tokenizer.Tokenizer()
    self.total = 0
    self.indexSize = 0
def removeStopWordsFromData(self, tickets, targets):
    # Tokenize data
    tok = tk.Tokenizer(tickets)
    tickets_no_sw, targets_no_sw = tok.removeStopWords(tickets, targets)

    # create the array of words from all the tickets
    words = tok.extractWords()
    return tickets_no_sw, targets_no_sw, words
def parse(self):
    t = tokenizer.Tokenizer()
    for word in t.get_tokens(normalize(self.file_name)):
        self.process(word)
    if self.save:
        self.dictionary.save()
    return 0
def __main__():
    import tokenizer
    t = tokenizer.Tokenizer()
    p = Parser()
    tokens = t.tokenize('(hello\n (option1)\n option2 -l $HOME)')
    tokens = t.tokenize('(define hello\n (ls -l))')
    asts = p.parse(tokens)
    print(asts)
def string_token(self, parser_list): code = "" string_cons = tokenizer.Tokenizer(parser_list[0]) string_bool = string_cons.string(parser_list[0]) if (string_bool): current_string = parser_list.pop(0) code += current_string return parser_list, code raise Exception('Fail')
def test_disambiguation(self):
    print '----------------------------------------------------------------------'
    tokenizator = tokenizer.Tokenizer("RU")
    tokens = tokenizator.tokenize(u"я купил себе пельменей")  # Russian test input: "I bought myself some pelmeni"
    disambiguator = Disambiguator("RU")
    disambiguated = disambiguator.disambiguate(tokens)
    for index, item in enumerate(disambiguated):
        print index, item.content, item.label, item.lemma
    push_disambiguated_to_cpp(disambiguated)
def test_tokenizer(self):
    test_strings = [
        'The man who is tall is happy.',
        'Is the man who is tall happy?'
    ]
    tokenized_strings = [
        ['The', 'man', 'who', 'is', 'tall', 'is', 'happy', '.'],
        ['Is', 'the', 'man', 'who', 'is', 'tall', 'happy', '?']
    ]
    T = tokenizer.Tokenizer()
    assert T.process(test_strings) == tokenized_strings
def number(self, parser_list): code = "" number_cons = tokenizer.Tokenizer(parser_list[0]) bool_number, length = number_cons.number(parser_list[0]) if (bool_number): current_number = parser_list.pop(0) code += current_number return parser_list, code raise Exception('Fail')
def isInGrammar(grammarFile, testString, debug=False):
    g = grammar.Grammar(grammarFile)
    alphabet = g.getAlphabet()
    tokens = tokenizer.Tokenizer(tokenizer.getTTLForAlphabet(alphabet), True)
    tokens.tokenize(testString)
    if g.isInLanguage(tokens, debug):
        print("Test String in Language!")
    else:
        print("Test String NOT in Language!")
def assemble(self, asms):
    self.tokenizer = tokenizer.Tokenizer()
    token_lines = [self.tokenizer.tokenize(asm) for asm in asms]
    self.parser = parser.Parser()
    parsed_lines = self.parser.parse(token_lines)
    self.symbol_table = symbol_table.SymbolTable()
    self.symbol_table.generate(parsed_lines)
    self.generator = generator.Generator()
    codes = self.generator.generate(parsed_lines, self.symbol_table)
    return codes
def analyze_file(given_file):
    """
    creates a tokenizer for the given file, extracts all tokens and...
    :param given_file: a jack file
    :return: None
    """
    cur_tokenizer = tokenizer.Tokenizer(given_file)
    engine = CompilationEngine.CompilationEngine(cur_tokenizer)
    engine.compile()