def testAdvanceBeforePeepAheadShouldPass(self):
    global tokenizer
    tokenizer = Tokenizer('a b')
    a = tokenizer.advance()
    self.assertEqual(valueof(a), 'a')
    b = tokenizer.peepahead()
    self.assertEqual(valueof(b), 'b')
def train(training_items=[], WS="", gazetteer=None, tokenize=False, tokenizer_context=1,
          left_context=1, right_context=1, alignment_context=1, max_align_dist=1,
          nearest_lev_hyperballs=1, max_lev_dist=1, min_tok_freq=25, min_lem_freq=25):
    """Train and save a new model"""
    ####### train tokenizer #######
    if tokenize:
        tokenizer = Tokenizer(context=tokenizer_context, WS=WS)
        tokenizer.train(training_items=training_items, gazetteer=gazetteer)
    ####### train POS-tagger and lemmatizer #######
    sequential_tagger = MaxentTagger(WS=WS, left_context=left_context, right_context=right_context,
                                     min_tok_freq=min_tok_freq, min_lem_freq=min_lem_freq)
    sequential_tagger.train(training_items=training_items, gazetteer=gazetteer)
    sequential_tagger.train_lemmatizer(training_items=training_items,
                                       alignment_context=alignment_context,
                                       max_align_dist=max_align_dist,
                                       nearest_lev_hyperballs=nearest_lev_hyperballs,
                                       max_lev_dist=max_lev_dist)
    return
def printPerWordPerplexity(self, file):
    t = Tokenizer()
    print "---------------Perplexity Per Word-----------------------"
    for line in file:
        line = t.tokenize(line)
        perplexity = self.calculatePerWordPerplexity(line)
        print "%s:\t%f" % (line, perplexity)
def printPerWordPerplexityInterpolated(self, file, lambda_uni, lambda_bi):
    t = Tokenizer()
    print "---------------Perplexity Per Word-----------------------"
    for line in file:
        line = t.tokenize(line)
        perplexity = self.calculatePerWordPerplexityInterpolated(line, lambda_uni, lambda_bi)
        print "%s:\t%f" % (line, perplexity)
def testadvanceMultipleTime(self):
    global tokenizer
    tokenizer = Tokenizer('a b c ;')
    a = tokenizer.advance()
    self.assertEqual(valueof(a), 'a')
    b = tokenizer.advance()
    self.assertEqual(valueof(b), 'b')
    c = tokenizer.advance()
    self.assertEqual(valueof(c), 'c')
def testadvanceShouldReturnEndWhenNoMoreToken(self):
    global tokenizer
    tokenizer = Tokenizer('a')
    a = tokenizer.advance()
    self.assertEqual(valueof(a), 'a')
    end = tokenizer.advance()
    self.assertEqual(valueof(end), '(end)')
    end = tokenizer.advance()
    self.assertEqual(valueof(end), '(end)')
def testAdvanceBeforePeepAheadTwiceShouldPass(self):
    global tokenizer
    tokenizer = Tokenizer('a')
    a = tokenizer.advance()
    self.assertEqual(valueof(a), 'a')
    end = tokenizer.advance()
    self.assertEqual(valueof(end), '(end)')
    end = tokenizer.peepahead()
    self.assertEqual(valueof(end), '(end)')
def __init__(self, depth):
    tokenizer = Tokenizer()
    tokenizer.on_word = lambda x: self.__on_word(x)
    self.tokenizer = tokenizer
    root = Node(Shared.make_key([""]))
    self.nodeByWords = {root.key: root}
    self.depth = depth
    self.history = [root]
    self.wordCount = 0
def printPerWordPerplexityInterpolated(self, file, lu, lb, lt):
    t = Tokenizer()
    print "---------------Perplexity Per Word-----------------------"
    pReturn = []
    for line in file:
        line = t.tokenize(line)
        perplexity = self.calculatePerWordPerplexityInterpolated(line, lu, lb, lt)
        print "%s:\t%f" % (line, perplexity)
        pReturn.append((line, perplexity))
    return pReturn
def preprocessing(keywords):
    '''Preprocessing the input keywords'''
    global start
    # print "use-preprocessing: ", time.clock() - start
    tokens = tok.normalizer(keywords.lower())
    tokens = tok.stop_words(tokens)
    tokens = tok.stemmer(tokens)
    tokens = tok.lemmatizer(tokens)
    return tokens
def phraseQuery(phrase, trie):
    # First tokenize the phrase using the same tokenizer we used to build the index
    tokenizer = Tokenizer()
    phrase = tokenizer.stemQuery(phrase)
    result = []
    # Get the occurrence dictionary for each word and append it to result
    for word in phrase:
        occurrences = trie.getOccurrences(word)
        # If the dict is empty do not append it
        if occurrences != []:
            result.append(occurrences)
    # If result is shorter than phrase, some word never occurs, so there can be
    # no phrase matches and we can return an empty list
    if len(phrase) != len(result):
        return []
    # For each docID of the first term, check that every following term appears in the
    # same document at the next position. (Choosing the word with the fewest occurrences
    # first would be faster, but that goes beyond what is required here.)
    result2 = set()
    requiredMatches = len(result) - 1
    # Same thing: check for no occurrences at all
    if result == []:
        return set()
    firstTerm = result[0]
    for docID in firstTerm:
        # For each docID, compare each position with position + 1 in the other dictionaries
        matches = 0
        positions = firstTerm[docID]
        # If 0 occurrences
        if positions == 0:
            continue
        for position in positions:
            position2 = position + 1
            for i in range(1, requiredMatches + 1):
                # Make sure the word appears in the same document and position + 1 exists
                if docID in result[i] and position2 in result[i][docID]:
                    matches += 1
                    position2 += 1
            if matches == requiredMatches:
                result2.add(docID)
    return result2
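# The positional-intersection idea used by phraseQuery above can be shown in isolation.
# This is a self-contained sketch over plain dicts with invented toy postings; it does not
# call the trie-backed index or the Tokenizer, and all names here are illustrative only.
postings = [
    {1: [4, 17], 2: [3]},   # positions of the first phrase word, per docID
    {1: [5, 30], 2: [9]},   # positions of the second phrase word, per docID
]

def phrase_docs(postings):
    """Return docIDs where the words occur at consecutive positions."""
    matches = set()
    first, rest = postings[0], postings[1:]
    for doc_id, positions in first.items():
        for pos in positions:
            # every later word must appear in the same doc at pos+1, pos+2, ...
            if all(doc_id in later and pos + i + 1 in later[doc_id]
                   for i, later in enumerate(rest)):
                matches.add(doc_id)
    return matches

print(phrase_docs(postings))  # {1}: only doc 1 contains the two words adjacently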
def testAdvanceTokenShouldReturnThreeWordWithOneFinalEnd(self):
    global tokenizer
    tokenizer = Tokenizer('a b c ')
    gen = tokenizer.advanceToken()
    a = next(gen)
    self.assertEqual(a, 'a')
    b = next(gen)
    self.assertEqual(b, 'b')
    c = next(gen)
    self.assertEqual(c, 'c')
    d = next(gen)
    self.assertIsNone(d)
def tag(test_items=[], WS="", tokenize=False, tokenizer_context=0, left_context=1,
        right_context=1, gazetteer=None, min_tok_freq=25, min_lem_freq=25, mode=""):
    """Tags a list of (potentially annotated) test tokens."""
    tokenized_tokens = []
    token_acc, token_f1 = None, None
    if tokenize:
        # load and apply a tokenizer:
        tokenizer = Tokenizer(context=tokenizer_context, WS=WS)
        if mode == "tag":
            tokenized_tokens = tokenizer.tokenize(test_items=test_items, gazetteer=gazetteer)
        elif mode in ("test", "crossval"):
            items = []
            for item in test_items:
                if item == "<utt>":
                    items.append(item)
                else:
                    items.append(item[0].lower())
            token_acc, token_f1 = tokenizer.eval_tokenizer(test_items=items, gazetteer=gazetteer)
            # return the original tokens since we only tokenize for evaluation purposes:
            tokenized_tokens = items
    else:
        # assume the input has been properly tokenized already:
        if mode == "tag":
            tokenized_tokens = test_items
        elif mode == "test":
            tokenized_tokens = tuple(item[0].lower() for item in test_items)
    sequential_tagger = MaxentTagger(WS=WS, left_context=left_context, right_context=right_context,
                                     min_tok_freq=min_tok_freq, min_lem_freq=min_lem_freq)
    sequential_tagger.load_models()
    tagged_items = sequential_tagger.tag(tokenized=tokenized_tokens, gazetteer=gazetteer)
    if mode in ("crossval", "test"):
        results = sequential_tagger.evaluate_tags_and_lemmas(gold_items=test_items,
                                                             silver_items=tagged_items)
        if tokenize:
            results.extend((token_acc, token_f1))
        return results
    else:
        return tagged_items
def determine_usage(resultObj, categorizerObj):
    def deep_find(val, iterable):
        if (not isinstance(iterable, list)) and (not isinstance(iterable, tuple)):
            if val == iterable:
                return True
        else:
            for element in iterable:
                if deep_find(val, element):
                    return True
        return False

    assignments = categorizerObj.tokenCategories.assignments
    other = categorizerObj.tokenCategories.other
    usage = list()
    out = list()
    if resultObj.alias:
        searchPattern = resultObj.alias
    else:
        searchPattern = resultObj.name.split(".")[-1]
    for assignment in assignments:
        aux = list()
        tokValues = [val[1] for val in assignment]
        index = find_all_indices(searchPattern, tokValues)
        if index:
            aux.append(tokenizer.untokenize(assignment))
            aux.append(assignment[0][2][0])
            usage.append(aux)
    for element in other:
        aux = list()
        tokValues = [val[1] for val in element]
        index = find_all_indices(searchPattern, tokValues)
        if index:
            aux.append(tokenizer.untokenize(element))
            aux.append(element[0][2][0])
            usage.append(aux)
    for case in usage:
        obj = Usage()
        obj.name = resultObj.name
        obj.usage = case[0]
        obj.lineNumber = case[1]
        obj.parent = categorizerObj.file
        out.append(obj)
    return out
def TF(documents, outputFormat='sparseMatrix', progressBar=True):
    '''
    Parameters:
        documents: string or list
            a folder name containing the corpus, or a list of document file names
        outputFormat: string
            'sparseMatrix' or 'pandas_DataFrame'
    Returns:
        <class dict> { <class str> documentName: <class dict> { <class Token>: <class int> frequency } }
    '''
    if type(documents) == str:
        if not path.isdir(documents):
            raise ValueError("you must give me a folder name or list of files")
        documents = [path.join(documents, i) for i in listdir(documents)
                     if path.isfile(path.join(documents, i))]
        if len(documents) == 0:
            raise ValueError("this folder is empty or has no file!!!")
    if type(documents) == list:
        if not all(path.isfile(i) for i in documents):
            raise ValueError("every entry in the list must be an existing file")
    else:
        raise ValueError("you must give me a folder name or list of files")
    if outputFormat not in ['sparseMatrix', 'pandas_DataFrame']:
        raise ValueError("The outputFormat must be 'sparseMatrix' or 'pandas_DataFrame'")
    if not progressBar:
        tqdm = lambda x: x
    else:
        from tqdm import tqdm
    TF_mat = {}
    for fileName in tqdm(documents):
        if fileName not in TF_mat:
            fileObj = open(fileName)
            TF_mat.update({fileName: tok.wordCounter(tok.wordTokenizer(fileObj.read())[0])})
            fileObj.close()
    if outputFormat == 'sparseMatrix':
        return TF_mat
    elif outputFormat == "pandas_DataFrame":
        return pd.DataFrame(TF_mat, columns=sorted(documents)).fillna(0).sort_index()
def __init__(self, inp):
    self.toker = Tokenizer(inp)
    self.saved = []  # pushed-back tokens
    self.first = True
    self.currentFile = ""
    self.currentSymTable = {}
    self.localCounter = 0
    self.className = ""
    self.classTable = {}
    self.ifCounter = 0
    self.elseCounter = 0
    self.whileCounter = 0
    self.fieldCounter = 0
    self.staticCounter = 0
def Driver():
    # Get the Acorn script path from the command line
    try:
        argument = sys.argv[1]
    except IndexError:
        sys.exit(CYAN + "Acorn: " + RED + "Expected acorn script." + WHITE)
    # Check that the input file type is correct
    if not argument.lower().endswith('.acorn'):
        sys.exit("Acorn: Expected acorn file.")
    dataFile = open(argument, "r")
    # raw holds the whole Acorn script as a single string
    raw = dataFile.read()
    dataFile.close()
    acorn = Lexer.LexerClass()
    acorn.lexer(raw)
    # Send tokens to the tokenizer and get an abstract syntax tree back
    acornStackFrame = acorn.stackFrame
    Mem = Memory.Memory()
    for i in range(0, len(acornStackFrame)):
        subStack = acornStackFrame[i]
        ast = Tokenizer.Tokenizer(subStack, Mem)
        astp = ast.grammar()
        Parser.step(astp, Mem)
def main(argv):
    # Open the input file
    f = open(argv[0], "r")
    # Init the tokenizer used to track our globals
    t.tokenizer = Tokenizer.Tokenizer(f)
    # Init top-level object
    program = Prog.Prog()
    # Form parse tree, which recursively calls parse() on each nonterminal
    program.parse()
    # Recursively print the parse tree
    print("\nprint() output: ")
    program.print()
    # Recursively execute the parse tree
    print("\nexec() output: ")
    program.exec()
    # Close the file and exit
    f.close()
    exit(0)
class TweetsHandler():
    tokenizer = Tokenizer()

    def tweetsToWords(self, filename):
        tweetTokens = []
        with open(filename, 'r') as f:
            for line in f:
                tweet = json.loads(line)
                if 'text' not in tweet.keys():
                    continue
                # lineTokens = self.tokenizer.tokenize(tweet['text'])
                lineTokens = nltk.word_tokenize(tweet['text'])
                print lineTokens
                tagged = nltk.pos_tag(lineTokens)
                # keep only adjectives (JJ, JJR, JJS)
                tokenJJ = [term for term in tagged
                           if term[1] == 'JJ' or term[1] == 'JJR' or term[1] == 'JJS']
                tweetTokens += tokenJJ
        return tweetTokens

    def countAssociation(self, tweetTokens):
        cnt = Counter()
        cnt.update(tweetTokens)
        return cnt.most_common(20)
def gather(self, tokens):
    """Gathers words within program for execution"""
    defined_words = {}
    current_word = []
    found_word = 0
    import_called = 0
    for token in tokens:
        if import_called:
            import_called = 0
            token = "..\\" + token + ".pyfth"
            imported_program = open(token)
            defined_words.update(self.gather(Tokenizer.tokenize(imported_program)))
            imported_program.close()
        elif found_word:
            if token == ";":
                found_word = 0
                current_word.append("return")
                defined_words[current_word_name] = current_word
                current_word = []
            else:
                current_word.append(token)
        elif token == "import":
            import_called = 1
        else:
            # start of a new word definition
            found_word = 1
            current_word_name = token
    return defined_words
def analyzeT(jackFile):
    tokenizedXmlFilename = os.path.splitext(jackFile)[0] + "T.xml.cmp"
    outputFile = open(tokenizedXmlFilename, 'w')
    outputFile.write("<tokens>\r\n")
    t = Tokenizer.Tokenizer(jackFile)
    t.advance()
    while t.hasMoreTokens():
        tokenType = t.tokenType()
        if tokenType == Tokenizer.TokenType.KEYWORD:
            outputFile.write("<keyword> " + t.keyword() + " </keyword>")
        elif tokenType == Tokenizer.TokenType.SYMBOL:
            outputFile.write("<symbol> " + charXMLify(t.symbol()) + " </symbol>")
        elif tokenType == Tokenizer.TokenType.IDENTIFIER:
            outputFile.write("<identifier> " + t.identifier() + " </identifier>")
        elif tokenType == Tokenizer.TokenType.INT_CONST:
            outputFile.write("<integerConstant> " + t.intVal() + " </integerConstant>")
        elif tokenType == Tokenizer.TokenType.STRING_CONST:
            outputFile.write("<stringConstant> " + t.stringVal() + " </stringConstant>")
        else:
            pdb.set_trace()
            print("Invalid")
        outputFile.write("\r\n")
        t.advance()
    outputFile.write("</tokens>")
    outputFile.write("\r\n")
    outputFile.close()
def analyze(self):
    for jack_file in self.jackFiles:
        tokenizer = T.Tokenizer(jack_file)
        xml_file = jack_file.replace('.jack', '.xml')
        comp_engine = CE.Parsing(tokenizer, xml_file)
        comp_engine.outFile.close()
        tokenizer.close()
def main():
    program = sys.argv[1]
    # Sample programs the tokenizer has been run against:
    files = ["validAllOneLine.txt", "validAllSimpleExpressions.txt",
             "validBooleanComplex.txt", "validComplexExpressions.txt",
             "validMinimalWhitespace.txt", "validTypicalIfElse.txt",
             "validTypicalLoop.txt"]
    # Create the program to be tokenized, based on the rules of the language
    tk = Tokenizer.Tokenizer(program)
    while tk.lcurrentToken() != 'EOF':  # search through all the tokens
        tk.lprocessTokens()  # print out code
        tk.lnextToken()
    tk.lprocessTokens()
    tk.lcloseFile()
    tk.tokens.append(tok.Token('end', 5))
    parser = nodes.ProgramNode()
    parser.parseProgram(tk)
    parser.printProgram()
    parser.execProgram()
def create_spacy_hu():
    nlp = spacy.blank('hu')
    nlp.tokenizer = Tokenizer.HuTokenizer(nlp.vocab)
    morph_analyzer = LemmatizerMorphAnalyzer.HuLemmaMorph(nlp)
    nlp.add_pipe(morph_analyzer)
    constituency_parser = ConstitutencyParser.ConstitutencyParser(nlp)
    nlp.add_pipe(constituency_parser)
    dependency_parser = DependencyParser.DependencyParser(nlp)
    nlp.add_pipe(dependency_parser)
    np_chunker = NPChunker.NPChunker(nlp)
    nlp.add_pipe(np_chunker)
    POS_analyzer = POSTagger.HuPOSTagger(nlp)
    nlp.add_pipe(POS_analyzer)
    preverb_identifier = PreverbIdentifier.PreverbIdentifier(nlp)
    nlp.add_pipe(preverb_identifier)
    hu_word_to_vec = HuWordToVec.HUWordToVec()
    nlp.add_pipe(hu_word_to_vec)
    return nlp
def scan_file(file, stdLib):
    # make sure we are working with a pathlib.Path object, otherwise raise an error
    try:
        file = pathlib.Path(file)
    except TypeError:
        raise TypeError(
            "input <{error_cause}> for 'file' does not match {type_name}".format(
                type_name=pathlib.Path,
                error_cause=str(file)))
    if file in stdLib:
        return False
    tok = tokenizer.TokenCategorizer(file)
    modules, callables = imported_modules(tok)
    lokalClasses, lokalFunctions = lokal_callables(tok)
    out = {
        "modules": modules,
        "callables": callables,
        "lokalClasses": lokalClasses,
        "lokalFunctions": lokalFunctions,
    }
    return out
def compile_file(jack_file_name, xml_file_name):
    jack_file = open(jack_file_name, 'r')
    tokenizer = Tokenizer.Tokenizer(jack_file)
    xml_file = open(xml_file_name, 'w')
    compilation_engine = CompilationEngine(tokenizer, xml_file)
    compilation_engine.compile_class()
def run(code):
    Parser.tokens = Tokenizer.Tokenizer(code)
    Parser.tokens.select_next()
    r = Parser.parse_program()
    if Parser.tokens.actual.value == 'EOF':
        return r
    else:
        raise Exception(f'Expected EOF instead got {Parser.tokens.actual.value}')
def compile_file(jack_file_name, vm_file_name):
    jack_file = open(jack_file_name, 'r')
    tokenizer = Tokenizer.Tokenizer(jack_file)
    symbol_table = SymbolTable.SymbolTable()
    vm_file = open(vm_file_name, 'w')
    vm_writer = VMWriter.VMWriter(vm_file)
    compilation_engine = CompilationEngine(tokenizer, vm_writer, symbol_table)
    compilation_engine.compile_class()
def labelQueryTerm(self, tweetsList):
    tokenizer = Tokenizer.Tokenizer()
    for tweet in tweetsList:
        termsInTweets = tokenizer.tokenize(tweet[2], 'simple')
        for term in termsInTweets:
            if term in list(self.queryDict.keys()):
                self.queryDict[term].append(tweet[1])
    self._scoreQueryTerms()
def run(code):
    Parser.tokens = Tokenizer(code)
    result = Parser.program()
    if Parser.tokens.actual.type == 'EOF':
        return result
    else:
        raise SyntaxError(
            "Invalid Chain Exception (tip: do not put spaces between numbers)")
def Number():
    token = Tokenizer.PeakToken()
    negate = token.IsSymbol(['-'])
    if token.IsSymbol(['+', '-']):
        # ignore unary +; remember negate for unary -
        Tokenizer.Consume()
        token = Tokenizer.PeakToken()
    if token.IsNumber():
        Tokenizer.Consume()
        text = ('-' if negate else '') + token.text
        val = float(text) if '.' in text else int(text)
        return ConstantFunction(val)
    else:
        raise ValueError("Invalid Number")
def index(self):
    for doi, title, abstract in self.col:
        if self.tokenizerType == '0':  # simple
            tokenizer = Tokenizer.SimpleTokenizer(title, abstract)
        else:  # better
            tokenizer = Tokenizer.BetterTokenizer(title, abstract)
        terms = tokenizer.getTerms()
        for term in terms:
            if term in self.term_map.keys():
                if doi in self.term_map[term].keys():
                    self.term_map[term][doi] += 1
                else:
                    self.term_map[term][doi] = 1
            else:
                term_freq_map = {}  # key: docId, value: term_freq
                term_freq_map[doi] = 1
                self.term_map[term] = term_freq_map
def run(code):
    Parser.tokens = tkr.Tokenizer(code)
    Parser.tokens.selectNext()
    node = Parser.parseBlock()
    current = Parser.tokens.actual
    if current.type == "EOF":
        return node
    else:
        raise Exception("Tokenizer did not reach EOF")
def __init__(self, input, output):
    """
    :param input: input file name
    :param output: output file name where the text will be written
    """
    self.tokenizer = Tokenizer.Tokenizer(input)
    self.parsedrule = []
    self.output = open(output, "w")
    self.indent = ""
def TermTail(leftParseTree):
    token = Tokenizer.PeakToken()
    if token.IsSymbol(['+', '-']):
        Tokenizer.Consume()
        parseTree = BinaryFunction(token.text)
        parseTree.lchild(leftParseTree)
        parseTree.rchild(Term())
        return TermTail(parseTree)
    elif token.IsEOF() or token.IsSymbol(')'):
        return leftParseTree
    else:
        raise ValueError("Term Tail")
def getBinTestText(fileName):
    fileText = FileIO.readFile(fileName)
    tokens = Tokenizer.tokenizer(fileText)
    wordOccurence = {}
    resultTokens = ""
    for token in tokens.split("\n"):
        if token not in wordOccurence:
            wordOccurence[token] = 1
            resultTokens = resultTokens + token + "\n"
    return resultTokens
def error_printer(error_suggestion, sentence_list):
    for item in error_suggestion:
        args = Tokenizer.mapping(item[0], sentence_list)
        # Report the misspelled word and its (sentence, word) position
        print 'Word', args[1] + 1, 'of sentence', args[0] + 1, '(', item[1], ') is misspelled;'
        print 'Possible correct words are:',
        num = 1
        for word in item[2]:
            print num, ')', word,
            num += 1
        print '\n'
def Term():
    token = Tokenizer.PeakToken()
    if (token.IsNumber() or
            token.IsFunc() or
            token.IsVariable() or
            token.IsSymbol(['+', '-', '('])):
        parseTree = Factor()
        return FactorTail(parseTree)
    else:
        raise ValueError("Term")
def analyze(jackFile):
    outputFilename = os.path.splitext(jackFile)[0] + ".xml.cmp"
    t = Tokenizer.Tokenizer(jackFile)
    ce = CompilationEngine.CompilationEngine(t, outputFilename)
    t.advance()
    if t.keyword() != "class":
        print("jack file does not have a class!")
        exit(1)
    ce.CompileClass()
def Expression():
    token = Tokenizer.PeakToken()
    if (token.IsNumber() or
            token.IsFunc() or
            token.IsVariable() or
            token.IsSymbol(['+', '-', '('])):
        parseTree = Term()
        return TermTail(parseTree)
    else:
        raise ValueError("Expression")
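# The Number/Term/TermTail/Expression functions above form a recursive-descent parser over a
# shared Tokenizer module. Below is a rough, self-contained sketch of the same
# Expression -> Term TermTail pattern; it is not the original module: it handles only integers,
# parentheses and '+'/'-', and evaluates eagerly instead of building ConstantFunction /
# BinaryFunction parse-tree nodes. All names here are illustrative only.
import re

class MiniExpressionParser:
    def __init__(self, text):
        self.tokens = re.findall(r'\d+|[+\-()]', text) + ['<eof>']
        self.pos = 0

    def peek(self):
        return self.tokens[self.pos]

    def consume(self):
        self.pos += 1

    def expression(self):
        # Expression := Term TermTail
        return self.term_tail(self.term())

    def term(self):
        # Term := number | '(' Expression ')'
        token = self.peek()
        if token == '(':
            self.consume()
            value = self.expression()
            if self.peek() != ')':
                raise ValueError("expected ')'")
            self.consume()
            return value
        if not token.isdigit():
            raise ValueError("Invalid Number")
        self.consume()
        return int(token)

    def term_tail(self, left):
        # TermTail := ('+' | '-') Term TermTail | <empty>
        token = self.peek()
        if token in ('+', '-'):
            self.consume()
            right = self.term()
            return self.term_tail(left + right if token == '+' else left - right)
        return left

print(MiniExpressionParser('1 + 2 - (3 + 4)').expression())  # prints -4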
def __init__(self, input, output):
    """
    :param input: input file name
    :param output: output file name where the text will be written
    """
    self.tokenizer = Tokenizer.Tokenizer(input)
    self.writer = VMWriter.VMWriter(output)
    self.symbolTable = SymbolTable.SymbolTable()
    self.classname = ""
    self.name = ""
def get_queries(self):
    queries, queries_1 = self.read_queries()
    tokenize = Tokenizer.Tokenize(" ")
    self.modified_queries = tokenize.process_data(queries)
    self.modified_queries_1 = queries_1
    f = open("queries for lucene.txt", 'w')
    i = 0
    for q in self.modified_queries:
        q = q.strip("\n")
        q = q.replace("\n", ' ')
        f.write(str(q))
        f.write("\n")
def idf():
    IDF = {}
    numDocs = 0
    # Get all the products from the database
    dat = data.getDatabase()
    for table in dat:
        # Go through each product in each table
        for product in dat[table]:
            item = product[3]
            # Get their reviews
            revs = GetReviews.readReview(item)["Reviews"]
            try:
                for r in revs:
                    # Tokenize and stem the review sections
                    con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons']))
                    pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros']))
                    comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments']))
                    # Count unique tokens in the document
                    for token in list(set(con) | set(pro) | set(comment)):
                        if token in IDF:
                            IDF[token] = IDF[token] + 1
                        else:
                            IDF[token] = 1
                    numDocs = numDocs + 1  # increment the number of documents
            except:
                pass
    # Calculate and return the idf score
    for term in IDF:
        IDF[term] = math.log(float(numDocs) / float(IDF[term]))
    pickle.dump(dict(IDF), open('../data/idf.pickle', 'wb'))  # pickling saves SO much time
    return IDF
def tf_idf():
    TF_IDF = {}
    # Load the inverse document frequencies
    IDF = dict(pickle.load(open('../data/idf.pickle', 'rb')))
    dat = data.getDatabase()  # get all of the products
    for table in dat:
        print '.'  # progress marker
        for product in dat[table]:  # for each product in each table
            item = product[3]  # item number is [3] in the tuple
            revs = GetReviews.readReview(item)["Reviews"]  # read the actual reviews
            product_review = []
            try:
                for r in revs:  # for each review
                    tf = {}
                    # Tokenize and stem the entire review
                    con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons']))
                    pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros']))
                    comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments']))
                    # Combine pros, cons, and comments sections and count term frequencies
                    for token in list(con + pro + comment):
                        if token in tf:
                            tf[token] = tf[token] + 1
                        else:
                            tf[token] = 1
                    for t in tf:
                        tf[t] = float(1 + math.log(tf[t])) * IDF[t]  # calculate tf-idf score
                    product_review.append(tf)  # add to list of reviews
            except:
                pass
            TF_IDF[item] = product_review  # add list of reviews to the dictionary
    return TF_IDF
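# For reference, the weighting used in idf() and tf_idf() above is (1 + log tf) * log(N / df).
# A tiny self-contained example of that formula on made-up, already-tokenized "reviews"
# (the data below is invented purely for illustration):
import math
from collections import Counter

docs = [
    ['good', 'battery', 'good', 'screen'],
    ['bad', 'battery'],
    ['good', 'price'],
]

# document frequency and idf = log(N / df), as in idf()
N = len(docs)
df = Counter(token for doc in docs for token in set(doc))
idf_scores = {term: math.log(float(N) / df[term]) for term in df}

# tf-idf = (1 + log tf) * idf, as in tf_idf()
for doc in docs:
    tf = Counter(doc)
    print({t: (1 + math.log(tf[t])) * idf_scores[t] for t in tf})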
def process(self, line, is_inside_until=False):
    statement = Tokenizer(line.strip(' ').strip('\n'), ' ')
    while statement.has_next():
        token = statement.next()
        if token in ['.', '.s', 'CR']:
            if token == '.':
                sys.stdout.write(self.stack.pop())
                self.usedWrite = True
            elif token == 'CR':
                print
            else:
                print self.stack
        elif token in self.memory:
            self.process(self.memory[token])
        elif token in ['DUP', '+', '-', '*', '/', 'SWAP', 'DROP', '<', '>', '<=',
                       '>=', '=', 'EMIT', 'MOD', 'KEY', 'DEPTH', 'ROLL', 'PICK']:
            import Operators
            Operators.Op[token](self.stack)
        elif token == ':':
            self.function_definition(statement)
        elif token == 'IF':
            self.handle_if(statement)
        elif token == 'DO':
            self.do_loop(statement)
        elif token == 'BEGIN':
            self.until_loop(statement)
        elif token == 'LEAVE':
            if is_inside_until and self.stack.pop() != '0':
                return False
        elif token == 'EXPECT':
            self.expect()
        elif token.startswith('."'):
            self.print_string(token, statement)
        elif token.isdigit():
            self.stack.push(token)
        elif token.strip() == '':
            pass
        else:
            raise NameError('Invalid Input: ' + token)
    return True
def _testParsing(self, st, *L):
    '''Tests that parsing a particular string all at once will return the given
    sequence of tokens. The Wait token at the end is implicit.'''
    # TODO: Test parsing it character-by-character too!
    L = self._collapseText(list(L)) + [WaitToken()]
    p = Tokenizer()
    p.queueData(st)
    resultToks = []
    while 1:
        tok = p.getNextToken()
        resultToks.append(tok)
        if isinstance(tok, WaitToken):
            break
    resultToks = self._collapseText(resultToks)
    self.assertEqual(len(L), len(resultToks),
                     "Error parsing '%s': Expected token list %s, got %s" %
                     (repr(st), repr(L), repr(resultToks)))
    for tok, result in zip(L, resultToks):
        self.assertEqual(tok, result,
                         "Error parsing '%s': Expected %s but got %s" %
                         (repr(st), repr(L), repr(resultToks)))
    self.assertEqual(p.getNextToken(), WaitToken(),
                     'When waiting, repeated calls to getNextToken() should '
                     'continue to return WaitForMoreDataTokens')
def testPeepAheadOnce(self):
    global tokenizer
    tokenizer = Tokenizer('a b')
    a = tokenizer.peepahead()
    self.assertEqual(valueof(a), 'a')
    a = tokenizer.peepahead()
    self.assertEqual(valueof(a), 'a')
    a = tokenizer.advance()
    self.assertEqual(valueof(a), 'a')
    b = tokenizer.peepahead()
    self.assertEqual(valueof(b), 'b')
    b = tokenizer.peepahead()
    self.assertEqual(valueof(b), 'b')
    b = tokenizer.advance()
    self.assertEqual(valueof(b), 'b')
def buildCorpus(dirPath, corpusName):
    print('creating corpus!!')
    tagClass = fetchLabel(dirPath)
    classCnt = FileIO.countFiles(dirPath)
    dictionary = {}
    FileIO.wrtieToFile("corpus\classCount.txt", 'a', (tagClass + '\t' + str(classCnt) + '\n'))
    for dir_entry in os.listdir(dirPath):
        text = FileIO.readFile(os.path.join(dirPath, dir_entry))
        text = Tokenizer.tokenizer(text)
        for token in text.split('\n'):
            if token not in dictionary:
                dictionary[token] = {}
            if tagClass not in dictionary[token]:
                dictionary[token][tagClass] = 0
            dictionary[token][tagClass] = dictionary[token][tagClass] + 1
    for key, value in dictionary.items():
        FileIO.wrtieToFile(corpusName, 'a', (key + '\t' + str(value[tagClass]) + '\t' + tagClass + '\n'))
    print('Corpus creation : Done..')
def parse(str):
    array = []
    CKeyword.defineTable = {}
    CKeyword.stringTable = {}
    tokenizer = Tokenizer(str)
    token = tokenizer.peepahead()
    CExpression.configure_tokenizer_Expression(tokenizer)
    CKeyword.configure_tokenizer_Keyword(tokenizer)
    # skip leading empty statements
    while token.first == ';':
        tokenizer.advance()
        token = tokenizer.peepahead()
    # parse statements until the end marker
    while token.first != '(end)':
        if hasattr(token, 'std'):
            temp = CKeyword.parseStatement()
        else:
            temp = CExpression.expression(0)
        tokenizer.advance(';')
        token = tokenizer.peepahead()
        array.append(temp)
    return array
def BuildTrainingSet():
    # initialize variables
    database = data.getDatabase()
    IDF = dict(pickle.load(open('../data/idf.pickle', 'rb')))
    numReviews = 0
    posReview = {}
    numPos = 0
    negReview = {}
    numNeg = 0
    # For each product in each subcategory
    for table in database:
        for product in database[table]:
            item = product[3]
            revs = GetReviews.readReview(item)["Reviews"]  # get the reviews
            try:
                for r in revs:
                    if (numReviews % 37) == 0:  # analyze every 37th review
                        tf = {}
                        # Show the raw review so it can be judged by hand
                        con = r['Cons']
                        pro = r['Pros']
                        comment = r['Comments']
                        print pro, ' :: ', con, ' :: ', comment
                        # Tokenize and stem before adding to the training set
                        con = Tokenizer.stemming(Tokenizer.tokenize(r['Cons']))
                        pro = Tokenizer.stemming(Tokenizer.tokenize(r['Pros']))
                        comment = Tokenizer.stemming(Tokenizer.tokenize(r['Comments']))
                        # Treat all parts as one review
                        for token in list(con + pro + comment):
                            if token in tf:
                                tf[token] = tf[token] + 1
                            else:
                                tf[token] = 1
                        for t in tf:
                            # tf-idf formula
                            tf[t] = float(1 + math.log(tf[t])) * IDF[t]
                        # hopefully you have had time to read; now decide
                        Q = int(raw_input('\n1 for good.... 0 for bad.....\n').rstrip('\n'))
                        if Q == 1:    # good
                            posReview[numPos] = tf  # add to training set
                            numPos = numPos + 1
                        elif Q == 0:  # bad
                            negReview[numNeg] = tf  # add to training set
                            numNeg = numNeg + 1
                        else:
                            print 'FAIL!!!!!!'
                    numReviews = numReviews + 1  # increase number of reviews
            except:
                pass
    saveSet(posReview, negReview)  # save the training sets
    return (numPos, numNeg)
def buildBinarizedCorpus(dirPath, corpusName):
    print('creating binarized corpus!!')
    tagClass = fetchLabel(dirPath)
    classCnt = FileIO.countFiles(dirPath)
    FileIO.wrtieToFile("corpus\classCount.txt", 'a', (tagClass + '\t' + str(classCnt) + '\n'))
    corpusDict = {}
    for dir_entry in os.listdir(dirPath):
        fileTokens = {}
        text = FileIO.readFile(os.path.join(dirPath, dir_entry))
        text = Tokenizer.tokenizer(text)
        for token in text.split('\n'):
            # count each token at most once per file (binarized counts)
            if token not in fileTokens:
                fileTokens[token] = 1
                if token not in corpusDict:
                    corpusDict[token] = {}
                    corpusDict[token][tagClass] = 1
                else:
                    corpusDict[token][tagClass] = corpusDict[token][tagClass] + 1
    for key, value in corpusDict.items():
        FileIO.wrtieToFile(corpusName, 'a', (key + '\t' + str(value[tagClass]) + '\t' + tagClass + '\n'))
    print('binarized corpus creation done!!')
def testAdvanceShouldReturnEndWhenGivenNone(self):
    global tokenizer
    tokenizer = Tokenizer(None)
    end = tokenizer.advance()
    self.assertEqual(valueof(end), '(end)')
remove_999 = True
translate = True
noise_variance = 0.
n_classes = 'multiclass'
datapath = 'Datasets/kaggle_higgs/'

# Import the data:
print(" ")
# Load the dataset:
print("Loading dataset...")
train_s, train_s_2, valid_s, test_s = tokenizer.extract_data(split=True,
                                                             normalize=normalize,
                                                             remove_999=remove_999,
                                                             translate=translate,
                                                             noise_variance=0.,
                                                             n_classes=n_classes,
                                                             datapath=datapath,
                                                             train_size=180000,
                                                             train_size2=35000,
                                                             valid_size=35000)

train_set_x = []
train_set_x_2 = []
valid_set_x = []
test_set_x = []

for i in range(len(train_s[1])):
    train_set_x.append(theano.shared(np.asarray(train_s[1][i], dtype=theano.config.floatX),
                                     borrow=True))
    train_set_x_2.append(theano.shared(np.asarray(train_s_2[1][i],
path = ''
if len(sys.argv) == 2:
    path = sys.argv[1]
else:
    print "No path input"
    sys.exit()

bc = BookCleaner(path)
reload(sys)
sys.setdefaultencoding('utf8')

# AGGREGATE RAW TEXT
text_file = open("big_training_raw.txt", "w+")
print "BookCleaner is now aggregating all texts"
str = bc.getAllFilesCleaned()
text_file.write(str)
text_file.close()

# TOKENIZE
print "Tokenizing texts"
text_file = open("tokenized_train.txt", "w+")
file = open("big_training_raw.txt")
tk = Tokenizer()
for line in file:
    line = unicode(line, errors='replace')
    str = tk.tokenizeAdvanced(line)
    text_file.write(str)
text_file.close()
def main(): a = "這邊介紹一個叫做get的method,只適用於dictionary" tk.tokenize(a)
import Modded_PyfthTools as PyfthTools
import Tokenizer
import sys

Pyfth_instance = PyfthTools.Pyfth()
pyfth_file_name = sys.argv[1]
if pyfth_file_name[-6:] != ".pyfth":
    pyfth_file_name += ".pyfth"
pyfth_file = open(pyfth_file_name)
pyfth_file_tokens = Tokenizer.tokenize(pyfth_file)
Pyfth_instance.input_tokens(pyfth_file_tokens)
Pyfth_instance.run()
print "----------------------------- DATASET ----------------------------" data = 'higgs' # 'mnist.pkl.gz' print "Dataset studied : {0}".format(data) normalize= True remove_999= True translate = True noise_variance = 0. n_classes='binary', # Import the data: print(" ") # Load the dataset: print("Loading dataset...") train_s, valid_s, test_s = tokenizer.load_higgs(split= True, normalize= normalize, remove_999= remove_999, noise_variance = noise_variance, n_classes= n_classes, translate= True) train_set_x = train_s[1] train_set_y = train_s[2] valid_set_x = valid_s[1] valid_set_y = valid_s[2] test_set_x = test_s[1] # Only one DNN is requiered: print "--------------------------- PARAMETERS ---------------------------" print "---- LEARNING: " epochs = 2 print " Number of epochs : {0}".format(epochs)
def getPerWordPerplexity(self, line):
    t = Tokenizer()
    line = t.tokenize(line)
    perplexity = self.calculatePerWordPerplexity(line)
    return -perplexity
import sChecker.spell_checker as spell
import readline
import Tokenizer as tkr


class color:
    END = '\033[0m'
    COW = '\033[0;33m'
    MIW = '\033[0;91m'


while True:
    try:
        _input = input("You >> ")
        c_list = []
        # _word = input("Enter the word >> ")
        _list = tkr.tokenize(_input)
        for _word in _list:
            # word = spell.words(_word)
            c_word = spell.correct(_word)
            c_list.append(c_word)
        for i in range(len(c_list) - 1):
            print('\t' + color.MIW + _list[i] + color.END + ' -> ' + color.COW + c_list[i] + color.END)
        print('')
        # if (_word == c_word):
        #     print("Correct Word : " + color.COW + c_word + color.END)
        # else:
import Tokenizer
from math import log

i = input('Enter within quotes, m for movie reviews corpus, '
          'r for reuters corpus (default is reuters): ')
corpus = ''
if i == 'm' or i == 'M':
    corpus = 'mr'
else:
    corpus = 'reuters'

start_time = time.time()
list_fileids = Tokenizer.get_list_fileids(corpus)

# 1) Create a dictionary with word as key and, as value, the list of documents
#    where it occurs, in sorted order
word_doc_dict = {}

# 2) Loop through the dataset to get the entire text from each file
for (file_index, file_name) in enumerate(list_fileids):
    list_words = Tokenizer.get_list_tokens_nltk(corpus, file_name)
    # 3) Parse the string to get individual words
    # Possible improvement: stemming