import json
import nltk
from collections import Counter


class TweetsHandler:
    # the project's own Tokenizer; unused below since nltk.word_tokenize is called instead
    tokenizer = Tokenizer()

    def tweetsToWords(self, filename):
        # Collect the adjectives found in the tweets of the given JSON-lines file
        tweetTokens = []
        with open(filename, 'r') as f:
            for line in f:
                tweet = json.loads(line)
                if 'text' not in tweet:
                    continue
                # lineTokens = self.tokenizer.tokenize(tweet['text'])
                lineTokens = nltk.word_tokenize(tweet['text'])
                print(lineTokens)
                tagged = nltk.pos_tag(lineTokens)
                # keep only adjectives (JJ, JJR, JJS)
                tokenJJ = [term for term in tagged if term[1] in ('JJ', 'JJR', 'JJS')]
                tweetTokens += tokenJJ
        return tweetTokens

    def countAssociation(self, tweetTokens):
        # Return the 20 most frequent tokens
        cnt = Counter()
        cnt.update(tweetTokens)
        return cnt.most_common(20)
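# A minimal usage sketch for TweetsHandler; tweets.jsonl is a hypothetical file with one
# JSON tweet per line, and the nltk 'punkt' and 'averaged_perceptron_tagger' data are
# assumed to be downloaded already.
handler = TweetsHandler()
adjectives = handler.tweetsToWords('tweets.jsonl')
print(handler.countAssociation(adjectives))  # 20 most frequent adjectives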
def compile_file(jack_file_name, xml_file_name):
    # print("Starting compilation.\nSource file: " + jack_file_name +
    #       "\nDestination file: " + xml_file_name + "\n")
    jack_file = open(jack_file_name, 'r')
    tokenizer = Tokenizer.Tokenizer(jack_file)
    xml_file = open(xml_file_name, 'w')
    compilation_engine = CompilationEngine(tokenizer, xml_file)
    compilation_engine.compile_class()
def analyze(self):
    for jack_file in self.jackFiles:
        tokenizer = T.Tokenizer(jack_file)
        xml_file = jack_file.replace('.jack', '.xml')
        comp_engine = CE.Parsing(tokenizer, xml_file)
        comp_engine.outFile.close()
        tokenizer.close()
def main(argv):
    # Open the source file
    f = open(argv[0], "r")
    # Init the module-level tokenizer used to track our globals
    t.tokenizer = Tokenizer.Tokenizer(f)
    # Init top-level object
    program = Prog.Prog()
    # Form parse tree, which recursively calls parse() on each nonterminal
    program.parse()
    # Recursively print parse tree
    print("\nprint() output: ")
    program.print()
    # Recursively execute parse tree
    print("\nexec() output: ")
    program.exec()
    # Close file and exit
    f.close()
    exit(0)
def Driver():
    # Get the Acorn script path from the command line
    try:
        argument = sys.argv[1]
    except IndexError:
        sys.exit(CYAN + "Acorn: " + RED + "Expected acorn script." + WHITE)
    # Check that the input file has the expected extension
    if not argument.lower().endswith('.acorn'):
        sys.exit("Acorn: Expected acorn file.")
    dataFile = open(argument, "r")
    # raw holds the whole Acorn script as a single string
    raw = dataFile.read()
    dataFile.close()
    acorn = Lexer.LexerClass()
    acorn.lexer(raw)
    # Send each token sub-stack to the tokenizer and get an abstract syntax tree back
    acornStackFrame = acorn.stackFrame
    Mem = Memory.Memory()
    # s = time.time()
    for subStack in acornStackFrame:
        # print(subStack)
        ast = Tokenizer.Tokenizer(subStack, Mem)
        # print(Mem.heap)
        astp = ast.grammar()
        # print(astp)
        Parser.step(astp, Mem)
def addDocsToTrie(docs, trie):
    # Takes a list of documents and adds them to the given trie
    tokenizer = Tokenizer()
    for doc in docs:
        tokens = tokenizer.stemDocument(doc[2])
        # Prepend the (reversed) title words, separated from the body by a "+" padding marker
        doc[1].reverse()
        tokens.insert(0, "+")
        for word in doc[1]:
            tokens.insert(0, tokenizer.qregex.sub("", word.lower()))
        position = 0
        for token in tokens:
            if token == "+":
                position += 1
                continue
            # If the token ends with . ? or !, strip the punctuation and skip an extra
            # position, unless it looks like a common abbreviation (mr., ms., mrs.)
            if token[-1] in ".?!" and token not in "mr. ms. mrs.":
                trie.addOccurence(token[:-1], doc[0], position)
                position += 2
                continue
            trie.addOccurence(token, doc[0], position)
            position += 1
def analyzeT(jackFile):
    tokenizedXmlFilename = os.path.splitext(jackFile)[0] + "T.xml.cmp"
    outputFile = open(tokenizedXmlFilename, 'w')
    outputFile.write("<tokens>\r\n")
    t = Tokenizer.Tokenizer(jackFile)
    t.advance()
    while t.hasMoreTokens():
        tokenType = t.tokenType()
        if tokenType == Tokenizer.TokenType.KEYWORD:
            outputFile.write("<keyword> " + t.keyword() + " </keyword>")
        elif tokenType == Tokenizer.TokenType.SYMBOL:
            outputFile.write("<symbol> " + charXMLify(t.symbol()) + " </symbol>")
        elif tokenType == Tokenizer.TokenType.IDENTIFIER:
            outputFile.write("<identifier> " + t.identifier() + " </identifier>")
        elif tokenType == Tokenizer.TokenType.INT_CONST:
            outputFile.write("<integerConstant> " + t.intVal() + " </integerConstant>")
        elif tokenType == Tokenizer.TokenType.STRING_CONST:
            outputFile.write("<stringConstant> " + t.stringVal() + " </stringConstant>")
        else:
            pdb.set_trace()
            print("Invalid")
        outputFile.write("\r\n")
        t.advance()
    outputFile.write("</tokens>\r\n")
    outputFile.close()
def main():
    program = sys.argv[1]
    # tk = Tokenizer.Tokenizer(program)  # Create program to be tokenized, based on the rules of the language
    files = ["validAllOneLine.txt", "validAllSimpleExpressions.txt",
             "validBooleanComplex.txt", "validComplexExpressions.txt",
             "validMinimalWhitespace.txt", "validTypicalIfElse.txt",
             "validTypicalLoop.txt"]
    # for i in range(len(files) - 1):
    #     program = "validTypicalIfElse.txt"
    tk = Tokenizer.Tokenizer(program)
    # Search through all the tokens and print out the code
    while tk.lcurrentToken() != 'EOF':
        tk.lprocessTokens()
        tk.lnextToken()
    tk.lprocessTokens()
    tk.lcloseFile()
    tk.tokens.append(tok.Token('end', 5))
    parser = nodes.ProgramNode()
    parser.parseProgram(tk)
    # print("PARSE COMPLETE!!")
    # print("==================")
    # print(nodes.symTab)
    parser.printProgram()
    parser.execProgram()
def labelQueryTerm(self, tweetsList):
    tokenizer = Tokenizer.Tokenizer()
    for tweet in tweetsList:
        termsInTweets = tokenizer.tokenize(tweet[2], 'simple')
        for term in termsInTweets:
            if term in self.queryDict:
                self.queryDict[term].append(tweet[1])
    self._scoreQueryTerms()
def run(code):
    Parser.tokens = Tokenizer.Tokenizer(code)
    Parser.tokens.select_next()
    r = Parser.parse_program()
    if Parser.tokens.actual.value == 'EOF':
        return r
    else:
        raise Exception(f'Expected EOF instead got {Parser.tokens.actual.value}')
def compile_file(jack_file_name, vm_file_name):
    jack_file = open(jack_file_name, 'r')
    tokenizer = Tokenizer.Tokenizer(jack_file)
    symbol_table = SymbolTable.SymbolTable()
    vm_file = open(vm_file_name, 'w')
    vm_writer = VMWriter.VMWriter(vm_file)
    compilation_engine = CompilationEngine(tokenizer, vm_writer, symbol_table)
    compilation_engine.compile_class()
def run(code):
    Parser.tokens = Tokenizer(code)
    result = Parser.program()
    if Parser.tokens.actual.type == 'EOF':
        return result
    else:
        raise SyntaxError(
            "Invalid Chain Exception (tip: do not put spaces between numbers)"
        )
def run(code):
    Parser.tokens = tkr.Tokenizer(code)
    Parser.tokens.selectNext()
    node = Parser.parseBlock()
    current = Parser.tokens.actual
    if current.type == "EOF":
        return node
    else:
        raise Exception("Tokenizer did not reach EOF")
def __init__(self, input, output):
    """
    :param input: input file name
    :param output: output file name where the text will be written
    """
    self.tokenizer = Tokenizer.Tokenizer(input)
    self.parsedrule = []
    self.output = open(output, "w")
    self.indent = ""
def tokenize(self, smiles):
    t = Tokenizer()
    smile_tokens = []
    for smile in smiles:
        smile_tokens.append(t.tokenize(smile))
    self.max_len = t.max_len
    self.one_hot_dict = t.one_hot_dict
    self.table_len = t.table_len
    self.table = t.table
    return smile_tokens
def __init__(self, depth):
    tokenizer = Tokenizer()
    tokenizer.on_word = lambda x: self.__on_word(x)
    self.tokenizer = tokenizer
    root = Node(Shared.make_key([""]))
    self.nodeByWords = {root.key: root}
    self.depth = depth
    self.history = [root]
    self.wordCount = 0
def tokenizeData(X_train, X_test, vocab):
    """Tokenize the train and test data."""
    # init tokenizer
    tokenizer = Tokenizer(num_words=vocab, filters='\t\n', char_level=False)
    # use tokenizer to split the vocabulary and index it
    tokenizer.fit_on_texts(X_train)
    # text to sequences
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    return X_train, X_test
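# A minimal usage sketch, assuming the Tokenizer above is the Keras text Tokenizer
# (keras.preprocessing.text.Tokenizer); the sample sentences are made up.
X_train = ["the cat sat on the mat", "the dog barked"]
X_test = ["the cat barked"]
X_train_seq, X_test_seq = tokenizeData(X_train, X_test, vocab=1000)
print(X_train_seq)  # e.g. [[1, 2, 3, 4, 1, 5], [1, 6, 7]] -- indices ordered by word frequency
print(X_test_seq)   # words outside the fitted vocabulary are dropped by default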
def __init__(self, input, output):
    """
    :param input: input file name
    :param output: output file name where the text will be written
    """
    self.tokenizer = Tokenizer.Tokenizer(input)
    self.writer = VMWriter.VMWriter(output)
    self.symbolTable = SymbolTable.SymbolTable()
    self.classname = ""
    self.name = ""
def analyze(jackFile):
    outputFilename = os.path.splitext(jackFile)[0] + ".xml.cmp"
    t = Tokenizer.Tokenizer(jackFile)
    ce = CompilationEngine.CompilationEngine(t, outputFilename)
    t.advance()
    if t.keyword() != "class":
        print("jack file does not have a class!")
        exit(1)
    ce.CompileClass()
def phraseQuery(phrase, trie):
    # First, tokenize the phrase with the same tokenizer used to build the index
    tokenizer = Tokenizer()
    phrase = tokenizer.stemQuery(phrase)
    result = []
    # Get the occurrence dictionary for each word and append it to result
    for word in phrase:
        occurrences = trie.getOccurrences(word)
        # Skip words with no occurrences at all
        if occurrences != []:
            result.append(occurrences)
    # If result is shorter than phrase, some word never occurs anywhere,
    # so there can be no phrase matches and we return an empty list
    if len(phrase) != len(result):
        return []
    # For each document containing the first word, check whether every following word
    # appears in the same document at the next position.
    # (Starting from the word with the fewest occurrences would be faster,
    # but that complicates things and goes beyond what is required.)
    result2 = set()
    requiredMatches = len(result) - 1
    # Same thing: bail out if there are no occurrences
    if result == []:
        return set()
    firstTerm = result[0]
    for docID in firstTerm:
        # For each docID, compare each position with position + 1 in the other dictionaries
        matches = 0
        positions = firstTerm[docID]
        # Skip documents with no recorded positions
        if not positions:
            continue
        for position in positions:
            position2 = position + 1
            for i in range(1, requiredMatches + 1):
                # The word must appear in the same document at the following position
                if docID in result[i] and position2 in result[i][docID]:
                    matches += 1
                    position2 += 1
        if matches == requiredMatches:
            result2.add(docID)
    return result2
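# Minimal sketch of the occurrence-dictionary shape phraseQuery appears to rely on:
# trie.getOccurrences(word) -> {docID: [positions]}. The sample index below is made up;
# a phrase match means the next word appears at the following position in the same document.
occurrences = {
    "quick": {1: [4], 2: [0]},
    "brown": {1: [5], 3: [2]},
}
matches = {doc for doc, pos_list in occurrences["quick"].items()
           if any(p + 1 in occurrences["brown"].get(doc, []) for p in pos_list)}
print(matches)  # {1}: only document 1 contains "quick brown" at consecutive positions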
def compileJack(jackFile):
    outputFilename = os.path.splitext(jackFile)[0] + ".vm"
    t = Tokenizer.Tokenizer(jackFile)
    vmw = VMWriter.VMWriter(outputFilename)
    ce = CompilationEngine.CompilationEngine(t, vmw)
    t.advance()
    if t.keyword() != "class":
        print("jack file does not have a class!")
        exit(1)
    ce.CompileClass()
    vmw.close()
def main(argv=None):
    t = Tokenizer.Tokenizer()
    e = Evaluator.Evaluator()
    if len(argv) == 1:
        try:
            argv = input("Enter an expression: ")
        except (IOError, ValueError):
            result = "You've entered an invalid expression!"
    else:
        argv = sys.argv[1]
    rpn = t.shunting(argv)
    result = e.evaluate(rpn)
    print(result)
def main():
    # fileReader = FileReader.FileReader("testFile.txt")
    fileReader = FileReader.FileReader("Main.jack")
    tokenizer = Tokenizer.Tokenizer(fileReader)
    while not tokenizer.done:
        tokenizer.getToken()
    tokenizer.fillDict()
    # tokenizer.printFormatted()
    tokenizer.printTokens()
    tokenizer.toFile("MainT.xml")

    fileReader = FileReader.FileReader("SquareGame.jack")
    tokenizer = Tokenizer.Tokenizer(fileReader)
    while not tokenizer.done:
        tokenizer.getToken()
    tokenizer.fillDict()
    # tokenizer.printFormatted()
    tokenizer.printTokens()
    tokenizer.toFile("SquareGameT.xml")
def __init__(self, inp):
    self.toker = Tokenizer(inp)
    self.saved = []  # pushed-back tokens
    self.first = True
    self.currentFile = ""
    self.currentSymTable = {}
    self.localCounter = 0
    self.className = ""
    self.classTable = {}
    self.ifCounter = 0
    self.elseCounter = 0
    self.whileCounter = 0
    self.fieldCounter = 0
    self.staticCounter = 0
def __init__(self, str):
    self.tokenizer = Tokenizer.Tokenizer(str)
    self.kOperator = "operator"
    self.kNumber = "number"
    self.unitRegex = re.compile("[a-zA-Z]")
    self.regexes = {
        re.compile("[-+\\*/\\^()]|in$|to$"): self.parseOperator,
        re.compile("[-+]?[0-9]+\\.?[0-9]*"): self.parseNumber,
        re.compile("[-+]?[0-9]*\\.?[0-9]+"): self.parseNumber,
        self.unitRegex: self.parseUnit,
    }
    self.parsingConversion = False
def __init__(self, batch_size, tokenizer_max_words=10000, pad_len=None,
             validation_ratio=0.2, data_loc='data/', retokenize=False):
    """
    :param batch_size: batch size
    :param tokenizer_max_words: vocabulary size passed to the Tokenizer
    :param pad_len: None = dynamic padding, otherwise passed to the tokenizer to pad sentences to equal length
    :param validation_ratio: fraction of the files held out for validation
    :param data_loc: location of the raw data, used to load the Wikipedia files
    :param retokenize: whether the tokenizer should be rebuilt from scratch
    """
    self.batch_size = batch_size
    self.pad_len = pad_len
    # Because of compute limits we fix the file lists instead of loading all
    # Wikipedia files from data_loc:
    # self.wiki_files = sorted(os.path.join(data_loc, f)
    #                          for f in os.listdir(data_loc) if f.endswith(".txt"))
    # self.wiki_files = self.wiki_files[0:5]
    self.train_files = ["data/wiki_sentences2.txt", "data/wiki_sentences20.txt",
                        "data/wiki_sentences32.txt", "data/wiki_sentences140.txt"]
    # choose validation files randomly:
    # val_size = round(validation_ratio * len(self.wiki_files))
    self.val_files = ["data/wiki_sentences98.txt"]
    # for i in range(val_size):
    #     k = np.random.randint(0, len(self.wiki_files))
    #     # remove the k-th filename from train_files and append it to the validation files
    #     self.val_files.append(self.train_files.pop(k))
    self.train_data = wl.Wikiloader(files=self.train_files, title="train")
    self.train_len = len(self.train_data)
    self.val_data = wl.Wikiloader(files=self.val_files, title="test")
    self.val_len = len(self.val_data)
    # initialise the Tokenizer and fit it on the training data if it has no vocabulary yet
    self.tokenizer = tk.Tokenizer(max_words=tokenizer_max_words, retokenize=retokenize)
    if len(self.tokenizer.word2index) == 0:
        self.tokenizer.fit_on_texts(self.train_data)
def handle_include(self, tokens):
    (ttype, val, curline) = tokens.get()
    if ttype == T_STRING:
        fname = val.replace('"', '')
        self.named.includes.append(fname)
        report_info(' include file "%s"...' % (fname))
        if os.path.exists(fname):
            self.parse(Tokenizer(fname))
        else:
            report_error('? missing include file "%s"' % (fname))
    else:
        bail('? dude. where is the filename string after "include"?')
    (ttype, val, curline) = tokens.get()
    if ttype != T_SEMI:
        bail('? need a semicolon to end the "include" (%d: %s)' % (ttype, val))
    return
def constructFile(input):
    tokens = Tokenizer(input)
    parseTree = ParseTree()
    parseProgram(parseTree.head, tokens)
    # Create a list of all the code terms to append to our file
    nodeList = []
    codeTermList = []
    parseTree.writeTree(nodeList)
    # With the full node list, use the emitter to fill out the actual C code term list
    for node in nodeList:
        term = getEmitCodeTerm(node)
        # Some keywords return None, so treat those terms as an empty string
        if term is None:
            term = ""
        codeTermList.append(term)
    # Strip tokens that would be considered invalid in C
    cleanTermList(codeTermList)
    # Write the code terms into a file
    file = open("./RunnableFiles/main.c", "w")
    for idx, term in enumerate(codeTermList):
        print(term)
        file.write(term)
def addDocsToTries(docs, trieTitle, trieBody):
    # Takes a list of documents (ID, title, body) and adds them to the given tries
    tokenizer = Tokenizer()
    for doc in docs:
        tokens = tokenizer.stemDocument(doc[2])
        # Add the title words to the title trie
        position = 0
        for word in doc[1]:
            trieTitle.addOccurence(tokenizer.qregex.sub("", word.lower()), doc[0], position)
            position += 1
        # Add the body tokens to the body trie
        position = 0
        for token in tokens:
            # If the token ends with . ? or !, strip the punctuation and skip an extra
            # position, unless it looks like a common abbreviation (mr., ms., mrs.)
            if token[-1] in ".?!" and token not in "mr. ms. mrs.":
                trieBody.addOccurence(token[:-1], doc[0], position)
                position += 2
                continue
            trieBody.addOccurence(token, doc[0], position)
            position += 1
def __init__(self):
    # initiate classes
    self.lexicons = Lexicons.Lexicons()
    self.tokenizer = Tokenizer.Tokenizer()
    self.symbolEffect = SymbolEffect.SymbolEffect()
    self.word2vecModeling = Word2VecModeling.Word2VecModel()
    self.getCosineScore = GetCosineSimilarity.GetCosineSimilarity()
    self.queryTermsScoring = QueryTermsScoring.QueryTermsScoring()
    self.dictClassZero = DictionaryOfTerms.DictionaryOfTerms()
    self.dictClassOne = DictionaryOfTerms.DictionaryOfTerms()
    self.dictClassTwo = DictionaryOfTerms.DictionaryOfTerms()
    self.dictClassThr = DictionaryOfTerms.DictionaryOfTerms()
    self.rndSelection = RandomTermSelection.RandomTermSelection()
    # initiate variables
    self.trainTweets = []
    self.trainTokenizedTweet = []
    self.testTweets = []
    self.testTokenizedTweet = []
    self.classOneTweetToken = []
    self.classTwoTweetToken = []
    self.classThrTweetToken = []
    self.classForTweetToken = []
    self.tfidf_scores = dict()