def tokenController(self):
    self.token_stream = self.getTokenStream()
    print(self.filecontent)
    # print('\n\nTOKEN STREAM GENERATED:', self.token_stream)
    # print('Now calling tokenizer')
    self.tokenizer = tokenizer()
    self.symbol_table = self.tokenizer.tokenize(self.token_stream)
    # print('\n\nSymTab: ', self.symbol_table)
    return self.symbol_table
def __init__(self, path, indices=None, walk=5, blocklen=10):
    self.path = path
    self.walk = walk              # stride between successive blocks
    self.maxblocks = blocklen     # tokens per block
    self.indices = indices
    self.token_id = tokenizer()
    # Special ids live just above the tokenizer's vocabulary;
    # 0 is reserved for padding.
    self.MAXTOKEN = max(self.token_id.values())
    self.CLS = self.MAXTOKEN + 2
    self.MASK = self.MAXTOKEN + 3
    self.PAD = 0
    self.binlist = os.listdir(self.path)
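# The rest of this dataset class is not shown in this snippet. As a rough
# illustration, this is how CLS/MASK/PAD ids like the ones above are commonly
# applied when slicing a token stream into fixed-length blocks. The helper
# name, the stride handling, and the 15% mask rate are assumptions, not the
# original code.
import random

def make_blocks(token_ids, blocklen, walk, cls_id, mask_id, pad_id,
                mask_prob=0.15):
    """Cut `token_ids` into CLS-prefixed, PAD-filled blocks of `blocklen`,
    randomly replacing real tokens with MASK at rate `mask_prob`."""
    blocks = []
    for start in range(0, max(len(token_ids), 1), walk):
        window = token_ids[start:start + blocklen - 1]  # leave room for CLS
        block = [cls_id] + window
        block += [pad_id] * (blocklen - len(block))     # right-pad
        block = [mask_id if t not in (cls_id, pad_id)
                 and random.random() < mask_prob else t
                 for t in block]
        blocks.append(block)
    return blocks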
import logging
import sys

from tokenizer import *

for row in sys.stdin:
    parts = row.split("\t", 1)
    filename = parts[0]
    try:
        tokens = tokenizer(parts[1])
    except IndexError:
        logging.warning("Found no tab in the input for \n" + filename[:50]
                        + "\n...skipping row")
        continue
    out = " ".join(tokens.tokenize())
    print(out)  # stdout handles the encoding in Python 3
def tag_locations(geonames, text, tokenizer=tokenizer.ner_tokenizer,
                  out_graph=None):
    """Return a list of tuples where the first element is the location,
    the second element is the associated geoname id and the third element
    is a score between -1 and 1.
    """
    # Extract location names from the text, keeping only tokens that
    # resolve to at least one geoname id.
    logging.info('tag_locations: extracting candidate tokens from text')
    locations = [location for location in tokenizer(text)
                 if geonames.gid(location)]

    logging.info('tag_locations: building sub-graphs')
    subgraphs = [H for H in (graph_location(geonames, pos, location)
                             for pos, location in enumerate(locations))
                 if H is not None]
    gids = {H.graph['prefix']: set(gid for (_, gid) in H.nodes_iter())
            for H in subgraphs}

    logging.info('tag_locations: union of sub-graphs')
    G = nx.Graph()
    for H in subgraphs:
        G.add_nodes_from(H.nodes_iter(data=True))
        G.add_edges_from(H.edges_iter(data=True))

    logging.info('tag_locations: joining sub-graphs')

    def join_graphs(G1, G2):
        p1 = G1.graph['prefix']
        p2 = G2.graph['prefix']
        for gid in (gids[p1] & gids[p2]):
            G.add_edge((p1, gid), (p2, gid), weight=1.0)

    last_window = []
    for window in window_iter(subgraphs, 5):
        G1 = window[0]
        for G2 in window[1:]:
            join_graphs(G1, G2)
        last_window = window
    window = last_window
    if len(window) > 1:
        for G1, G2 in itertools.combinations(window[1:], 2):
            join_graphs(G1, G2)

    logging.info('tag_locations: Hopfield network activation')
    activation, index = propagate(G)
    for node, data in G.nodes_iter(data=True):
        data['score'] = float(activation[index[node]])
    if out_graph is not None:
        nx.write_gexf(G, out_graph)

    geotags = []
    for pos, location in enumerate(locations):
        best_score = None
        best_gid = None
        for gid in geonames.gid(location):
            try:
                score = activation[index[(pos, gid)]]
            except KeyError:
                continue
            # Explicit None check: comparing a float against None raises
            # a TypeError in Python 3.
            if best_score is None or score > best_score:
                best_gid = gid
                best_score = score
        if best_gid:
            geotags.append((location, best_gid, best_score))
            logging.info(u'{0} -> {1} (country: {2}, score: {3:.4f})'.format(
                location, best_gid, geonames.country(best_gid), best_score))
    logging.info('tag_locations: done')
    return geotags
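# window_iter is used by tag_locations but not defined in this snippet; a
# minimal sliding-window sketch consistent with how it is consumed above
# (an assumption about the original helper, not a copy of it).
def window_iter(seq, size):
    """Yield consecutive overlapping windows of up to `size` items."""
    seq = list(seq)
    if not seq:
        return
    if len(seq) <= size:
        yield seq           # short inputs produce one window
        return
    for i in range(len(seq) - size + 1):
        yield seq[i:i + size]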
from pickle import dump

from data import *
from tokenizer import *
from model import *
from caption import *

# training dataset
filename = 'Dataset/Flickr8k_text/Flickr_8k.trainImages.txt'
trainData = loadSet(filename)
trainDescriptions = loadDescriptions('Preprocessed Features/descriptions.txt',
                                     trainData)
trainFeatures = loadFeatures('Preprocessed Features/features.pkl', trainData)

tokenizer = tokenizer(trainDescriptions)
dump(tokenizer, open('Preprocessed Features/tokenizer.pkl', 'wb'))
vocabSize = len(tokenizer.word_index) + 1
maxLength = maxLength(trainDescriptions)


def dataGenerator(descriptions, photos, tokenizer, maxLength):
    while True:
        for key, descriptionList in descriptions.items():
            try:
                photo = photos[key][0]
            except KeyError:
                # skip captions whose photo has no extracted features,
                # instead of silently swallowing every error
                continue
            inImage, inSequence, outWord = createSequence(
                tokenizer, maxLength, descriptionList, photo, vocabSize)
            yield [[inImage, inSequence], outWord]
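# createSequence is imported from the captioning modules above and is not
# shown here; this sketch reflects the usual shape of such a function in
# Keras captioning pipelines. The Keras imports and the internals are
# assumptions, not the project's own implementation.
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def createSequence(tokenizer, maxLength, descriptionList, photo, vocabSize):
    """Turn each caption into (photo, partial sequence) -> next-word pairs."""
    inImages, inSequences, outWords = [], [], []
    for description in descriptionList:
        seq = tokenizer.texts_to_sequences([description])[0]
        for i in range(1, len(seq)):
            inSeq = pad_sequences([seq[:i]], maxlen=maxLength)[0]
            outSeq = to_categorical([seq[i]], num_classes=vocabSize)[0]
            inImages.append(photo)
            inSequences.append(inSeq)
            outWords.append(outSeq)
    return np.array(inImages), np.array(inSequences), np.array(outWords)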
import sys
from pathlib import Path

from tokenizer import *
from jack_to_xml_parser import *

try:
    PROJECT_PATH = Path(sys.argv[1])
except IndexError:
    PROJECT_PATH = Path(input("type file path: "))
filename = PROJECT_PATH.parent / (PROJECT_PATH.stem + "_new.xml")

if __name__ == "__main__":
    with open(PROJECT_PATH) as jack_file:
        content = jack_file.read()
    instructions = extract_instructions_only(content)
    token_sample = tokenizer(instructions)
    compile_to_xml(token_sample, filename)
def tokenize(self, text):
    return [str(w) for w in tokenizer(text)]
def main():
    # First, remove all the comments
    with open('finalv1.txt') as source_file, \
            open('finalv2.txt', mode='w+', encoding='utf-8') as new_file:
        comment_remover(source_file, new_file)

    # Clean the spaces
    content = clean_text('finalv2.txt')
    token_list = tokenizer(content)
    if token_list == -1:
        exit(1)

    terminal_list = [
        'P', 'Q', 'R', 'S',
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        'PROGRAM', 'BEGIN', 'END.', 'INTEGER', 'PRINT',
        '+', '-', '/', '*', '(', ')', ',', ';', '=', ':', '$',
    ]
    digits = [str(d) for d in range(10)]

    def on(symbols, production):
        """Map every symbol in `symbols` to the same production string."""
        return {s: production for s in symbols}

    def row(overrides):
        """One LL(1) table row: every terminal (plus '.') defaults to the
        error marker 'aaa'; grammar productions ('lambda' = epsilon)
        override it."""
        entries = {t: 'aaa' for t in terminal_list + ['.']}
        entries.update(overrides)
        return entries

    predict_table = {
        'W': row({'PROGRAM': 'PROGRAM B ; D BEGIN I END.'}),
        'B': row(on('PQRS', 'UC')),
        'C': row({**on('PQRS', 'UC'), **on(digits, 'OC'),
                  **on(['+', '-', '/', '*', ')', ',', ';', '='], 'lambda')}),
        'D': row({'INTEGER': 'H : G ;'}),
        'G': row(on('PQRS', 'BZ')),
        'Z': row({',': ', G', ';': 'lambda'}),
        'H': row({'INTEGER': 'INTEGER'}),
        'I': row({**on('PQRS', 'JV'), 'PRINT': 'JV'}),
        'V': row({**on('PQRS', 'I'), 'PRINT': 'I', 'END.': 'lambda'}),
        'J': row({**on('PQRS', 'A'), 'PRINT': 'K'}),
        'K': row({'PRINT': 'PRINT ( B ) ;'}),
        'A': row(on('PQRS', 'B = E ;')),
        'E': row(on(list('PQRS') + digits + ['+', '-', '('], 'TX')),
        'X': row({'+': '+TX', '-': '-TX', ')': 'lambda', ';': 'lambda'}),
        'T': row(on(list('PQRS') + digits + ['+', '-', '('], 'FY')),
        'Y': row({'/': '/FY', '*': '*FY',
                  **on(['+', '-', ')', ';'], 'lambda')}),
        'F': row({**on('PQRS', 'B'), **on(digits + ['+', '-'], 'N'),
                  '(': '( E )'}),
        'N': row(on(digits + ['+', '-'], 'LOM')),
        'L': row({**on(digits, 'lambda'), '+': '+', '-': '-'}),
        'M': row({**on(digits, 'OM'),
                  **on(['+', '-', '/', '*', ')', ';'], 'lambda')}),
        'O': row({d: d for d in digits}),
        'U': row({s: s for s in 'PQRS'}),
    }

    parse_status = predictive_parser(token_list, predict_table,
                                     terminal_list, starting_symbol='W')
    if parse_status is True:
        code_generator(content.split('\n'), 'main.cpp')
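# predictive_parser and code_generator are defined elsewhere in this project.
# Below is a minimal sketch of the kind of table-driven LL(1) loop that would
# consume predict_table, under stated assumptions: 'lambda' means epsilon,
# 'aaa' marks a table error, spaced productions list their symbols, and
# unspaced ones such as 'TX' pack single-character symbols. It is an
# illustration, not the original implementation.
def predictive_parser(tokens, table, terminals, starting_symbol):
    def expand(production):
        if production == 'lambda':
            return []                      # epsilon: derive nothing
        if production in terminals or ' ' in production:
            return production.split()      # single terminal or spaced RHS
        return list(production)            # e.g. 'TX' -> ['T', 'X']

    stack = [starting_symbol]
    i = 0
    while stack:
        top = stack.pop()
        look = tokens[i] if i < len(tokens) else '$'
        if top in terminals or top == '.':
            if top != look:
                return False               # terminal mismatch
            i += 1
        else:
            production = table[top].get(look, 'aaa')
            if production == 'aaa':
                return False               # no production for this lookahead
            # push the right-hand side so its leftmost symbol is on top
            stack.extend(reversed(expand(production)))
    # accept once the stack is empty and all input (bar a trailing '$')
    # has been consumed
    return i >= len(tokens) or tokens[i:] == ['$']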