def __init__(self, input_file, reuse):
    """Initialise per-compilation state: token stream, allocator and scopes."""
    self.tokens = Tokens(input_file)    # token stream read from the input file
    self.allocator = Allocator(reuse)   # allocator; `reuse` selects its reuse policy
    self.scope = {}                     # symbols visible in the current scope
    self.function = None                # function currently being processed, if any
    self.loop = None                    # innermost enclosing loop, if any
    self.structs = []                   # struct definitions collected so far
def retrieval_cosine(self, query):
    """Score documents against `query` with cosine similarity.

    Returns a list of (doc_id, score) pairs sorted by score, descending.
    Scores are rounded to 2 decimal places.
    """
    scores = dict()
    tokens = Tokens()
    # Normalise the query into terms and count term frequencies.
    query_terms = tokens.edit_query(query)
    query_weights = dict(collections.Counter(query_terms).items())
    # NOTE(review): the loop variable below shadows the `query` parameter.
    for query in query_weights:
        query_weights[query] = tf(query_weights[query])
    for query in query_weights:
        term = query
        # Postings for this term: presumably (doc_id, term_count) pairs — confirm in self.L.
        doc_ids = self.L(term)
        for doc in doc_ids:
            # NOTE(review): tf() is applied here to query_weights[term], which was
            # already tf-transformed in the loop above — confirm the double
            # application is intentional.
            if doc[0] not in scores:
                scores[doc[0]] = tf(doc[1]) * tf(query_weights[term])
            else:
                new_value = scores[doc[0]] + tf(doc[1]) * tf(
                    query_weights[term])
                scores[doc[0]] = new_value
    # Normalise each accumulated dot product by document and query norms.
    for id in scores:
        norm_value = self.inverted_index.get_norms(str(id))
        scores[id] = round(
            scores[id] / (norm_value * self.get_query_norms(query_weights)), 2)
    # Convert to a ranked list of (doc_id, score) pairs.
    scores = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return scores
def set_index_dic(self, my_tweets_dic):
    """Build and return the inverted index dictionary, sorted by key.

    NOTE(review): the `my_tweets_dic` parameter is never used; the method
    reads `self.my_tweets_dic` instead — confirm this is intentional.
    """
    for tweet_id in self.my_tweets_dic:
        tokens = Tokens()
        # Tokenise the tweet text with stopwords removed.
        tokens.remove_stopwords(self.my_tweets_dic[tweet_id])
        current_id = tweet_id
        # Record each remaining token against this tweet id in self.dic.
        for word in tokens.reduced_tokens:
            self.replace_in_dic(word, current_id)
    # Return a copy of the accumulated index with keys in sorted order.
    sorted_dic = dict(sorted(self.dic.items()))
    return sorted_dic
def __init__(self, feature_for_separate_model=False):
    """Set up the dictionary, regexes and gazetteer lists for feature extraction.

    feature_for_separate_model -- when True, use the reduced PARTIAL_PIPELINE
    instead of the full STANDARD_PIPELINE.
    """
    super(FeatureGenerator, self).__init__()
    self.dictionary = enchant.Dict('en_US')  # English spell-check dictionary
    self.token_generator = Tokens() # Connection established!
    self.record = None   # most recently processed record (raw text or token list)
    self.tokens = []     # tokens of the current record
    self.features = None # list of list of features for every name; e.g. [[1,1,1,1],[...], ...]
    # Regex setup
    self.NUM_REGEX = re.compile('\d')
    self.CHAR_DIGIT_MIX_REGEX = re.compile('((^[a-zA-Z]+\d{4}$)|(^[a-zA-Z]+\d{2}$))|((^\d{4}[a-zA-Z]+$)|(^\d{2}[a-zA-Z]+$))', re.MULTILINE)
    self.NAME_ABBREV_REGEX = re.compile('([A-Z]\.-[A-Z]\.)|([A-Z]\.-[A-Z])|([A-Z]\.-)|(([A-Z]\.)+)|(O\'[A-Z][a-z]+)') #C.P.; C.-C.; O'Reilly
    self.PAGE_NO_REGEX = re.compile('\d+-\d+')
    # Gazzatte setup
    self.DELIMITERS = [',', '.', ]
    self.LBRACKET = ['(', '[', '{', '<', ]
    self.RBRACKET = [')', ']', '}', '>', ]
    self.APOSTROPHES = ["'s", "'re", "'d", ]
    self.QUOTATIONS = ['"', "''", "``", ]
    # NOTE(review): 'Janurary' is misspelled, so January tokens can never
    # match — confirm before fixing, in case trained models depend on it.
    self.MONTHS = ['Janurary', 'February', 'March', 'April','May','June','July','August','September','October','November','December']
    self.NAME_LIST = [item.strip() for item in open('data/name.lst','r').readlines()]
    self.VENUE_LIST = [item.strip() for item in open('data/venue.lst','r').readlines()]
    self.ORDINAL_LIST = [item.strip() for item in open('data/ordinal.lst','r').readlines()]
    # self.CITY_LIST = [item.strip() for item in open('data/cities.lst','r').readlines()]
    self.COUNTRY_LIST = [item.strip() for item in open('data/countries.lst','r').readlines()]
    if feature_for_separate_model:
        self.pipeline = PARTIAL_PIPELINE
    else:
        self.pipeline = STANDARD_PIPELINE
def remove_cattributes(self, bot, update, args):
    """Telegram command handler: remove one cattribute for the calling user.

    Usage: /rmcattributes <cattributename>.  Replies to the user in all
    paths; unexpected errors are forwarded to the bot's error channel.
    """
    try:
        with closing(self.conn.cursor()) as cur:
            uid = update.message.from_user.id
            try:
                cattribute_to_remove = str.lower(args[0])
            except IndexError:
                # Command was sent with no argument.
                reply_message = "Can't remove an empty string! Please use the following format: /rmcattributes cattributename"
                update.message.reply_text(reply_message, parse_mode='HTML')
            else:
                cur.execute("""SELECT * FROM Attribute WHERE telegram_id = %s AND LOWER(attribute_name) = LOWER(%s) """, (uid, cattribute_to_remove,))
                if cur.rowcount == 0:
                    reply_message = "Can't find that cattribute! please use /listcattributes to show your cattributes"
                    update.message.reply_text(reply_message, parse_mode='HTML')
                else:
                    cur.execute("""DELETE FROM Attribute WHERE telegram_id = %s AND LOWER (attribute_name) = LOWER(%s) """, (uid, cattribute_to_remove,))
                    # Bug fix: corrected user-visible typo "sucessfully" -> "successfully".
                    reply_message = "".join([str.lower(cattribute_to_remove), " has been successfully removed"])
                    update.message.reply_text(reply_message, parse_mode='HTML')
    except Exception as e:
        # Forward the full traceback to the configured error channel.
        catcherror = traceback.format_exc()
        bot.sendMessage(chat_id=Tokens().error_channel(), text=catcherror, parse_mode='HTML')
def process_multiple(log, do_fetch=True, do_parse=True, do_merge=True):
    """Fetch, parse and merge invoice detail data.

    Returns True on success; False if the API extraction fails.  When the
    fetch succeeds but finds no invoices, returns True early and skips the
    parse and merge steps.
    """
    root = config["data-dir"]
    if do_fetch:
        tokens = Tokens()
        api = API(tokens, log)
        # Clear previous JSON output before fetching fresh data.
        util.delete_files(root + '/processing/invoices', '*.json')
        success, invoice_cnt = api.fetch_invoice_details(hours_delta=30, tz_offset=7)
        if success and invoice_cnt > 0:
            log.write(
                "INFO api invoices extraction succeeded {:,} invoices saved to : {}"
                .format(invoice_cnt, '/processing/invoices'))
        elif success and invoice_cnt == 0:
            # Nothing new in the refresh window: nothing to parse or merge.
            log.write(
                "INFO api no invoices extracted (no new/updated invoices in refresh period)"
            )
            return True
        else:
            log.write(
                "ERROR api invoices extraction failed {:,} invoices saved to : {}"
                .format(invoice_cnt, '/processing/invoices'))
            return False
    if do_parse:
        # Clear previous CSV output before re-parsing.
        util.delete_files(root + '/processing/invoices', '*.csv')
        parser = Parser(log)
        parser.parse('invoices-line-items')
    if do_merge:
        merger = Merger(log)
        merger.merge_invoice_delta()
    return True
def process_single(log, do_fetch=True, do_parse=True):
    """Fetch and/or parse the standard data sets (items, themes, contacts, invoices).

    Returns True on success, False as soon as any fetch fails.
    """
    root = config["data-dir"]
    if do_fetch:
        api = API(Tokens(), log)
        # Clear stale JSON output, then pull each data set in order.
        util.delete_files(root + '/processing/default', '*.json')
        for endpoint in ("items", "branding-themes", "contacts", "invoices"):
            if not api.fetch_data(endpoint):
                return False
    if do_parse:
        # Clear stale CSV output, then parse each data set in order.
        util.delete_files(root + '/processing/default', '*.csv')
        parser = Parser(log)
        for name in ("branding-themes", "items", "contacts", "invoices"):
            parser.parse(name)
    return True
def add_cattributes(self,bot,update,args):
    """Telegram command handler: add one cattribute for the calling user.

    Usage: /addcattributes <cattributename>.  Duplicate (case-insensitive)
    cattributes are rejected; unexpected errors go to the error channel.
    """
    try:
        with closing(self.conn.cursor()) as cur:
            uid = update.message.from_user.id
            try:
                cattribute_to_add = str.lower(args[0])
            except IndexError:
                # Command was sent with no argument.
                reply_message = "Can't add an empty string! Please use the following format: /addcattributes cattributename"
                update.message.reply_text(reply_message,parse_mode='HTML')
            else:
                # Case-insensitive duplicate check before inserting.
                cur.execute("""SELECT * FROM Attribute WHERE telegram_id = %s AND LOWER(attribute_name) = LOWER(%s) """,(uid,cattribute_to_add,))
                if cur.rowcount > 0:
                    reply_message = "This cattribute is already added!"
                    update.message.reply_text(reply_message,parse_mode='HTML')
                else:
                    cur.execute("""INSERT INTO Attribute VALUES(%s,%s)""",(uid,cattribute_to_add))
                    reply_message = "".join([str.lower(cattribute_to_add)," has been added to the table"])
                    update.message.reply_text(reply_message,parse_mode='HTML')
    except Exception as e:
        # Forward the full traceback to the configured error channel.
        catcherror = traceback.format_exc()
        bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
def __init__(self, record):
    """Tokenize `record` and run the author-name feature pipeline over it."""
    super(AuthorFeatureBuilder, self).__init__()
    self.record = record
    self.tokens = Tokens(record).tokens
    self.num_tokens = len(self.tokens)
    self.features = None  # list of list of features for every name; e.g. [[1,1,1,1],[...], ...]
    self.NUM_REGEX = re.compile('\d')
    self.DELIMITERS = [
        ',',
        '.',
        ';',
    ]
    self.NAME_LIST = [
        item.strip() for item in open('data/name.lst', 'r').readlines()
    ]
    # Ordered feature-function names applied to every token — presumably
    # resolved via getattr by the base class's build(); confirm in superclass.
    self.pipeline = [
        'f_is_capitalized',
        'f_is_all_upper',
        'f_is_english',
        'f_is_punctuation',
        'f_is_sequential_punctuation',
        'f_has_digit',
        'f_is_all_digit',
        'f_is_in_namelist',
        'f_is_fname_abbrev',
        'f_is_preceeded_by_delimiter',
        'f_is_followed_by_delimiter',
        'f_is_an_and_between_two_names',
    ]
    self.build()
def decode_without_constraints(self, segment):
    # Debug helper (Python 2): decode one raw segment with the unconstrained
    # HMM, record the results, and print each feature vector with its
    # short tag and token.
    print segment
    observation_sequence, decoded_sequence = self.HMMentire.decode_without_constraints(
        segment)
    self.observations_raw.append(segment)
    self.observation_sequences.append(observation_sequence)
    self.labels.append(decoded_sequence)
    for vector, decoding, token in zip(observation_sequence, decoded_sequence,
                                       Tokens(segment).tokens):
        # Map numeric states 0-5 to short display tags.
        if decoding == 0:
            label = 'FN'
        elif decoding == 1:
            label = 'LN'
        elif decoding == 2:
            label = 'DL'
        elif decoding == 3:
            label = 'TI'
        elif decoding == 4:
            label = 'VN'
        elif decoding == 5:
            label = 'YR'
        else:
            # Unexpected state: flag it loudly in the output.
            label = str(decoding) + ', PROBLEM'
        print vector, '\t', label, '\t', token
    print '\n\n'
def main(wordCorpus):
    """Generate and save one word-cloud image per class of the given corpus.

    wordCorpus -- 'twenty-news' (20 class groups) or 'acl-imdb' (2 groups).
    Raises ValueError for any other corpus name (previously an unknown name
    fell through and crashed later with a NameError on groupIndices).
    Images are written to ./results/<className>.jpg.
    """
    tokenType = 'stopped'
    if wordCorpus == 'twenty-news':
        groupIndices = list(range(20))
    elif wordCorpus == 'acl-imdb':
        groupIndices = [0, 1]
    else:
        raise ValueError('unknown corpus: {}'.format(wordCorpus))
    for groupIndex in groupIndices:
        tokensLists, className = Tokens(wordCorpus).getTokens(
            tokenType, groupIndex)
        # Flatten the per-document token lists into one token stream.
        flat_list = [
            tokens for tokensList in tokensLists for tokens in tokensList
        ]
        text = ' '.join(flat_list)
        wordcloud = WordCloud(max_font_size=40,
                              width=600,
                              height=400,
                              background_color='white',
                              max_words=200,
                              relative_scaling=1.0).generate_from_text(text)
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        wordcloud.to_file('./results/' + className + '.jpg')
def decode(self, segment):
    """Decode one raw segment and split its tokens into citation fields.

    Returns (authors, title, venue, years): the first three space-joined
    strings, the last a de-duplicated list of year tokens.
    """
    observation_sequence, decoded_sequence = self.HMMentire.decode(segment)
    self.observations_raw.append(segment)
    self.observation_sequences.append(observation_sequence)
    self.labels.append(decoded_sequence)
    raw_tokens = Tokens(segment).tokens
    # Route each token to its field by predicted state:
    # 0/1 -> author, 3 -> title, 4 -> venue, 5 -> year; state 2 is dropped.
    authors, title, venue, years = [], [], [], []
    buckets = {0: authors, 1: authors, 3: title, 4: venue, 5: years}
    for position, state in enumerate(decoded_sequence):
        target = buckets.get(state)
        if target is not None:
            target.append(raw_tokens[position])
    return ' '.join(authors), ' '.join(title), ' '.join(venue), list(
        set(years))
def translate(self, readline, result=None, no_imports=None):
    """Translate a token stream (read via `readline`) and return the result tokens.

    readline   -- a readline-style callable fed to generate_tokens.
    result     -- passed through to the Tracker (initial result container).
    no_imports -- when literally True, skip prepending self.import_tokens.
    """
    # Tracker to keep track of information as the file is processed
    self.tokens = Tokens(self.default_kls)
    self.tracker = Tracker(result, self.tokens, self.wrapped_setup)
    # Add import stuff at the top of the file
    if self.import_tokens and no_imports is not True:
        self.tracker.add_tokens(self.import_tokens)
    # Looking at all the tokens
    with self.tracker.add_phase() as tracker:
        for tokenum, value, (_, scol), _, _ in generate_tokens(readline):
            self.tracker.next_token(tokenum, value, scol)
    # Add attributes to our Describes so that the plugin can handle some nesting issues
    # Where we have tests in upper level describes being run in lower level describes
    if self.with_describe_attrs:
        self.tracker.add_tokens(self.tracker.make_describe_attrs())
    # If setups should be wrapped, then do this at the bottom
    if self.wrapped_setup:
        self.tracker.add_tokens(self.tracker.wrapped_setups())
    # Add lines to bottom of file to add __testname__ attributes
    self.tracker.add_tokens(self.tracker.make_method_names())
    # Return translated list of tokens
    return self.tracker.result
def find_venue_boundary_tokens(self):
    """Count (Python 2) the first venue-labelled token of each venue run.

    Scans every stored segment; each time a label-4 (venue) run starts, the
    lowercased first token is tallied in `recorder`.  Prints the tallies and
    returns the dict.
    """
    recorder = {}
    for raw_segment, observation_sequence, label_sequence in zip(
            self.raw_segments, self.observation_sequences,
            self.label_sequences):
        first_target_label_flag = True
        tokens = Tokens(raw_segment).tokens
        for token, feature_vector, label in zip(tokens, observation_sequence,
                                                label_sequence):
            # First meet a VN label
            if label == 4 and first_target_label_flag:
                key = token.lower()
                # After .lower(), islower() is False only for tokens with no
                # cased characters (digits/punctuation) — skip those.
                if not key.islower():
                    continue
                if recorder.has_key(key):
                    recorder[key] += 1
                else:
                    recorder[key] = 1
                first_target_label_flag = False
            elif (first_target_label_flag is False) and label in [0, 1, 3]:
                # Left the venue run; re-arm for the next one.
                first_target_label_flag = True
    for k, v in recorder.iteritems():
        print k, '\t', v
    return recorder
def Cryptokitties():
    """Start the live Cryptokitties bot: schedule broadcast jobs and poll."""
    print("Cryptokitties online")
    updater = Updater(token=Tokens.bot_token("live"))
    j = updater.job_queue
    # Run both broadcast jobs every 150 seconds, starting immediately.
    j.run_repeating(Commands.broadcast, 150, 0)
    j.run_repeating(Commands.kleongbroadcast, 150, 0)
    updater.start_polling()
    # Bug fix: `updater.idle` was referenced without parentheses, so it was
    # never called and the function returned immediately instead of blocking
    # until the process is interrupted.
    updater.idle()
def __init__(self):
    """Create the praw Reddit client with credentials pulled from Tokens()."""
    creds = Tokens()
    self.reddit = praw.Reddit(
        client_id=creds['client_id'],
        client_secret=creds['client_secret'],
        user_agent='my-user-agent',
        username=creds['username'],
        password=creds['password'],
    )
def get_prev_token(self):
    """Step the cursor back and re-lex the previous token.

    Raises InterpreterException if the two code words at the new position do
    not form a valid token.
    """
    # Move back past the current token to the previous one.
    self.pos -= 2
    # Rebuild a token from the two adjacent code words at the new position.
    tok = Tokens(self.code[self.pos] + " " + self.code[self.pos+1])
    if tok.type == TokensType.INVALID:
        raise InterpreterException(f"Invalid token at position : {self.pos}")
    return tok
def __init__(self, input_file, reuse, initialize_memory):
    """Initialise compiler state for one input file.

    reuse             -- passed to the Allocator (resource reuse policy).
    initialize_memory -- stored flag controlling memory initialisation.
    """
    self.tokens = Tokens(input_file)
    self.allocator = Allocator(reuse)
    self.initialize_memory = initialize_memory
    # Symbol/scope bookkeeping.
    self.symbols_defined_in_current_scope = {}
    self.symbols_defined_stack = []
    self.scope = {}
    self.scope_stack = []
    # Current compilation context.
    self.function = None
    self.loop = None
    self.structs = []
def getX(wordCorpus, tokenType, groupIndices):
    """Concatenate token matrices for the given group indices.

    Returns (XAll, indexList): XAll is a numpy array of per-document token
    arrays; indexList maps each class name to its {'start','end'} row range.
    """
    all_rows = []
    index_ranges = {}
    offset = 0
    for group_index in groupIndices:
        rows, class_name = Tokens(wordCorpus).getTokens(tokenType, group_index)
        index_ranges[class_name] = {'start': offset, 'end': offset + len(rows)}
        all_rows = all_rows + rows
        offset = offset + len(rows)
    # rows: Docs. columns: words
    XAll = np.array([np.array(row) for row in all_rows])
    return XAll, index_ranges
def cancel(self,bot,update):
    """Conversation fallback: delete the user's registration and end the flow."""
    try:
        with closing(self.conn.cursor()) as cur:
            uid = update.message.from_user.id
            message = "And here I was, thinking we could be friends :("
            update.message.reply_text(message,parse_mode='HTML')
            # Remove the user row and all of their stored cattributes.
            cur.execute("""DELETE FROM User WHERE telegram_id = %s""",(uid,))
            cur.execute("""DELETE FROM Attribute WHERE telegram_id = %s""",(uid,))
            return ConversationHandler.END
    except Exception as e:
        # Forward the full traceback to the configured error channel.
        catcherror = traceback.format_exc()
        bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
def token_interface():
    """Flask view: show the token form (GET) or create a token (POST).

    On POST, persists a new Tokens row for the chosen department/stream and
    renders the generated-token page; otherwise renders the entry form.
    """
    if request.method == 'POST':
        phone_number = request.form['phone_number']
        dept_id = request.form['departments']
        stream_id = request.form['streams']
        # Next token number for the current day.
        token_day_number = _get_current_token()
        attending = 1
        token = Tokens(token_day_number=token_day_number,
                       phone_number=phone_number,
                       department=dept_id,
                       stream=stream_id)
        db.session.add(token)
        db.session.commit()
        # Resolve display names for the confirmation page.
        dept_name = Departments.query.filter_by(id=dept_id).first().name
        stream_name = Streams.query.filter_by(id=stream_id).first().name
        return render_template('generated_token.html',
                               dept_name=dept_name,
                               token_number=token_day_number,
                               attending=attending,
                               token=token,
                               stream_name=stream_name)
    departments = Departments.query.all()
    streams = Streams.query.all()
    return render_template('token_interface.html',
                           departments=departments,
                           streams=streams)
def run(self):
    """Re-decode every stored segment (Python 2) and print old vs new labels.

    Also feeds each (feature_vector, old_label, token) triple into
    self.feature_entity_list.
    """
    i = 0
    self.new_labels = []
    for raw_segment, label_sequence in zip(self.raw_segments,
                                           self.label_sequences):
        # Decode returns a pair; index [1] is the new label sequence.
        new_labels = self.hmm_new.decode(raw_segment)[1]
        self.new_labels.append(new_labels)
        tokens = Tokens(raw_segment).tokens
        feature_vectors = FeatureGenerator(raw_segment).features
        print i, ': ', raw_segment
        for token, old_label, new_label, feature_vector in zip(
                tokens, label_sequence, new_labels, feature_vectors):
            print to_label(old_label), '\t', to_label(
                new_label), '\t', token
            self.feature_entity_list.add_entity(
                feature_vector, old_label, token)  #???? Old label first
        print '\n'
        i += 1
def forget(self,bot,update):
    """Telegram command handler: delete the calling user's data, if any."""
    try:
        with closing(self.conn.cursor()) as cur:
            uid = update.message.from_user.id
            # Check whether the user is registered at all.
            cur.execute("""SELECT * FROM User WHERE telegram_id = %s""",(uid,))
            if cur.rowcount == 0:
                message = "Can't delete what doesn't exist, man"
                update.message.reply_text(message,parse_mode='HTML')
            else:
                # Remove the user row and all of their stored cattributes.
                cur.execute("""DELETE FROM User WHERE telegram_id = %s""",(uid,))
                cur.execute("""DELETE FROM Attribute WHERE telegram_id = %s""",(uid,))
                message = "Oh, I'll tell you all about it when I see you again"
                update.message.reply_text(message,parse_mode='HTML')
    except Exception as e:
        # Forward the full traceback to the configured error channel.
        catcherror = traceback.format_exc()
        bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
def list_cattributes(self,bot,update):
    """Telegram command handler: reply with the user's stored cattributes."""
    try:
        with closing(self.conn.cursor()) as cur:
            uid = update.message.from_user.id
            cur.execute("""SELECT * FROM Attribute WHERE telegram_id = %s""",(uid,))
            if cur.rowcount == 0:
                message = "You have no cattributes listed!"
                update.message.reply_text(message,parse_mode='HTML')
            else:
                cattributes = cur.fetchall()
                # Column 1 of each Attribute row is the attribute name.
                catlist = [x[1] for x in cattributes]
                message = "".join(['Your current cattributes are: ',(", ".join(catlist))])
                update.message.reply_text(message,parse_mode='HTML')
    except Exception as e:
        # Forward the full traceback to the configured error channel.
        catcherror = traceback.format_exc()
        bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
def getX(wordCorpus, tokenType, listOfClasses):
    """Concatenate token matrices for a comma-separated list of class indices.

    listOfClasses -- e.g. "0,1,2"; each index is fetched via Tokens().getTokens.
    Returns (XAll, indexList) where indexList maps class name to its
    {'start','end'} row range inside XAll.
    """
    XAll = []
    indexList = {}
    groupIndices = listOfClasses.split(',')
    start = 0
    for groupIndex in groupIndices:
        X, className = Tokens(wordCorpus).getTokens(tokenType, groupIndex)
        end = start + len(X)
        indexList[className] = {'start': start, 'end': end}
        logger.info('True Group Index {}, classname: {}'.format(
            groupIndex, className))
        logger.info('Count {}, start - End Indices {} , {}'.format(
            len(X), start, end))
        XAll = XAll + X
        start = end
    XAll = np.array([np.array(xi) for xi in XAll])  # rows: Docs. columns: words
    logger.info('indexList{}'.format(indexList))
    return XAll, indexList
def Cryptokitties():
    """Wire up all Telegram handlers, schedule the alert job, and run the bot."""
    print("Cryptokitties online")
    updater = Updater(token=Tokens().bot_token())
    dispatcher = updater.dispatcher
    # registering for users to a database.
    conv_handler = ConversationHandler(
        entry_points=[CommandHandler('register', Commands().register)],
        states={
            GENERATION: [MessageHandler(Filters.text, Commands().generation)],
            COOLDOWN: [MessageHandler(Filters.text, Commands().cooldown)],
            OFFSTART: [MessageHandler(Filters.text, Commands().offstart)],
            OFFEND: [MessageHandler(Filters.text, Commands().offend)],
            ATTLIST: [MessageHandler(Filters.text, Commands().attribute_list)]
        },
        fallbacks=[CommandHandler('cancel', Commands().cancel)],
        # NOTE(review): '******' looks like a redacted/placeholder value —
        # per_user normally takes a bool; confirm the intended setting.
        per_user='******'
    )
    dispatcher.add_handler(conv_handler, 1)
    forget_handler = CommandHandler('forget', Commands().forget)
    dispatcher.add_handler(forget_handler)
    alert_handler = CommandHandler('alert', Commands().alert)
    dispatcher.add_handler(alert_handler)
    list_handler = CommandHandler('listcattributes', Commands().list_cattributes)
    dispatcher.add_handler(list_handler)
    remove_cattribute = CommandHandler('rmcattributes', Commands().remove_cattributes, pass_args=True)
    dispatcher.add_handler(remove_cattribute)
    add_cattribute = CommandHandler('addcattributes', Commands().add_cattributes, pass_args=True)
    dispatcher.add_handler(add_cattribute)
    ########################################################
    # Alert jobs
    ########################################################
    j = updater.job_queue
    # Broadcast alerts every 600 seconds, starting immediately.
    job_minute = j.run_repeating(Commands().user_broadcast, 600, 0)
    updater.start_polling()
    # Bug fix: `updater.idle` was referenced without parentheses, so it was
    # never called and the function returned immediately after start_polling
    # instead of blocking until the process is interrupted.
    updater.idle()
def refresh_tokens(tokens: Tokens) -> Tokens:
    """Exchange the refresh token for a fresh Tokens pair at the HH OAuth endpoint.

    On any non-200 response, logs the error and returns the original tokens
    unchanged.
    """
    logging.debug("Try to refresh tokens")
    payload = {
        "grant_type": "refresh_token",
        "refresh_token": tokens.refresh_token
    }
    response = requests.post(url=URI_HH_OAUTH_TOKEN,
                             headers=make_auth_http_headers(),
                             data=payload)
    logging.info(response.text)
    if response.status_code != 200:
        logging.error("status_code: %s; response: %s", response.status_code,
                      response.text)
        return tokens
    body = response.json()
    refreshed = Tokens(access_token=body["access_token"],
                       refresh_token=body["refresh_token"])
    logging.debug("Tokens: %s", refreshed)
    return refreshed
def attribute_list(self, bot, update):
    """Conversation step: collect cattributes one at a time until 'end' is sent.

    Returns ConversationHandler.END when the user finishes, otherwise ATTLIST
    to stay in this state.
    """
    try:
        with closing(self.conn.cursor()) as cur:
            uid = update.message.from_user.id
            if str.lower(update.message.text) == "end":
                message = "Thanks for registering =)\n"
                # Bug fix: this line previously used '=' instead of '+=',
                # overwriting (and losing) the thank-you line above.
                message += "If you want to toggle 10 minute scans, please do a /alert"
                update.message.reply_text(message, parse_mode='HTML')
                return ConversationHandler.END
            else:
                # Store the attribute exactly as typed.
                cur.execute("""INSERT INTO Attribute VALUES(%s,%s)""", (uid, update.message.text,))
                message = update.message.text
                message += " has been added as an attribute. Please enter the next attribute \n"
                message += "If you're done with adding your cattributes, please reply with end"
                update.message.reply_text(message, parse_mode='HTML')
                return ATTLIST
    except Exception as e:
        # Forward the full traceback to the configured error channel.
        catcherror = traceback.format_exc()
        bot.sendMessage(chat_id=Tokens().error_channel(), text=catcherror, parse_mode='HTML')
def offend(self,bot,update):
    """Conversation step: store the user's ending offset, then ask for cattributes.

    Re-prompts (returns OFFEND) until an integer is received; on success
    returns ATTLIST.
    """
    try:
        with closing(self.conn.cursor()) as cur:
            uid = update.message.from_user.id
            try:
                int(update.message.text)
            except ValueError:
                message = "Please send me an integer :)"
                update.message.reply_text(message,parse_mode='HTML')
                return OFFEND
            else:
                cur.execute("""UPDATE User SET offset_end = %s WHERE telegram_id = %s""",(update.message.text,uid,))
                # NOTE(review): the two sentences below are concatenated with
                # no space/newline between them in the sent message.
                message = "Thank you, now, please key in a cattribute(one cattribute at a time only!)"
                message += "This bot will match the cattributes you are looking for"
                update.message.reply_text(message,parse_mode='HTML')
                return ATTLIST
    except Exception as e:
        # Forward the full traceback to the configured error channel.
        catcherror = traceback.format_exc()
        bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
def cooldown(self, bot, update):
    """Conversation step: store the user's cooldown index, then ask for the start offset.

    Re-prompts (returns COOLDOWN) until an integer is received; on success
    returns OFFSTART.
    """
    try:
        with closing(self.conn.cursor()) as cur:
            uid = update.message.from_user.id
            try:
                int(update.message.text)
            except ValueError:
                message = "Please send me an integer :)"
                update.message.reply_text(message, parse_mode='HTML')
                return COOLDOWN
            else:
                cur.execute("""UPDATE User SET cooldown_index = %s WHERE telegram_id = %s""", (update.message.text, uid,))
                message = "Fantastic. Now, may I please have the offset starting point?"
                # Bug fix: corrected user-visible typo "recomend" -> "recommend"
                # and added the missing separator between the two sentences.
                message += " This bot will scan the api starting at the offset given. We recommend starting at 0"
                update.message.reply_text(message, parse_mode='HTML')
                return OFFSTART
    except Exception as e:
        # Forward the full traceback to the configured error channel.
        catcherror = traceback.format_exc()
        bot.sendMessage(chat_id=Tokens().error_channel(), text=catcherror, parse_mode='HTML')
def generation(self,bot,update):
    """Conversation step: store the user's generation index, then ask for cooldown.

    Re-prompts (returns GENERATION) until an integer is received; on success
    returns COOLDOWN.
    """
    try:
        with closing(self.conn.cursor()) as cur:
            uid = update.message.from_user.id
            try:
                int(update.message.text)
            except ValueError:
                message = "Please send me an integer :)"
                update.message.reply_text(message,parse_mode='HTML')
                return GENERATION
            else:
                cur.execute("""UPDATE User SET generation_index = %s WHERE telegram_id = %s""",(update.message.text,uid,))
                # NOTE(review): the two sentences below are concatenated with
                # no space/newline between them in the sent message.
                message = "Fantastic. Now, may I please have a cooldown index?"
                message += "This bot will scan for the cooldown index less than the number that you input"
                update.message.reply_text(message,parse_mode='HTML')
                return COOLDOWN
    except Exception as e:
        # Forward the full traceback to the configured error channel.
        catcherror = traceback.format_exc()
        bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
def __init__(self, record):
    """Tokenize `record` and run the venue feature pipeline over it."""
    super(VenueFeatureBuilder, self).__init__()
    self.record = record
    self.tokens = Tokens(record).tokens
    self.num_tokens = len(self.tokens)
    self.features = None  # filled in by build()
    self.NUM_REGEX = re.compile('\d')
    self.CHAR_DIGIT_MIX_REGEX = re.compile(
        '((^[a-zA-Z]+\d{4}$)|(^[a-zA-Z]+\d{2}$))|((^\d{4}[a-zA-Z]+$)|(^\d{2}[a-zA-Z]+$))',
        re.MULTILINE)
    self.DELIMITERS = [
        ',',
        '.',
        ';',
    ]
    self.VENUE_LIST = [
        item.strip() for item in open('data/venue.lst', 'r').readlines()
    ]
    self.ORDINAL_LIST = [
        item.strip() for item in open('data/ordinal.lst', 'r').readlines()
    ]
    # Ordered feature-function names applied to every token — presumably
    # resolved via getattr by the base class's build(); confirm in superclass.
    self.pipeline = [
        'f_is_capitalized',
        'f_is_all_upper',
        'f_is_english',
        'f_has_both_char_and_digit',
        'f_is_ordinal',
        'f_is_punctuation',
        'f_has_digit',
        'f_is_all_digit',
        'f_is_in_venuelist',
        'f_is_preceeded_by_delimiter',
        'f_is_followed_by_delimiter',
        'f_is_followed_by_year',
    ]
    self.build()
class FeatureGenerator(object):
    """Token feature extractor for citation segments (Python 2).

    @param: record -> piece of raw_text, or a list of tokens

    Each f_* method maps a token index to 0/1; build() runs the configured
    pipeline of f_* methods over every token of a record.
    """
    def __init__(self, feature_for_separate_model=False):
        super(FeatureGenerator, self).__init__()
        self.dictionary = enchant.Dict('en_US')  # English spell-check dictionary
        self.token_generator = Tokens() # Connection established!
        self.record = None   # most recently processed record (raw text or token list)
        self.tokens = []     # tokens of the current record
        self.features = None # list of list of features for every name; e.g. [[1,1,1,1],[...], ...]
        # Regex setup
        self.NUM_REGEX = re.compile('\d')
        self.CHAR_DIGIT_MIX_REGEX = re.compile('((^[a-zA-Z]+\d{4}$)|(^[a-zA-Z]+\d{2}$))|((^\d{4}[a-zA-Z]+$)|(^\d{2}[a-zA-Z]+$))', re.MULTILINE)
        self.NAME_ABBREV_REGEX = re.compile('([A-Z]\.-[A-Z]\.)|([A-Z]\.-[A-Z])|([A-Z]\.-)|(([A-Z]\.)+)|(O\'[A-Z][a-z]+)') #C.P.; C.-C.; O'Reilly
        self.PAGE_NO_REGEX = re.compile('\d+-\d+')
        # Gazzatte setup
        self.DELIMITERS = [',', '.', ]
        self.LBRACKET = ['(', '[', '{', '<', ]
        self.RBRACKET = [')', ']', '}', '>', ]
        self.APOSTROPHES = ["'s", "'re", "'d", ]
        self.QUOTATIONS = ['"', "''", "``", ]
        # NOTE(review): 'Janurary' is misspelled, so January tokens can never
        # match f_is_month — confirm before fixing.
        self.MONTHS = ['Janurary', 'February', 'March', 'April','May','June','July','August','September','October','November','December']
        self.NAME_LIST = [item.strip() for item in open('data/name.lst','r').readlines()]
        self.VENUE_LIST = [item.strip() for item in open('data/venue.lst','r').readlines()]
        self.ORDINAL_LIST = [item.strip() for item in open('data/ordinal.lst','r').readlines()]
        # self.CITY_LIST = [item.strip() for item in open('data/cities.lst','r').readlines()]
        self.COUNTRY_LIST = [item.strip() for item in open('data/countries.lst','r').readlines()]
        if feature_for_separate_model:
            self.pipeline = PARTIAL_PIPELINE
        else:
            self.pipeline = STANDARD_PIPELINE

    def close_connection(self):
        """Close the tokenizer's connection."""
        self.token_generator.close_connection()

    def build(self, record):
        """Run every pipeline feature function over each token of `record`.

        record -- raw text (tokenized here) or an already-tokenized list.
        Returns (and stores in self.features) one feature vector per token.
        """
        self.record = record
        features = []
        need_tokenize = True
        if type(self.record) is list:
            need_tokenize = False
        else:
            need_tokenize = True
        # record raw texts
        if need_tokenize:
            response_obj = self.token_generator.tokenize(self.record)
            self.tokens = response_obj['tokens']
        else:
            # Already tokenized input
            self.tokens = self.record
        self.num_tokens = len(self.tokens) # count how many tokens are there in this piece of text.
        for i in range(self.num_tokens):
            sub_features = []
            for pipe in self.pipeline:
                # Resolve the feature function by name and apply it to token i.
                action = getattr(self, pipe)
                sub_features.append(action(i))
            features.append(sub_features)
        self.features = features
        return features

    def token_length(self, record):
        """Number of tokens the tokenizer produces for `record`."""
        return self.token_generator.token_length(record)

    def print_features(self):
        # Debug dump: one feature vector and its token per line.
        for i in range(self.num_tokens):
            print self.features[i], '\t\t', self.tokens[i]

    ################################### Feature functions ###################################
    # Feature output format:
    # [
    #     [([1,0,0,1], 1), ([1,1,1,1], 0), (...)...], <-- One piece of training sample (x, y) where x=x1x2x3...xm, y=y1y2y3...ym <-- a sentence representation in feature vectors, in sequence
    #     [.......................], <-- another sentence, parallel with the previous sentence, independent processed
    #     ...
    # ]
    # Assume segment is space-delimited, so it's a feature for the segmentm challenge will be tokenizing
    ################################### Local Features #####################################

    # C.B. or C.-C
    def f_is_name_abbrev(self, idx):
        token = self.tokens[idx]
        if self.NAME_ABBREV_REGEX.match(token) is None:
            return 0
        return 1

    def f_is_apostrophes(self, idx):
        token = self.tokens[idx]
        return int(token in self.APOSTROPHES)

    def f_is_capitalized(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token[0].isupper())

    def f_is_all_upper(self, idx):
        # Tokens of length <= 2 never count (avoids initials like "US").
        token = self.tokens[idx]
        if len(token) <= 2:
            return 0
        return int(token.isupper())

    def f_is_english(self, idx):
        # Spell-check against the en_US dictionary; single chars excluded.
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(self.dictionary.check(token.lower()) and len(token) > 1)

    def f_has_both_char_and_digit(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if self.CHAR_DIGIT_MIX_REGEX.search(token) is None:
            return 0
        return 1

    def f_is_delimiter(self, idx):
        token = self.tokens[idx]
        if len(token) != 1:
            return 0
        return int(token in self.DELIMITERS)

    def f_is_quotation(self, idx):
        token = self.tokens[idx]
        return int(token in self.QUOTATIONS)

    def f_is_punctuation(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(len(token) == 1 and token in punctuation)

    # def f_is_sequential_punctuation(self, idx): #e.g. K.C.-C. Chang
    #     token = self.tokens[idx]
    #     if len(token) <= 1:
    #         return 0
    #     ret = 1
    #     for t in token:
    #         if t not in punctuation:
    #             ret = 0
    #             break
    #     return ret

    def f_has_digit(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if self.NUM_REGEX.search(token) is None:
            return 0
        return 1

    def f_is_all_digit(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token.isdigit())

    def f_is_possible_page_number(self, idx):
        # e.g. "123-145" (digits-dash-digits).
        token = self.tokens[idx]
        if self.PAGE_NO_REGEX.match(token) is None:
            return 0
        return 1

    def f_is_month(self, idx):
        token = self.tokens[idx]
        return int(token in self.MONTHS)

    def f_is_possible_year(self, idx):
        # Four-digit number between 1980 and the current year.
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token.isdigit() and len(token)==4 and int(token)>= 1980 and int(token)<=datetime.now().year)

    ################################### Dictionary Features ################################
    def f_is_in_namelist(self, idx):
        token = self.tokens[idx].encode('ascii', 'ignore')
        if len(token) == 0:
            return 0
        return int(token.lower().strip() in self.NAME_LIST)

    def f_is_ordinal(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token.lower().strip() in self.ORDINAL_LIST)

    # Also handled some of the common venue tokens that are also common in English????
    # TODO: more delicate
    def f_is_in_venuelist(self, idx):
        token = self.tokens[idx].encode('ascii', 'ignore')
        if len(token) == 0:
            return 0
        if (idx-1) >= 0:
            prev_token = self.tokens[idx-1]
        else:
            prev_token = ''
        # Special case handling
        if token.strip() in ['In', 'Appear', 'Appears', 'Appeared', ] and len(prev_token)>0 and prev_token in ['.', ',', ';', '(', ]:
            return 1
        return int(token.lower().strip() in (self.VENUE_LIST + self.ORDINAL_LIST + self.COUNTRY_LIST) )

    ################################### Global Features ####################################
    def f_has_lbracket_before(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if (idx-1) >= 0:
            prev_token = self.tokens[idx-1]
        else:
            return 0
        return int( prev_token in self.LBRACKET )

    def f_has_rbracket_after(self, idx):
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens:
            next_token = self.tokens[idx+1]
        else:
            return 0
        return int( next_token in self.RBRACKET )

    def f_has_quotation_before(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if (idx-1) >= 0:
            prev_token = self.tokens[idx-1]
        else:
            return 0
        return int( prev_token in self.QUOTATIONS )

    def f_has_quotation_after(self, idx):
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens:
            next_token = self.tokens[idx+1]
        else:
            return 0
        return int( next_token in self.QUOTATIONS )

    #????
    def f_is_possible_volume(self, idx):
        # A number fully enclosed in brackets, e.g. "( 12 )".
        token = self.tokens[idx]
        if ((idx-1) >=0) and ((idx+1)<self.num_tokens):
            prev_token = self.tokens[idx-1]
            next_token = self.tokens[idx+1]
            return int(prev_token in self.LBRACKET and next_token in self.RBRACKET and token.isdigit())
        else:
            return 0

    # ???? necessary?
    def f_is_at_second_half_of_string(self, idx):
        token = self.tokens[idx]
        return int(idx > self.num_tokens/2)

    def f_has_delimiter_before(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if (idx-1) >= 0:
            prev_token = self.tokens[idx-1]
        else:
            return 0
        return int(len(prev_token)==1 and prev_token in self.DELIMITERS)

    def f_has_delimiter_after(self, idx):
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens:
            next_token = self.tokens[idx+1]
        else:
            return 0
        return int( len(next_token)==1 and next_token in self.DELIMITERS)

    #????
    def f_is_an_and_between_two_names(self, idx):
        # NOTE(review): next_token is assigned but never used — only the
        # PRECEDING token is checked (capitalized and not English).
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens and (idx-1)>=0:
            next_token = self.tokens[idx+1]
            prev_token = self.tokens[idx-1]
        else:
            return 0
        return int(token.strip().lower()=='and' and self.f_is_capitalized(idx-1) and (self.f_is_english(idx-1)==0))

    def f_is_followed_by_year(self, idx):
        # Next token is a 2- or 4-digit number while this one is not numeric.
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens:
            next_token = self.tokens[idx+1]
        else:
            return 0
        return int((len(next_token)==2 or len(next_token)==4) and next_token.isdigit() and not token.isdigit())

    # Addressing the possible new notions in the title of publications
    def f_is_possible_new_notion(self, idx):
        token = self.tokens[idx]
        if (idx+2) < self.num_tokens:
            next_token = self.tokens[idx+1]
            next_next_token = self.tokens[idx+2]
        else:
            return 0
        p1 = re.compile(r'^[A-Z][a-z0-9]+[A-Z][a-z0-9]+$', re.MULTILINE)
        p2 = re.compile(r'^[A-Z][a-z0-9]+$', re.MULTILINE)
        p3 = re.compile(r'^[A-Z][a-z0-9]+[A-Z][a-z0-9]+[A-Z][a-z0-9]+$', re.MULTILINE)
        p4 = re.compile(r'^[a-z0-9]+$', re.MULTILINE)
        p5 = re.compile(r'[A-Z]*[A-Za-z]+-[A-Za-z]+') #specific terminology ???? content-aware; Group-By
        # Xxxxxx, XxxxxXxxxx, XxxxXxxxXxxx, Xxxx xxxx, Xxxx Xxxx, XXXX
        pattern_1 = token.isupper() and next_token==':'
        pattern_2 = (p1.match(token) is not None) and next_token==':'
        pattern_3 = (p2.match(token) is not None) and next_token==':'
        pattern_4 = (p3.match(token) is not None) and next_token==':'
        pattern_5 = (p2.match(token) is not None) and (p2.match(next_token) is not None) and next_next_token==':'
        pattern_6 = (p2.match(token) is not None) and (p4.match(next_token) is not None) and next_next_token==':'
        pattern_7 = p5.match(token) is not None
        return int(pattern_1 or pattern_2 or pattern_3 or pattern_4 or pattern_5 or pattern_6 or pattern_7)

    def f_is_possible_boundary(self, idx): #check if period. Pending feature
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens and (idx-1)>=0:
            next_token = self.tokens[idx+1]
            prev_token = self.tokens[idx-1]
        else:
            return 0
        return int( (token == '.' and prev_token.islower() and next_token[0].isupper()) or (token[-1]=='.' and token[0].islower() and next_token[0].isupper()) )
def get_training_samples(url): log_err('\tGetting Training sample') raw_results = router(url) log_err('\tData retrieved. Preprocessing...') observation_list = [] label_list = [] records = [] feature_generator = FeatureGenerator() token_generator = Tokens() for raw_result in raw_results: tmp_record = '' tmp_observation_list = [] tmp_label_list = [] authors = raw_result['authors'] title = raw_result['title'] title_copy = raw_result['title'] try: venue = raw_result['conference name'] venue_copy = raw_result['conference name'] except: venue = '' venue_copy = '' try: venue = raw_result['journal name'] venue_copy = raw_result['journal name'] except: venue = '' venue_copy = '' if len(venue) > 0: try: volume = raw_result['volume'] except: volume = '' try: issue = raw_result['issue'] except: issue = '' try: page = raw_result['page'] except: page = '' venue += ' ' + volume + ' ' + issue + ' ' + page venue_copy += ' ' + volume + ' ' + issue + ' ' + page date = raw_result['publication date'][:4] # FN: 0 # LN: 1 # DL: 2 # TI: 3 # VN: 4 # DT: 5 # Author -> Title -> ... # authors for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) tmp_label_list += [1,2] # title title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) tmp_label_list += [2] # venue if len(venue) > 0: venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # Title -> Author -> ... 
tmp_record = '' tmp_observation_list = [] tmp_label_list = [] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) #=================================Variations of authors================================= # Changing order, inserting dot, and probably insert comma as delimiter inside of names # This part of variations is very sensitive to what sample source to choose from, # for example, Google scholar is the current source of samples, and on gscholar, # most names are in format of JW Han. <-- Prior knowledge # Read more Learn more Change the Globe !!! log_err('\tGenerating multiple cases for name variations... ') # ================================A. B tmp_record = '' tmp_observation_list = [] tmp_label_list = [] # authors for author in authors: if len(author) == 0: continue #???? BUG!!!! split() doesn't mean tokenization author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order tokens if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. 
B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order to if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! 
tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B, # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! 
tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A., # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! 
tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A. # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. 
tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! 
# Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ # Period Case!!! log_err('\tGenerating multiple cases for period as DL... ') # Author -> Title -> ... # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title title = title_copy + ' . ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: venue = venue_copy + ' . 
' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================A. B tmp_record = '' tmp_observation_list = [] tmp_label_list = [] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order tokens if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. 
B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order to if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! 
tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B, # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! 
tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A., # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! 
tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A. # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. 
tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! 
# Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # =============================================================================Verbose: Print the training set for record, observation, label in zip(records, observation_list, label_list): for rr, oo, ll in zip(token_generator.tokenize(record)['tokens'], observation, label): if ll == 0: ll = 'FN' elif ll == 1: ll = 'LN' elif ll == 2: ll = 'DL' elif ll == 3: ll = 'TI' elif ll == 4: ll = 'VN' elif ll == 5: ll = 'DT' print oo, '\t', ll.encode('utf-8'), '\t', rr.encode('utf-8') print '\n\n' return observation_list, label_list
def to_str(self, rels):
    """Render extracted relations as 'subj (tag) - verb (tag) - obj (tag) | ...'.

    Each relation dict has 'v' (verb) and 'p' entries, and optionally 's'
    (subject); every entry is a dict with 'word' and 'tag' keys.
    """
    pieces = []
    for rel in rels:
        if rel.get("s", False):
            pieces.append("{0} ({1}) - ".format(rel["s"]["word"], rel["s"]["tag"]))
        pieces.append("{0} ({1}) - {2} ({3}) | ".format(
            rel["v"]["word"], rel["v"]["tag"],
            rel["p"]["word"], rel["p"]["tag"]))
    return "".join(pieces)

if __name__ == "__main__":
    # CLI: tokenize each line of the given file, POS-tag it, and print the
    # relations found on that line.
    from tokens import Tokens
    from pos_tags import PosTags
    import sys

    source_path = sys.argv[1]
    tokenizer = Tokens()
    tagger = PosTags()
    relations = Relations()
    with open(source_path) as handle:
        for line in handle:
            rels = relations.find(tagger.tag(tokenizer.tokenize(line)))
            print(relations.to_str(rels))
import os
import requests
import json
import datetime
from tokens import Tokens

# Environment variables must be set with your tokens
# Module-level setup: the Slack user token is read once at import time.
tokens = Tokens()
USER_TOKEN_STRING = tokens.get_user_token()


class User:
    """One Slack user's exercise activity: totals, counts, and history."""

    def __init__(self, user_id):
        # The Slack ID of the user
        self.id = user_id
        # The username (@username) and real name
        # (fetchNames is presumably defined elsewhere in this class and
        # queries the Slack API -- not visible in this chunk; confirm.)
        self.username, self.real_name = self.fetchNames()
        # A list of all exercises done by user
        self.exercise_history = []
        # A record of all exercise totals (quantity)
        self.exercises = {}
        # A record of exercise counts (# of times)
        self.exercise_counts = {}
        # A record of past runs
        self.past_workouts = {}
# ---------------------------------------------------------------------------
# NOTE(review): the physical-line formatting of this class has been mangled:
# each line below contains many collapsed statements.  The code is left
# byte-identical here; only this comment header was added.
#
# Parser: recursive-descent parser that turns a tokenized C input file into
# a tree of statement/expression objects (Process, Function, Block, If,
# Loop, Switch, Binary, ...).  Scoping uses two dicts plus parallel stacks:
#   - self.scope: every name currently visible
#   - self.symbols_defined_in_current_scope: names defined in the innermost
#     scope only, so add_to_scope can reject same-scope redefinition
# enter_scope()/leave_scope() push and pop copies of these around function
# bodies and blocks.  parse_function() allocates argument and return-value
# storage only on a function's first mention (later declarations merely
# type-check against it) and records the last function parsed as self.main.
# parse_binary_expression() encodes C operator precedence via the
# operator_precedence table, and substitute_function() swaps operations
# that are hard in hardware (divide, modulo, float compares) for calls to
# builtin library functions.
#
# NOTE(review): in parse_declaration, the type-mismatch error path reads
# `intitializer.type_()` -- a misspelling of `initializer`; that branch
# will raise NameError if ever hit.  Should be fixed to `initializer`.
# NOTE(review): `.decode("string_escape")` and commented-out `print`
# statements indicate this is Python 2 code -- confirm before porting.
# ---------------------------------------------------------------------------
class Parser: """Turn the C input file into a tree of expressions and statements.""" def __init__(self, input_file, reuse, initialize_memory): self.symbols_defined_in_current_scope = {} self.symbols_defined_stack = [] self.scope = {} self.scope_stack = [] self.function = None self.loop = None self.tokens = Tokens(input_file) self.allocator = Allocator(reuse) self.structs = [] self.initialize_memory = initialize_memory def add_to_scope(self, name, obj): #if the name has already been defined in the current scope, error out. if name in self.symbols_defined_in_current_scope: self.tokens.error("%s is already defined"%name) self.symbols_defined_in_current_scope[name]=obj self.scope[name]=obj def enter_scope(self): self.symbols_defined_stack.append(self.symbols_defined_in_current_scope) #stack holds everything that has been defined in a underlying scope self.scope_stack.append(copy(self.scope)) self.symbols_defined_in_current_scope = {} def leave_scope(self): self.symbols_defined_in_current_scope = self.symbols_defined_stack.pop() self.scope = self.scope_stack.pop() def parse_process(self): process = Process() process.allocator = self.allocator process.inputs = [] process.outputs = [] process.functions = [] while not self.tokens.end(): if self.tokens.peek() == "struct": self.parse_define_struct() elif self.tokens.peek() == "typedef": self.parse_typedef_struct() else: function = self.parse_function() if function is not None: process.functions.append(function) process.main = self.main return process def parse_type_specifier(self): type_specifiers = [] while self.tokens.peek() in types + self.structs + storage_specifiers: type_specifiers.append(self.tokens.get()) signed = True if "unsigned" in type_specifiers: signed = False if "signed" in type_specifiers: self.tokens.error("Cannot be signed and unsigned") size = 2 if "long" in type_specifiers: if "short" in type_specifiers: self.tokens.error("Cannot be long and short") size = 4 type_ = "int" for i in type_specifiers: if i 
in self.structs: type_ = i size = 2 signed = False if "float" in type_specifiers: if "short" in type_specifiers: self.tokens.error("Float cannot be short") if "long" in type_specifiers: self.tokens.error("Float cannot be long (but double can)") if "unsigned" in type_specifiers: self.tokens.error("Float cannot be unsigned") type_ = "float" size = 4 signed = True const = False if "const" in type_specifiers: const = True if "void" in type_specifiers: type_ = "void" size = 2 signed = False return type_, size, signed, const def parse_argument(self): type_, size, signed, const = self.parse_type_specifier() if type_ in ["void"]: self.tokens.error("argument cannot be void") else: argument = self.tokens.get() if type_ in self.structs: declaration = self.scope[type_] else: if self.tokens.peek() == "[": self.tokens.expect("[") self.tokens.expect("]") declaration = ArrayDeclaration( self.allocator, 2, type_+"[]", type_, size, signed, None, self.initialize_memory) else: declaration = VariableDeclaration( self.allocator, None, argument, type_, size, signed, const) instance = declaration.instance() return (argument, instance) #self.add_to_scope(argument,instance) #return instance.reference() def parse_function(self): type_, size, signed, const = self.parse_type_specifier() name = self.tokens.get() #check if it is a global declaration if self.tokens.peek() != "(": return self.parse_global_declaration(type_, size, signed, const, name) #otherwise continue parsing a function function_was_already_declared = False if name in self.scope: #something already has the same name function_was_already_declared = True function = self.scope[name] if not isinstance(function, Function): self.tokens.error("%s was already mentioned, but was not a function"%name) #check if return type matches if function.type_ != type_: self.tokens.error("return type of %s does not match the previously declared type; is %s, should be %s"%(name, type_, function.type_)) if function.size != size: self.tokens.error("size 
of return type of %s does not match the previously declared size; is %s, should be %s"%(name, size, function.size)) if function.signed != signed: self.tokens.error("signedness of return type of %s does not match the previously declared signedness; is %s, should be %s"%(name, signed, function.signed)) if function.const != const: self.tokens.error("constness of return type of %s does not match the previously declared constness; is %s, should be %s"%(name, const, function.const)) else: #first time this name is seen function = Function() function.allocator = self.allocator function.name = name function.type_ = type_ function.size = size function.signed = signed function.const = const function.return_address = self.allocator.new(2, function.name+" return address") if type_ != "void": if type_ in self.structs: declaration = self.scope[type_] else: if self.tokens.peek() == "[": self.tokens.error( "Functions cannot return arrays") #if functions are changed to allow returning arrays, then check here if it matches the forward declaration, if any. else: declaration = VariableDeclaration( self.allocator, None, function.name+" return value", type_, size, signed, const) function.return_value = declaration.instance().reference() self.tokens.expect("(") #arguments must be allocated the first time the function is mentioned. #because when other functions call this function, they need to use the allocated variables #so don't allocate argument variables any time except the first time #the next time you encounter this function, only check if the arguments are the correct type #Also, don't add the argument variables to the current scope unless you have an argument body, because the names can change. 
if not function_was_already_declared: function.arguments = [] function.argument_names = [] #Gets overwritten if the names are changed while self.tokens.peek() != ")": (arg_name, instance) = self.parse_argument() function.arguments.append(instance.reference()) function.argument_names.append(arg_name) if self.tokens.peek() == ",": self.tokens.expect(",") else: break else: #function was already declared #check if arg types match function.argument_names = [] #Gets overwritten if the names are changed for index, argumentVarRef in enumerate(function.arguments): if self.tokens.peek() != ")": #next section is ugly #a better way would be a function to compare 2 types for exact equality argumentInst = argumentVarRef.instance arg_type, arg_size, arg_signed, arg_const = self.parse_type_specifier() arg_name = self.tokens.get() #print "%s: type %s, size %s, signed %s, const %s"%(arg_name, arg_type, arg_size, arg_signed, arg_const) function.argument_names.append(arg_name) is_array = False if self.tokens.peek() == "[": self.tokens.expect("[") self.tokens.expect("]") is_array = True arg_type = arg_type + "[]" if arg_type != argumentInst.type_(): self.tokens.error("Function %s, argument %d, was previously declared to have type %s, but here, it is %s"%(name, index+1, argumentInst.type_(), arg_type)) if not is_array: if arg_size != argumentInst.size(): self.tokens.error("Function %s, argument %d, was previously declared to have size %s, but here, it is %s"%(name, index+1, argumentInst.size(), arg_size)) if arg_const != argumentInst.const(): self.tokens.error("Function %s, argument %d, was previously declared to have constness %s, but here, it is %s"%(name, index+1, argumentInst.const(), arg_const)) if arg_signed != argumentInst.signed(): self.tokens.error("Function %s, argument %d, was previously declared to have signedness %s, but here, it is %s"%(name, index+1, argumentInst.signed(), arg_signed)) else: if arg_size != argumentInst.element_size: self.tokens.error("Function %s, 
argument %d, was previously declared to have element size %s, but here, it is %s"%(name, index+1, argumentInst.element_size, arg_size)) if arg_signed != argumentInst.element_signed: self.tokens.error("Function %s, argument %d, was previously declared to have element signedness %s, but here, it is %s"%(name, index+1, argumentInst.element_signed, arg_signed)) #array element constness? if self.tokens.peek() == ",": self.tokens.expect(",") else: self.tokens.error("Function %s was previously declared to have %d arguments, but here, only %d are present"%(name, len(function.arguments), index)) if self.tokens.peek() != ")": self.tokens.error("Function %s was previously declared to have %d arguments, but here, more are present"%(name, len(function.arguments) )) #print function.arguments self.tokens.expect(")") if self.tokens.peek() == ";": self.tokens.expect(";") else: self.enter_scope() self.function = function #body attached - add the argument variables to the new scope for (arg_name, argumentVarRef) in zip(function.argument_names, function.arguments): self.add_to_scope(arg_name, argumentVarRef.instance) if function.statement is not None: self.tokens.error("A function body was already defined for %s, can't use another"%name) function.statement = self.parse_block() if type_ != "void" and not hasattr(function, "return_statement"): self.tokens.error("Function must have a return statement") self.function = None self.leave_scope() #now we are done parsing the function, restore the previous scope if not function_was_already_declared: self.add_to_scope(function.name,function) #main thread is last function self.main = function if function_was_already_declared: return None #The function object is returned upon the function's first mention, so here, return something that will not generate any code return function def parse_break(self): break_ = Break() break_.loop = self.loop self.tokens.expect("break") self.tokens.expect(";") return break_ def parse_continue(self): continue_ = 
Continue() continue_.loop = self.loop self.tokens.expect("continue") self.tokens.expect(";") return continue_ def parse_return(self): return_ = Return() return_.function = self.function return_.allocator = self.allocator self.function.return_statement = return_ self.tokens.expect("return") if hasattr(self.function, "return_value"): expression = self.parse_expression() if self.function.type_ == "int" and expression.type_() == "float": expression = FloatToInt(expression) elif self.function.type_ == "float" and expression.type_() == "int": expression = IntToFloat(expression) elif self.function.type_ != expression.type_(): self.tokens.error( "type mismatch in return statement expected: %s actual: %s"%( self.function.type_, expression.type_())) return_.expression = expression self.tokens.expect(";") return return_ def parse_assert(self): assert_ = Assert() assert_.allocator = self.allocator self.tokens.expect("assert") self.tokens.expect("(") assert_.expression = self.parse_expression() self.tokens.expect(")") self.tokens.expect(";") assert_.line = self.tokens.lineno assert_.filename = self.tokens.filename return assert_ def parse_report(self): report_ = Report() report_.allocator = self.allocator self.tokens.expect("report") self.tokens.expect("(") report_.expression = self.parse_expression() self.tokens.expect(")") self.tokens.expect(";") report_.line = self.tokens.lineno report_.filename = self.tokens.filename return report_ def parse_wait_clocks(self): wait_clocks = WaitClocks() wait_clocks.allocator = self.allocator self.tokens.expect("wait_clocks") self.tokens.expect("(") wait_clocks.expression = self.parse_expression() self.tokens.expect(")") self.tokens.expect(";") wait_clocks.line = self.tokens.lineno return wait_clocks def parse_statement(self): if self.tokens.peek() in numeric_types + self.structs + storage_specifiers: return self.parse_compound_declaration() elif self.tokens.peek() == "struct": return self.parse_struct_declaration() elif self.tokens.peek() 
== "if": return self.parse_if() elif self.tokens.peek() == "while": return self.parse_while() elif self.tokens.peek() == "for": return self.parse_for() elif self.tokens.peek() == "return": return self.parse_return() elif self.tokens.peek() == "break": return self.parse_break() elif self.tokens.peek() == "continue": return self.parse_continue() elif self.tokens.peek() == "{": return self.parse_block() elif self.tokens.peek() == "assert": return self.parse_assert() elif self.tokens.peek() == "report": return self.parse_report() elif self.tokens.peek() == "switch": return self.parse_switch() elif self.tokens.peek() == "case": return self.parse_case() elif self.tokens.peek() == "default": return self.parse_default() elif self.tokens.peek() == "wait_clocks": return self.parse_wait_clocks() elif self.tokens.peek() == "goto": return self.parse_goto(); elif self.tokens.peek(1) == ":": return self.parse_labeled_statement() else: expression = self.parse_discard() self.tokens.expect(";") return expression def parse_discard(self): return DiscardExpression(self.parse_expression(), self.allocator) def parse_labeled_statement(self): name = self.tokens.get() self.tokens.expect(":") label = Label(name, self.parse_statement() ) if name in self.function.labels_in_scope: self.tokens.error( "label %s was already declared in this function"%name) self.function.labels_in_scope[name] = label return label def parse_goto(self): self.tokens.expect("goto") name = self.tokens.get() self.tokens.expect(";") return Goto(name, self.function, self.tokens.filename, self.tokens.lineno) def parse_assignment(self): assignment_operators = [ "=", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=" ] lvalue = self.parse_ternary_expression() if self.tokens.peek() in assignment_operators: if lvalue.const(): self.tokens.error( "left hand operand of assignment is not modifiable") operator = self.tokens.get() if operator == "=": expression = self.parse_ternary_expression() else: expression = 
self.parse_ternary_expression() left = lvalue left, expression = self.coerce_types(left, expression) expression = Binary(operator[:-1], left, expression) if expression.type_() != lvalue.type_(): if expression.type_() == "int" and lvalue.type_() == "float": expression = IntToFloat(expression) elif expression.type_() == "float" and lvalue.type_() == "int": expression = FloatToInt(expression) else: self.tokens.error( "type mismatch in assignment expected: %s actual: %s"%( lvalue.type_(), expression.type_())) return Assignment(lvalue, expression, self.allocator) else: return lvalue def parse_if(self): if_ = If() if_.allocator = self.allocator self.tokens.expect("if") self.tokens.expect("(") if_.expression = self.parse_expression() if if_.expression.type_() not in ["unsigned", "int", "short", "long", "char"]: self.tokens.error( "if statement conditional must be an integer like expression") self.tokens.expect(")") if_.true_statement = self.parse_statement() if self.tokens.peek() == "else": self.tokens.expect("else") if_.false_statement = self.parse_statement() else: if_.false_statement = None return if_ def parse_switch(self): switch = Switch() switch.cases = {} self.tokens.expect("switch") self.tokens.expect("(") expression = self.parse_expression() if expression.type_() not in ["unsigned", "int", "short", "long", "char"]: self.tokens.error( "switch statement expression must be an integer like expression") self.tokens.expect(")") stored_loop = self.loop self.loop = switch statement = self.parse_statement() self.loop = stored_loop switch.expression = expression switch.allocator = self.allocator switch.statement = statement return switch def parse_case(self): self.tokens.expect("case") expression = self.parse_expression() if expression.type_() not in ["int"]: self.tokens.error( "case expression must be an integer like expression") self.tokens.expect(":") try: expression = expression.value() case = Case() self.loop.cases[expression] = case except NotConstant: 
self.tokens.error("case expression must be constant") except AttributeError: self.tokens.error( "case statements may only be use inside a switch statment") return case def parse_default(self): self.tokens.expect("default") self.tokens.expect(":") default = Default() if not hasattr(self.loop, "cases"): self.tokens.error( "default statements may only be used inside a switch statment") if hasattr(self.loop, "default"): self.tokens.error( "A switch statement may only have one default statement") self.loop.default=default return default def parse_while(self): loop = Loop() self.tokens.expect("while") self.tokens.expect("(") expression = self.parse_expression() self.tokens.expect(")") stored_loop = self.loop self.loop = loop statement = self.parse_statement() self.loop = stored_loop if_ = If() loop.statement = if_ break_ = Break() break_.loop = loop if_.allocator = self.allocator if expression.type_() not in ["int"]: self.tokens.error( "while statement conditional must be an integer like expression") if_.expression = expression if_.false_statement = break_ if_.true_statement = statement return loop def parse_for(self): for_ = For() for_.allocator = self.allocator self.tokens.expect("for") self.tokens.expect("(") if self.tokens.peek() != ";": for_.statement1 = self.parse_discard() self.tokens.expect(";") if self.tokens.peek() != ";": for_.expression = self.parse_expression() if for_.expression.type_() not in [ "unsigned", "int", "short", "long", "char"]: self.tokens.error( "For statement conditional must be an integer like expression") self.tokens.expect(";") if self.tokens.peek() != ")": for_.statement2 = self.parse_discard() self.tokens.expect(")") stored_loop = self.loop self.loop = for_ for_.statement3 = self.parse_statement() self.loop = stored_loop return for_ def parse_block(self): block = Block() self.enter_scope() self.tokens.expect("{") block.statements = [] while self.tokens.peek() != "}": block.statements.append(self.parse_statement()) self.tokens.expect("}") 
self.leave_scope() #now we are done parsing the block, restore the previous scope return block def parse_struct_body(self): self.tokens.expect("{") members = {} while self.tokens.peek() != "}": type_, size, signed, const = self.parse_type_specifier() name = self.tokens.get() members[name] = self.parse_declaration( type_, size, signed, const, name) self.tokens.expect(";") self.tokens.expect("}") return members def parse_typedef_struct(self): self.tokens.expect("typedef") self.tokens.expect("struct") declaration = StructDeclaration(self.parse_struct_body()) name = self.tokens.get() self.tokens.expect(";") self.add_to_scope(name,declaration) self.structs.append(name) def parse_define_struct(self): self.tokens.expect("struct") name = self.tokens.get() declaration = StructDeclaration(self.parse_struct_body()) self.tokens.expect(";") self.add_to_scope(name,declaration) def parse_struct_declaration(self): self.tokens.expect("struct") struct_name = self.tokens.get() name = self.tokens.get() self.tokens.expect(";") instance = self.scope[struct_name].instance() self.add_to_scope(name,instance) return instance def parse_global_declaration(self, type_, size, signed, const, name): instances = [] while True: instance = self.parse_declaration( type_, size, signed, const, name).instance() self.add_to_scope(name,instance) instances.append(instance) if self.tokens.peek() == ",": self.tokens.expect(",") else: break name = self.tokens.get() self.tokens.expect(";") return CompoundDeclaration(instances) def parse_compound_declaration(self): type_, size, signed, const = self.parse_type_specifier() instances = [] while True: name = self.tokens.get() instance = self.parse_declaration( type_, size, signed, const, name).instance() self.add_to_scope(name,instance) instances.append(instance) if self.tokens.peek() == ",": self.tokens.expect(",") else: break name = None self.tokens.expect(";") return CompoundDeclaration(instances) def parse_declaration(self, type_, size, signed, const, name): 
#struct declaration if type_ in self.structs: declaration = self.scope[type_] elif type_ in ["int", "float"]: #array declaration if self.tokens.peek() == "[": array_size = None self.tokens.expect("[") if self.tokens.peek() != "]": size_expression = self.parse_ternary_expression() if size_expression.type_() != "int": self.tokens.error("Array size must be an integer like expression") try: array_size = size_expression.value() except NotConstant: self.tokens.error("Array size must be constant") self.tokens.expect("]") initializer = None if self.tokens.peek() == "=": self.tokens.expect("=") initializer = self.tokens.get() initializer = [ord(i) for i in initializer.strip('"').decode("string_escape")] + [0] array_size = len(initializer) if array_size is None: self.tokens.error( "array size must be specified if not initialized") array_type=type_+"[]" initialize_memory = self.initialize_memory declaration = ArrayDeclaration( self.allocator, array_size, array_type, type_, size, signed, initializer, self.initialize_memory) #simple variable declaration else: if self.tokens.peek() == "=": self.tokens.expect("=") initializer = self.parse_ternary_expression() else: initializer = Constant(0, type_, size, signed) if type_ != initializer.type_(): if type_ == "int" and initializer.type_() == "float": initializer = FloatToInt(initializer) elif type_ == "float" and initializer.type_() == "int": initializer = IntToFloat(initializer) else: self.tokens.error( "type mismatch in intializer expected: %s actual: %s"%( type_, intitializer.type_())) declaration = VariableDeclaration( self.allocator, initializer, name, type_, size, signed, const ) return declaration def parse_expression(self): expression = self.parse_assignment() return expression def parse_ternary_expression(self): expression = constant_fold(self.parse_or_expression()) while self.tokens.peek() in ["?"]: self.tokens.expect("?") true_expression = constant_fold(self.parse_or_expression()) self.tokens.expect(":") false_expression = 
constant_fold(self.parse_or_expression()) expression = OR(AND(expression, true_expression), false_expression) return expression def parse_or_expression(self): expression = self.parse_and_expression() while self.tokens.peek() in ["||"]: self.tokens.expect("||") expression = OR(expression, self.parse_and_expression()) return expression def parse_and_expression(self): expression = self.parse_binary_expression(["|"]) while self.tokens.peek() in ["&&"]: self.tokens.expect("&&") expression = AND(expression, self.parse_binary_expression(["|"])) return expression def substitute_function(self, binary_expression): """ For some operations are more easily implemented in software. This function substitutes a call to the builtin library function. """ functions = { "False,int,int,4,/" : "long_unsigned_divide_xxxx", "True,int,int,4,/" : "long_divide_xxxx", "False,int,int,2,/" : "unsigned_divide_xxxx", "True,int,int,2,/" : "divide_xxxx", "False,int,int,4,%" : "long_unsigned_modulo_xxxx", "True,int,int,4,%" : "long_modulo_xxxx", "False,int,int,2,%" : "unsigned_modulo_xxxx", "True,int,int,2,%" : "modulo_xxxx", "True,float,float,4,==" : "float_equal_xxxx", "True,float,float,4,!=" : "float_ne_xxxx", "True,float,float,4,<" : "float_lt_xxxx", "True,float,float,4,>" : "float_gt_xxxx", "True,float,float,4,<=" : "float_le_xxxx", "True,float,float,4,>=" : "float_ge_xxxx", } #select a function that matches the template. signature = ",".join([ str(binary_expression.signed()), binary_expression.left.type_(), binary_expression.right.type_(), str(binary_expression.size()), binary_expression.operator]) #Some things can't be implemented in verilog, substitute them with a function if signature in functions: function = self.scope[functions[signature]] function_call = FunctionCall(function) function_call.arguments = [binary_expression.left, binary_expression.right] return function_call else: return binary_expression def coerce_types(self, left, right): """ Convert numeric types in expressions. 
""" if left.type_() != right.type_(): if left.type_() == "float" and right.type_() == "int": return left, IntToFloat(right) elif left.type_() == "int" and right.type_() == "float": return IntToFloat(left), right else: self.tokens.error("Incompatible types : %s %s"%( left.type_(), right.type_())) return left, right def parse_binary_expression(self, operators): operator_precedence = { "|": ["^"], "^": ["&"], "&": ["==", "!="], "==": ["<", ">", "<=", ">="], "<": ["<<", ">>"], "<<": ["+", "-"], "+": ["*", "/", "%"], } if operators[0] not in operator_precedence: left = self.parse_unary_expression() while self.tokens.peek() in operators: operator = self.tokens.get() right = self.parse_unary_expression() left, right = self.coerce_types(left, right) left = Binary(operator, left, right) left = self.substitute_function(left) return left else: next_operators = operator_precedence[operators[0]] left = self.parse_binary_expression(next_operators) while self.tokens.peek() in operators: operator = self.tokens.get() right = self.parse_binary_expression(next_operators) left, right = self.coerce_types(left, right) left = Binary(operator, left, right) left = self.substitute_function(left) return left def parse_unary_expression(self): if self.tokens.peek() == "!": operator = self.tokens.get() expression = self.parse_postfix_expression() if expression.type_() not in ["int"]: self.tokens.error( "! 
is only valid for integer like expressions") return Binary("==", expression, Constant(0)) elif self.tokens.peek() == "-": operator = self.tokens.get() expression = self.parse_postfix_expression() return Binary("-", Constant(0, expression.type_(), expression.size(), expression.signed()), expression) elif self.tokens.peek() == "~": operator = self.tokens.get() expression = self.parse_postfix_expression() if expression.type_() not in ["int"]: self.tokens.error( "~ is only valid for integer like expressions") return Unary("~", expression) elif self.tokens.peek() == "sizeof": operator = self.tokens.get() expression = self.parse_unary_expression() return SizeOf(expression) else: return self.parse_postfix_expression() def parse_postfix_expression(self): expression = self.parse_paren_expression() while self.tokens.peek() in ["++", "--"]: operator = self.tokens.get() expression = PostIncrement( operator[:-1], expression, self.allocator ) return expression def parse_paren_expression(self): if self.tokens.peek() == "(": self.tokens.expect("(") expression = self.parse_expression() self.tokens.expect(")") else: expression = self.parse_number_or_variable() return expression def parse_number_or_variable(self): if self.tokens.peek()[0].isalpha(): name = self.tokens.get() if self.tokens.peek() == "(": return self.parse_function_call(name) else: return self.parse_variable(name) else: return self.parse_number() def parse_file_read(self): self.tokens.expect("(") file_name = self.tokens.get() file_name = file_name.strip('"').decode("string_escape") self.tokens.expect(")") return FileRead(file_name) def parse_file_write(self): self.tokens.expect("(") expression = self.parse_expression() self.tokens.expect(",") file_name = self.tokens.get() file_name = file_name.strip('"').decode("string_escape") self.tokens.expect(")") return FileWrite(file_name, expression) def parse_input(self, name): input_name = name.replace("input_", "") self.tokens.expect("(") type_ = "int" if self.tokens.peek() 
!= ")": type_ = self.tokens.get() type_ = type_.strip('"').decode("string_escape") if type_ not in numeric_types: self.tokens.error("%s is not a numeric type"%type_) self.tokens.expect(")") return Input(input_name, type_) def parse_ready(self, name): input_name = name.replace("ready_", "") self.tokens.expect("(") self.tokens.expect(")") return Ready(input_name) def parse_output(self, name): output_name = name.replace("output_", "") self.tokens.expect("(") expression = self.parse_expression() self.tokens.expect(")") return Output(output_name, expression) def parse_function_call(self, name): if name.startswith("input_"): return self.parse_input(name) if name.startswith("ready_"): return self.parse_ready(name) if name.startswith("output_"): return self.parse_output(name) if name == "file_read": return self.parse_file_read() if name == "file_write": return self.parse_file_write() if name not in self.scope: self.tokens.error("Unknown function: %s"%name) function = self.scope[name] function_call = FunctionCall(function) function_call.arguments = [] self.tokens.expect("(") while self.tokens.peek() != ")": function_call.arguments.append(self.parse_expression()) if self.tokens.peek() == ",": self.tokens.expect(",") else: break self.tokens.expect(")") required_arguments = len(function_call.function.arguments) actual_arguments = len(function_call.arguments) if required_arguments != actual_arguments: self.tokens.error("Function %s takes %s arguments, %s given."%( name, len(function_call.function.arguments), len(function_call.arguments))) required_arguments = function_call.function.arguments actual_arguments = function_call.arguments corrected_arguments = [] for required, actual in zip(required_arguments, actual_arguments): if not compatible(required, actual): if actual.type_() == "int" and required.type_() == "float": actual = IntToFloat(actual) elif actual.type_() == "float" and required.type_() == "int": actual = FloatToInt(actual) else: self.tokens.error( "type mismatch in 
assignment expected: %s, actual: %s"%( required.type_(), actual.type_())) corrected_arguments.append(actual) function_call.arguments = corrected_arguments return function_call def parse_number(self): token = self.tokens.get() type_ = "int" size = 2 signed = True if token.startswith("'"): try: token = eval(token) value = ord(token) except SyntaxError: self.tokens.error("%s is not a character literal"%token) elif token.startswith('"'): try: initializer = [ord(i) for i in token.strip('"').decode("string_escape")] + [0] size = len(initializer) initialize_memory = self.initialize_memory declaration = ArrayDeclaration( self.allocator, size, "int[]", "int", 2, False, initializer, self.initialize_memory) return ConstArray(declaration.instance()) except SyntaxError: self.tokens.error("%s is not a character literal"%token) elif "." in token: #float literal try: type_ = "float" signed = True size = 4 token = token.upper().replace("F", "") token = token.upper().replace("L", "") value = float(eval(token)) try: byte_value = struct.pack(">f", value) except OverflowError: self.tokens.error("value too large") except SyntaxError: self.tokens.error("%s is not a floating point literal"%token) else: #integer literal try: if "U" in token.upper(): signed = False if "L" in token.upper(): size = 4 token = token.upper().replace("U", "") value = int(eval(token)) if signed: if value > 2**((size * 8)-1) - 1: self.tokens.error("value too large") if value < -(2**((size * 8)-1)): self.tokens.error("value too small") else: if value > 2**(size * 8) - 1: self.tokens.error("value too large") if value < 0: self.tokens.error("value too small") except SyntaxError: self.tokens.error("%s is not an integer literal"%token) return Constant(value, type_, size, signed) def parse_variable(self, name): if name not in self.scope: self.tokens.error("Unknown variable: %s"%name) instance = self.scope[name] return self.parse_variable_array_struct(instance) def parse_variable_array_struct(self, instance): if 
instance.type_() in numeric_types: if not hasattr(instance, "reference"): self.tokens.error( "Not an expression") return Variable(instance) elif instance.type_().endswith("[]"): if self.tokens.peek() == "[": self.tokens.expect("[") index_expression = self.parse_expression() self.tokens.expect("]") if index_expression.type_() not in ["int"]: self.tokens.error( "Array indices must be an integer like expression") return ArrayIndex(instance, index_expression) else: return Array(instance) elif instance.type_().startswith("struct"): if self.tokens.peek() == ".": self.tokens.expect(".") member = self.tokens.get() instance = instance.members[member] return self.parse_variable_array_struct(instance) else: return Struct(instance)
class Parser:

    """Turn the C input file into a tree of expressions and statements."""

    def __init__(self, input_file, reuse):
        self.scope = {}        # name -> declaration/instance currently visible
        self.function = None   # function being parsed (for return statements)
        self.loop = None       # innermost loop/switch (for break/continue/case)
        self.tokens = Tokens(input_file)
        self.allocator = Allocator(reuse)
        self.structs = []      # names introduced by typedef struct

    def parse_process(self):
        """Parse the whole input into a Process (top-level container)."""
        process = Process()
        process.allocator = self.allocator
        process.inputs = []
        process.outputs = []
        process.functions = []
        while not self.tokens.end():
            if self.tokens.peek() == "struct":
                self.parse_define_struct()
            elif self.tokens.peek() == "typedef":
                self.parse_typedef_struct()
            else:
                process.functions.append(self.parse_function())
        # the last function parsed is the main thread
        process.main = self.main
        return process

    def parse_function(self):
        """Parse a function definition (or fall back to a global declaration).

        A declaration not followed by "(" is a global variable declaration
        and is delegated to parse_global_declaration.
        """
        function = Function()
        function.allocator = self.allocator
        stored_scope = self.scope
        type_ = self.tokens.get()
        name = self.tokens.get()

        # check if it is a global declaration
        if self.tokens.peek() != "(":
            if type_ not in ["int", "short", "long", "char"] + self.structs:
                self.tokens.error("unknown type")
            return self.parse_global_declaration(type_, name)

        # otherwise continue parsing a function
        self.tokens.expect("(")
        function.name = name
        function.type_ = type_
        function.return_address = self.allocator.new(
            function.name+" return address")
        if type_ not in ["int", "short", "long", "char", "void"]:
            self.tokens.error("unknown type")
        if type_ != "void":
            function.return_value = self.allocator.new(
                function.name+" return value")
        function.arguments = []
        while self.tokens.peek() != ")":
            type_ = self.tokens.get()
            if type_ not in ["int", "short", "long", "char"]:
                self.tokens.error("unknown type")
            argument = self.tokens.get()
            if self.tokens.peek() == "[":
                self.tokens.expect("[")
                self.tokens.expect("]")
                type_+="[]"
            function.arguments.append(Argument(argument, type_, self))
            if self.tokens.peek() == ",":
                self.tokens.expect(",")
            else:
                break
        self.tokens.expect(")")
        self.function = function
        function.statement = self.parse_statement()
        # BUGFIX: the argument loop above reuses the local "type_", so by
        # this point it holds the LAST ARGUMENT's type, not the return
        # type.  Test the stored return type instead.
        if function.type_ != "void" and not hasattr(
                function, "return_statement"):
            self.tokens.error("Function must have a return statement")
        self.function = None
        self.scope = stored_scope
        self.scope[function.name] = function
        # main thread is last function
        self.main = function
        return function

    def parse_break(self):
        """Parse "break;" and bind it to the innermost loop."""
        break_ = Break()
        break_.loop = self.loop
        self.tokens.expect("break")
        self.tokens.expect(";")
        return break_

    def parse_continue(self):
        """Parse "continue;" and bind it to the innermost loop."""
        continue_ = Continue()
        continue_.loop = self.loop
        self.tokens.expect("continue")
        self.tokens.expect(";")
        return continue_

    def parse_return(self):
        """Parse "return [expr];"; records itself on the current function."""
        return_ = Return()
        return_.function = self.function
        self.function.return_statement = return_
        self.tokens.expect("return")
        # void functions have no return_value attribute and take no expression
        if hasattr(self.function, "return_value"):
            return_.expression = self.parse_expression()
        self.tokens.expect(";")
        return return_

    def parse_assert(self):
        """Parse "assert(expr);" (simulation-time check)."""
        assert_ = Assert()
        assert_.allocator = self.allocator
        self.tokens.expect("assert")
        self.tokens.expect("(")
        assert_.expression = self.parse_expression()
        self.tokens.expect(")")
        self.tokens.expect(";")
        assert_.line = self.tokens.lineno
        assert_.filename = self.tokens.filename
        return assert_

    def parse_report(self):
        """Parse "report(expr);" (simulation-time print)."""
        report_ = Report()
        report_.allocator = self.allocator
        self.tokens.expect("report")
        self.tokens.expect("(")
        report_.expression = self.parse_expression()
        self.tokens.expect(")")
        self.tokens.expect(";")
        report_.line = self.tokens.lineno
        report_.filename = self.tokens.filename
        return report_

    def parse_wait_clocks(self):
        """Parse "wait_clocks(expr);" (stall for n clock cycles)."""
        wait_clocks = WaitClocks()
        wait_clocks.allocator = self.allocator
        self.tokens.expect("wait_clocks")
        self.tokens.expect("(")
        wait_clocks.expression = self.parse_expression()
        self.tokens.expect(")")
        self.tokens.expect(";")
        wait_clocks.line = self.tokens.lineno
        return wait_clocks

    def parse_statement(self):
        """Dispatch on the next token to the appropriate statement parser."""
        if self.tokens.peek() in [
                "int", "short", "long", "char"] + self.structs:
            return self.parse_compound_declaration()
        elif self.tokens.peek() == "struct":
            return self.parse_struct_declaration()
        elif self.tokens.peek() == "if":
            return self.parse_if()
        elif self.tokens.peek() == "while":
            return self.parse_while()
        elif self.tokens.peek() == "for":
            return self.parse_for()
        elif self.tokens.peek() == "return":
            return self.parse_return()
        elif self.tokens.peek() == "break":
            return self.parse_break()
        elif self.tokens.peek() == "continue":
            return self.parse_continue()
        elif self.tokens.peek() == "{":
            return self.parse_block()
        elif self.tokens.peek() == "assert":
            return self.parse_assert()
        elif self.tokens.peek() == "report":
            return self.parse_report()
        elif self.tokens.peek() == "switch":
            return self.parse_switch()
        elif self.tokens.peek() == "case":
            return self.parse_case()
        elif self.tokens.peek() == "default":
            return self.parse_default()
        elif self.tokens.peek() == "wait_clocks":
            return self.parse_wait_clocks()
        else:
            # expression statement: evaluate and discard the result
            expression = self.parse_discard()
            self.tokens.expect(";")
            return expression

    def parse_discard(self):
        """Wrap an expression so its value is evaluated then discarded."""
        return DiscardExpression(self.parse_expression(), self.allocator)

    def parse_assignment(self):
        """Parse an (optionally compound) assignment, or pass through.

        "a op= b" is desugared to "a = a op b"; "a++"/"a--" become
        "a = a +/- 1".  The left side must carry a declaration (i.e. be
        an lvalue) and both sides must agree on type.
        """
        assignment_operators = [
            "=", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "<<=", ">>=",
            "++", "--"
        ]
        lvalue = self.parse_ternary_expression()
        if self.tokens.peek() in assignment_operators:
            if not hasattr(lvalue, "declaration"):
                self.tokens.error(
                    "left hand operand of assignment is not modifiable"
                )
            operator = self.tokens.get()
            if operator == "=":
                expression = self.parse_ternary_expression()
            elif operator in ["++", "--"]:
                expression = Binary(
                    operator[:-1],
                    lvalue,
                    Constant(1),
                    self.allocator
                )
            else:
                # compound assignment: strip trailing "=" to get the op
                expression = Binary(
                    operator[:-1],
                    lvalue,
                    self.parse_ternary_expression(),
                    self.allocator
                )
            if lvalue.type_ != expression.type_:
                self.tokens.error(
                    "type mismatch in assignment"
                )
            return Assignment(lvalue, expression, self.allocator)
        else:
            return lvalue

    def parse_if(self):
        """Parse "if (expr) stmt [else stmt]"."""
        if_ = If()
        if_.allocator = self.allocator
        self.tokens.expect("if")
        self.tokens.expect("(")
        if_.expression = self.parse_expression()
        if if_.expression.type_ not in ["int", "short", "long", "char"]:
            self.tokens.error(
                "if statement conditional must be an integer like expression"
            )
        self.tokens.expect(")")
        if_.true_statement = self.parse_statement()
        if self.tokens.peek() == "else":
            self.tokens.expect("else")
            if_.false_statement = self.parse_statement()
        else:
            if_.false_statement = None
        return if_

    def parse_switch(self):
        """Parse "switch (expr) stmt"; cases register themselves via self.loop."""
        switch = Switch()
        switch.cases = {}
        self.tokens.expect("switch")
        self.tokens.expect("(")
        expression = self.parse_expression()
        if expression.type_ not in ["int", "short", "long", "char"]:
            self.tokens.error(
                "switch statement expression must be an integer like expression"
            )
        self.tokens.expect(")")
        stored_loop = self.loop
        self.loop = switch
        statement = self.parse_statement()
        self.loop = stored_loop
        switch.expression = expression
        switch.allocator = self.allocator
        switch.statement = statement
        return switch

    def parse_case(self):
        """Parse "case const-expr:" inside the enclosing switch."""
        self.tokens.expect("case")
        expression = self.parse_expression()
        if expression.type_ not in ["int", "short", "long", "char"]:
            self.tokens.error(
                "case expression must be an integer like expression"
            )
        self.tokens.expect(":")
        try:
            expression = value(expression)
            case = Case()
            self.loop.cases[expression] = case
        except NotConstant:
            self.tokens.error("case expression must be constant")
        except AttributeError:
            # self.loop has no "cases" attribute -> we are not in a switch
            self.tokens.error(
                "case statements may only be used inside a switch statement"
            )
        return case

    def parse_default(self):
        """Parse "default:"; a switch may have at most one."""
        self.tokens.expect("default")
        self.tokens.expect(":")
        default = Default()
        if not hasattr(self.loop, "cases"):
            self.tokens.error(
                "default statements may only be used inside a switch statement"
            )
        if hasattr(self.loop, "default"):
            self.tokens.error(
                "A switch statement may only have one default statement"
            )
        self.loop.default=default
        return default

    def parse_while(self):
        """Parse "while (expr) stmt".

        Desugared to an infinite loop containing
        "if (expr) stmt else break;".
        """
        loop = Loop()
        self.tokens.expect("while")
        self.tokens.expect("(")
        expression = self.parse_expression()
        self.tokens.expect(")")
        stored_loop = self.loop
        self.loop = loop
        statement = self.parse_statement()
        self.loop = stored_loop
        if_ = If()
        loop.statement = if_
        break_ = Break()
        break_.loop = loop
        if_.allocator = self.allocator
        if expression.type_ not in ["int", "short", "long", "char"]:
            # BUGFIX: message previously said "if statement conditional"
            self.tokens.error(
                "while statement conditional must be an integer like expression"
            )
        if_.expression = expression
        if_.false_statement = break_
        if_.true_statement = statement
        return loop

    def parse_for(self):
        """Parse "for (init; cond; step) stmt"; each clause is optional."""
        for_ = For()
        for_.allocator = self.allocator
        self.tokens.expect("for")
        self.tokens.expect("(")
        if self.tokens.peek() != ";":
            for_.statement1 = self.parse_discard()
        self.tokens.expect(";")
        if self.tokens.peek() != ";":
            for_.expression = self.parse_expression()
            if for_.expression.type_ not in ["int", "short", "long", "char"]:
                self.tokens.error(
                    "for statement conditional must be an integer like expression"
                )
        self.tokens.expect(";")
        if self.tokens.peek() != ")":
            for_.statement2 = self.parse_discard()
        self.tokens.expect(")")
        stored_loop = self.loop
        self.loop = for_
        for_.statement3 = self.parse_statement()
        self.loop = stored_loop
        return for_

    def parse_block(self):
        """Parse "{ stmt* }", restoring the outer scope afterwards."""
        block = Block()
        stored_scope = self.scope
        self.tokens.expect("{")
        block.statements = []
        while self.tokens.peek() != "}":
            block.statements.append(self.parse_statement())
        self.tokens.expect("}")
        self.scope = stored_scope
        return block

    def parse_struct_body(self):
        """Parse "{ type name; ... }" and return the member dict."""
        self.tokens.expect("{")
        members = {}
        while self.tokens.peek() != "}":
            type_ = self.tokens.get()
            name = self.tokens.get()
            members[name] = self.parse_declaration(type_, name)
            self.tokens.expect(";")
        self.tokens.expect("}")
        return members

    def parse_typedef_struct(self):
        """Parse "typedef struct {...} name;", registering the new type."""
        self.tokens.expect("typedef")
        self.tokens.expect("struct")
        declaration = StructDeclaration(self.parse_struct_body())
        name = self.tokens.get()
        self.tokens.expect(";")
        self.scope[name] = declaration
        self.structs.append(name)

    def parse_define_struct(self):
        """Parse "struct name {...};" (definition, not typedef)."""
        self.tokens.expect("struct")
        name = self.tokens.get()
        declaration = StructDeclaration(self.parse_struct_body())
        self.tokens.expect(";")
        self.scope[name] = declaration

    def parse_struct_declaration(self):
        """Parse "struct type name;" creating an instance in scope."""
        self.tokens.expect("struct")
        struct_name = self.tokens.get()
        name = self.tokens.get()
        self.tokens.expect(";")
        instance = self.scope[struct_name].instance()
        self.scope[name] = instance
        return instance

    def parse_global_declaration(self, type_, name):
        """Parse "type a, b, c;" at file scope (first name already read)."""
        instances = []
        while True:
            instance = self.parse_declaration(type_, name).instance()
            self.scope[name] = instance
            instances.append(instance)
            if self.tokens.peek() == ",":
                self.tokens.expect(",")
            else:
                break
            name = self.tokens.get()
        self.tokens.expect(";")
        return CompoundDeclaration(instances)

    def parse_compound_declaration(self):
        """Parse "type a, b = 1, c[4];" inside a function body."""
        type_ = self.tokens.get()
        instances = []
        while True:
            name = self.tokens.get()
            instance = self.parse_declaration(type_, name).instance()
            self.scope[name] = instance
            instances.append(instance)
            if self.tokens.peek() == ",":
                self.tokens.expect(",")
            else:
                break
            name = None
        self.tokens.expect(";")
        return CompoundDeclaration(instances)

    def parse_declaration(self, type_, name):
        """Build the declaration object for one declarator.

        Handles struct instances, arrays ("type name[size]") and simple
        variables with an optional "= initializer".
        """
        # struct declaration
        if type_ in self.structs:
            declaration = self.scope[type_]
        elif type_ in ["int", "short", "long", "char"]:
            # array declaration
            if self.tokens.peek() == "[":
                self.tokens.expect("[")
                size = self.tokens.get()
                self.tokens.expect("]")
                type_+="[]"
                declaration = ArrayDeclaration(self.allocator, size, type_)
            # simple variable declaration
            else:
                if self.tokens.peek() == "=":
                    self.tokens.expect("=")
                    initializer = self.parse_ternary_expression()
                else:
                    initializer = Constant(0)
                declaration = VariableDeclaration(
                    self.allocator,
                    initializer,
                    name,
                    type_
                )
        else:
            # BUGFIX: previously fell through with "declaration" unbound,
            # raising NameError instead of a parse error
            self.tokens.error("unknown type")
        return declaration

    def parse_expression(self):
        """Top-level expression entry point (assignment has lowest precedence)."""
        expression = self.parse_assignment()
        return expression

    def parse_ternary_expression(self):
        """Parse "cond ? a : b", folded via OR(AND(cond, a), b)."""
        expression = constant_fold(self.parse_or_expression())
        while self.tokens.peek() in ["?"]:
            self.tokens.expect("?")
            true_expression = constant_fold(self.parse_or_expression())
            self.tokens.expect(":")
            false_expression = constant_fold(self.parse_or_expression())
            expression = OR(AND(expression, true_expression), false_expression)
        return expression

    def parse_or_expression(self):
        """Parse "a || b" (short-circuit logical or)."""
        expression = self.parse_and_expression()
        while self.tokens.peek() in ["||"]:
            self.tokens.expect("||")
            expression = OR(expression, self.parse_and_expression())
        return expression

    def parse_and_expression(self):
        """Parse "a && b" (short-circuit logical and)."""
        expression = self.parse_binary_expression(["|"])
        while self.tokens.peek() in ["&&"]:
            self.tokens.expect("&&")
            expression = AND(expression, self.parse_binary_expression(["|"]))
        return expression

    def parse_binary_expression(self, operators):
        """Precedence-climbing parse of the binary operators.

        Each operator maps to the next-higher precedence level; the
        recursion bottoms out at unary expressions.
        """
        operator_precedence = {
            "|": ["^"],
            "^": ["&"],
            "&": ["==", "!="],
            "==": ["<", ">", "<=", ">="],
            "<": ["<<", ">>"],
            "<<": ["+", "-"],
            "+": ["*", "/", "%"],
        }
        if operators[0] not in operator_precedence:
            # highest precedence level: operands are unary expressions
            expression = self.parse_unary_expression()
            while self.tokens.peek() in operators:
                expression = Binary(
                    self.tokens.get(),
                    expression,
                    self.parse_unary_expression(),
                    self.allocator
                )
            return expression
        else:
            next_operators = operator_precedence[operators[0]]
            expression = self.parse_binary_expression(next_operators)
            while self.tokens.peek() in operators:
                expression = Binary(
                    self.tokens.get(),
                    expression,
                    self.parse_binary_expression(next_operators),
                    self.allocator
                )
            return expression

    def parse_unary_expression(self):
        """Parse !x, -x and ~x; "!" and "-" are lowered to binary forms."""
        if self.tokens.peek() == "!":
            operator = self.tokens.get()  # consume "!"
            expression = self.parse_paren_expression()
            return Binary("==", expression, Constant(0), self.allocator)
        elif self.tokens.peek() == "-":
            operator = self.tokens.get()  # consume "-"
            expression = self.parse_paren_expression()
            return Binary("-", Constant(0), expression, self.allocator)
        elif self.tokens.peek() == "~":
            operator = self.tokens.get()  # consume "~"
            expression = self.parse_paren_expression()
            return Unary("~", expression)
        else:
            return self.parse_paren_expression()

    def parse_paren_expression(self):
        """Parse "(expr)" or fall through to a number/variable."""
        if self.tokens.peek() == "(":
            self.tokens.expect("(")
            expression = self.parse_expression()
            self.tokens.expect(")")
        else:
            expression = self.parse_number_or_variable()
        return expression

    def parse_number_or_variable(self):
        """Distinguish identifiers (variables/calls) from numeric literals."""
        if self.tokens.peek()[0].isalpha():
            name = self.tokens.get()
            if self.tokens.peek() == "(":
                return self.parse_function_call(name)
            else:
                return self.parse_variable(name)
        else:
            return self.parse_number()

    def parse_input(self, name):
        """Parse "input_<name>()" -> blocking read from an input port."""
        input_name = name.replace("input_", "")
        self.tokens.expect("(")
        self.tokens.expect(")")
        return Input(input_name)

    def parse_ready(self, name):
        """Parse "ready_<name>()" -> non-blocking input-ready test."""
        input_name = name.replace("ready_", "")
        self.tokens.expect("(")
        self.tokens.expect(")")
        return Ready(input_name)

    def parse_output(self, name):
        """Parse "output_<name>(expr)" -> write to an output port."""
        output_name = name.replace("output_", "")
        self.tokens.expect("(")
        expression = self.parse_expression()
        self.tokens.expect(")")
        return Output(output_name, expression)

    def parse_function_call(self, name):
        """Parse a call; input_/ready_/output_ prefixes are port builtins.

        Checks argument count and per-argument type against the function
        declaration found in scope.
        """
        if name.startswith("input_"):
            return self.parse_input(name)
        if name.startswith("ready_"):
            return self.parse_ready(name)
        if name.startswith("output_"):
            return self.parse_output(name)
        function_call = FunctionCall()
        function_call.arguments = []
        self.tokens.expect("(")
        while self.tokens.peek() != ")":
            function_call.arguments.append(self.parse_expression())
            if self.tokens.peek() == ",":
                self.tokens.expect(",")
            else:
                break
        self.tokens.expect(")")
        if name not in self.scope:
            self.tokens.error("Unknown function: %s"%name)
        function_call.function = self.scope[name]
        function_call.type_ = function_call.function.type_
        required_arguments = len(function_call.function.arguments)
        actual_arguments = len(function_call.arguments)
        if required_arguments != actual_arguments:
            self.tokens.error("Function %s takes %s arguments %s given."%(
                name,
                len(function_call.function.arguments),
                len(function_call.arguments)
            ))
        required_arguments = function_call.function.arguments
        actual_arguments = function_call.arguments
        for required, actual in zip(required_arguments, actual_arguments):
            if required.type_ != actual.type_:
                self.tokens.error("Type mismatch expected type : %s got: %s."%(
                    required.type_,
                    actual.type_
                ))
        return function_call

    def parse_number(self):
        """Parse a character or integer literal into a Constant."""
        token = self.tokens.get()
        if token.startswith("'"):
            # NOTE: eval of a source token — trusted compiler input assumed
            try:
                value = ord(eval(token))
            except SyntaxError:
                self.tokens.error("%s is not a character literal"%token)
        else:
            try:
                value = int(eval(token))
            except SyntaxError:
                self.tokens.error("%s is not an integer literal"%token)
        return Constant(value)

    def parse_variable(self, name):
        """Look up *name* in scope and parse any [index]/.member suffix."""
        if name not in self.scope:
            self.tokens.error("Unknown variable: %s"%name)
        instance = self.scope[name]
        return self.parse_variable_array_struct(instance)

    def parse_variable_array_struct(self, instance):
        """Build the expression node for a variable, array or struct access."""
        if instance.type_ in ["int", "short", "long", "char"]:
            return Variable(instance, self.allocator)
        elif instance.type_.endswith("[]"):
            if self.tokens.peek() == "[":
                self.tokens.expect("[")
                index_expression = self.parse_expression()
                self.tokens.expect("]")
                if index_expression.type_ not in [
                        "int", "short", "long", "char"]:
                    self.tokens.error(
                        "array indices must be an integer like expression"
                    )
                return ArrayIndex(instance, index_expression, self.allocator)
            else:
                # whole-array reference
                return Array(instance, self.allocator)
        elif instance.type_ == "struct":
            # NOTE(review): a member access is mandatory here — a bare
            # struct reference is not accepted by this variant; confirm
            # that is intended before changing it.
            self.tokens.expect(".")
            member = self.tokens.get()
            instance = instance.members[member]
            return self.parse_variable_array_struct(instance)