Beispiel #1
0
 def __init__(self, input_file, reuse):
     """Initialise per-instance state.

     Scope/struct bookkeeping starts empty; a token stream is opened over
     *input_file* and an allocator is created honouring *reuse*.
     """
     self.scope = {}
     self.function = None
     self.loop = None
     self.structs = []
     self.tokens = Tokens(input_file)
     self.allocator = Allocator(reuse)
Beispiel #2
0
    def retrieval_cosine(self, query):
        """Rank documents against *query* by cosine similarity.

        Returns a list of (doc_id, score) pairs, scores rounded to two
        decimals, sorted by score descending.
        """
        terms = Tokens().edit_query(query)
        # Term frequencies of the query, tf-weighted in place.
        weights = dict(collections.Counter(terms).items())
        for term in weights:
            weights[term] = tf(weights[term])

        # Accumulate partial dot products over each term's postings list.
        scores = {}
        for term in weights:
            for doc in self.L(term):
                contribution = tf(doc[1]) * tf(weights[term])
                scores[doc[0]] = scores.get(doc[0], 0) + contribution

        # Normalise by document norm times query norm.
        for doc_id in scores:
            doc_norm = self.inverted_index.get_norms(str(doc_id))
            scores[doc_id] = round(
                scores[doc_id] / (doc_norm * self.get_query_norms(weights)), 2)
        return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
Beispiel #3
0
    def set_index_dic(self, my_tweets_dic):
        """Build the index from self.my_tweets_dic and return it sorted by key.

        NOTE(review): the *my_tweets_dic* argument is ignored; the method
        reads self.my_tweets_dic instead -- confirm this is intentional.
        """
        for tweet_id, tweet in self.my_tweets_dic.items():
            tokenizer = Tokens()
            tokenizer.remove_stopwords(tweet)
            for term in tokenizer.reduced_tokens:
                self.replace_in_dic(term, tweet_id)

        return dict(sorted(self.dic.items()))
Beispiel #4
0
    def __init__(self, feature_for_separate_model=False):
        """Set up dictionaries, regexes and gazetteer lists for feature generation.

        Args:
            feature_for_separate_model: when True use PARTIAL_PIPELINE,
                otherwise STANDARD_PIPELINE.
        """
        super(FeatureGenerator, self).__init__()
        self.dictionary = enchant.Dict('en_US')
        self.token_generator = Tokens()     # Connection established!
        self.record = None
        self.tokens = []
        # list of list of features for every name; e.g. [[1,1,1,1],[...], ...]
        self.features = None

        # Regex setup (raw strings so the backslashes are not treated as
        # Python escape sequences).
        self.NUM_REGEX = re.compile(r'\d')
        self.CHAR_DIGIT_MIX_REGEX = re.compile(
            r'((^[a-zA-Z]+\d{4}$)|(^[a-zA-Z]+\d{2}$))|((^\d{4}[a-zA-Z]+$)|(^\d{2}[a-zA-Z]+$))',
            re.MULTILINE)
        # C.P.; C.-C.; O'Reilly
        self.NAME_ABBREV_REGEX = re.compile(
            r'([A-Z]\.-[A-Z]\.)|([A-Z]\.-[A-Z])|([A-Z]\.-)|(([A-Z]\.)+)|(O\'[A-Z][a-z]+)')
        self.PAGE_NO_REGEX = re.compile(r'\d+-\d+')

        def _load_list(path):
            # BUG FIX: the original used open(...).readlines() without ever
            # closing the files, leaking one handle per gazetteer.
            with open(path, 'r') as fh:
                return [line.strip() for line in fh]

        # Gazetteer setup
        self.DELIMITERS = [',', '.', ]
        self.LBRACKET = ['(', '[', '{', '<', ]
        self.RBRACKET = [')', ']', '}', '>', ]
        self.APOSTROPHES = ["'s", "'re", "'d", ]
        self.QUOTATIONS = ['"', "''", "``", ]
        # BUG FIX: 'Janurary' -> 'January' (the misspelling could never match
        # real month text).
        self.MONTHS = ['January', 'February', 'March', 'April', 'May', 'June',
                       'July', 'August', 'September', 'October', 'November',
                       'December']
        self.NAME_LIST = _load_list('data/name.lst')
        self.VENUE_LIST = _load_list('data/venue.lst')
        self.ORDINAL_LIST = _load_list('data/ordinal.lst')
        # self.CITY_LIST = _load_list('data/cities.lst')
        self.COUNTRY_LIST = _load_list('data/countries.lst')

        if feature_for_separate_model:
            self.pipeline = PARTIAL_PIPELINE
        else:
            self.pipeline = STANDARD_PIPELINE
Beispiel #5
0
	def remove_cattributes(self,bot,update,args):
		"""Telegram handler for /rmcattributes <name>.

		Deletes one cattribute (case-insensitive match) belonging to the
		calling user from the Attribute table and replies with the result.
		"""
		try:
			with closing(self.conn.cursor()) as cur:
				uid = update.message.from_user.id
				try:
					cattribute_to_remove = str.lower(args[0])
				except IndexError:
					# The command was sent without an argument.
					reply_message = "Can't remove an empty string! Please use the following format: /rmcattributes cattributename" 
					update.message.reply_text(reply_message,parse_mode='HTML')
				else:
					# Confirm the cattribute exists for this user before deleting.
					cur.execute("""SELECT * FROM Attribute 
									WHERE telegram_id = %s 
									AND LOWER(attribute_name) = LOWER(%s)
								""",(uid,cattribute_to_remove,))
					if cur.rowcount == 0:
						reply_message = "Can't find that cattribute! please use /listcattributes to show your cattributes"
						update.message.reply_text(reply_message,parse_mode='HTML')
					else:
						cur.execute("""DELETE FROM Attribute 
										WHERE telegram_id = %s
										AND LOWER (attribute_name) = LOWER(%s)
										""",(uid,cattribute_to_remove,))
						reply_message = "".join([str.lower(cattribute_to_remove), " has been sucessfully removed"])
						update.message.reply_text(reply_message,parse_mode='HTML')


		except Exception as e:
			# Best-effort error reporting: send the full traceback to the error channel.
			catcherror = traceback.format_exc()
			bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
Beispiel #6
0
def process_multiple(log, do_fetch=True, do_parse=True, do_merge=True):
    """Fetch, parse and merge invoice line-item data.

    Returns True on success (including the no-new-invoices case), False
    when the API extraction fails.
    """
    root = config["data-dir"]

    if do_fetch:
        api = API(Tokens(), log)
        util.delete_files(root + '/processing/invoices', '*.json')
        ok, count = api.fetch_invoice_details(hours_delta=30, tz_offset=7)
        if not ok:
            log.write(
                "ERROR api invoices extraction failed {:,} invoices saved to : {}"
                .format(count, '/processing/invoices'))
            return False
        if count == 0:
            # Nothing new in the refresh window; nothing left to do.
            log.write(
                "INFO api no invoices extracted (no new/updated invoices in refresh period)"
            )
            return True
        log.write(
            "INFO api invoices extraction succeeded {:,} invoices saved to : {}"
            .format(count, '/processing/invoices'))

    if do_parse:
        util.delete_files(root + '/processing/invoices', '*.csv')
        Parser(log).parse('invoices-line-items')

    if do_merge:
        Merger(log).merge_invoice_delta()

    return True
Beispiel #7
0
def process_single(log, do_fetch=True, do_parse=True):
    """Fetch and parse the default data endpoints.

    Returns False as soon as any fetch fails, True otherwise.
    """
    root = config["data-dir"]

    if do_fetch:
        api = API(Tokens(), log)
        util.delete_files(root + '/processing/default', '*.json')
        # Bail out on the first failed endpoint.
        for endpoint in ('items', 'branding-themes', 'contacts', 'invoices'):
            if not api.fetch_data(endpoint):
                return False

    if do_parse:
        util.delete_files(root + '/processing/default', '*.csv')
        parser = Parser(log)
        for endpoint in ('branding-themes', 'items', 'contacts', 'invoices'):
            parser.parse(endpoint)

    return True
Beispiel #8
0
	def add_cattributes(self,bot,update,args):
		"""Telegram handler for /addcattributes <name>.

		Inserts one cattribute for the calling user into the Attribute
		table unless it is already present, and replies with the outcome.
		"""
		try:
			with closing(self.conn.cursor()) as cur:
				uid = update.message.from_user.id
				try:
					cattribute_to_add = str.lower(args[0])
				except IndexError:
					# The command was sent without an argument.
					reply_message = "Can't add an empty string! Please use the following format: /addcattributes cattributename" 
					update.message.reply_text(reply_message,parse_mode='HTML')
				else:
					# Reject duplicates (case-insensitive match).
					cur.execute("""SELECT * FROM Attribute 
									WHERE telegram_id = %s 
									AND LOWER(attribute_name) = LOWER(%s)
								""",(uid,cattribute_to_add,))
					if cur.rowcount > 0:
						reply_message = "This cattribute is already added!"
						update.message.reply_text(reply_message,parse_mode='HTML')
					else:
						cur.execute("""INSERT INTO Attribute VALUES(%s,%s)""",(uid,cattribute_to_add))
						reply_message = "".join([str.lower(cattribute_to_add)," has been added to the table"])
						update.message.reply_text(reply_message,parse_mode='HTML')


		except Exception as e:
			# Best-effort error reporting: send the full traceback to the error channel.
			catcherror = traceback.format_exc()
			bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
Beispiel #9
0
    def __init__(self, record):
        """Tokenize *record* and set up author-feature resources.

        Args:
            record: raw text or a list of tokens, passed straight to Tokens.
        """
        super(AuthorFeatureBuilder, self).__init__()
        self.record = record
        self.tokens = Tokens(record).tokens
        self.num_tokens = len(self.tokens)
        self.features = None  # list of list of features for every name; e.g. [[1,1,1,1],[...], ...]

        # Raw string so the backslash is not a Python escape sequence.
        self.NUM_REGEX = re.compile(r'\d')
        self.DELIMITERS = [
            ',',
            '.',
            ';',
        ]
        # BUG FIX: load the name gazetteer with a context manager instead of
        # open(...).readlines(), which leaked the file handle.
        with open('data/name.lst', 'r') as fh:
            self.NAME_LIST = [line.strip() for line in fh]

        self.pipeline = [
            'f_is_capitalized',
            'f_is_all_upper',
            'f_is_english',
            'f_is_punctuation',
            'f_is_sequential_punctuation',
            'f_has_digit',
            'f_is_all_digit',
            'f_is_in_namelist',
            'f_is_fname_abbrev',
            'f_is_preceeded_by_delimiter',
            'f_is_followed_by_delimiter',
            'f_is_an_and_between_two_names',
        ]

        self.build()
Beispiel #10
0
    def decode_without_constraints(self, segment):
        """Decode *segment* with the unconstrained HMM and print a trace.

        Appends the raw segment, its observation sequence and the decoded
        label sequence to the running per-instance lists, then prints one
        line per token: feature vector, mnemonic label, token.
        (Python 2 print-statement syntax.)
        """
        print segment
        observation_sequence, decoded_sequence = self.HMMentire.decode_without_constraints(
            segment)

        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)

        # Numeric label -> mnemonic tag; presumably FN/LN = first/last name,
        # DL = delimiter, TI = title, VN = venue, YR = year -- confirm
        # against the HMM's label encoding.
        for vector, decoding, token in zip(observation_sequence,
                                           decoded_sequence,
                                           Tokens(segment).tokens):
            if decoding == 0:
                label = 'FN'
            elif decoding == 1:
                label = 'LN'
            elif decoding == 2:
                label = 'DL'
            elif decoding == 3:
                label = 'TI'
            elif decoding == 4:
                label = 'VN'
            elif decoding == 5:
                label = 'YR'
            else:
                # Unexpected label value -- flagged in the printed output.
                label = str(decoding) + ', PROBLEM'
            print vector, '\t', label, '\t', token
        print '\n\n'
Beispiel #11
0
def main(wordCorpus):
    """Generate one word-cloud image per document class of *wordCorpus*.

    Images are written to ./results/<className>.jpg.
    """
    min_df = 2
    tokenType = 'stopped'
    if wordCorpus == 'twenty-news':
        groupIndices = list(range(20))
    elif wordCorpus == 'acl-imdb':
        groupIndices = [0, 1]
    nClusters = len(groupIndices)
    for groupIndex in groupIndices:
        tokensLists, className = Tokens(wordCorpus).getTokens(
            tokenType, groupIndex)
        # Flatten the per-document token lists into one text blob.
        text = ' '.join(tok for tokensList in tokensLists for tok in tokensList)
        cloud = WordCloud(max_font_size=40,
                          width=600,
                          height=400,
                          background_color='white',
                          max_words=200,
                          relative_scaling=1.0).generate_from_text(text)
        plt.imshow(cloud, interpolation="bilinear")
        plt.axis("off")
        cloud.to_file('./results/' + className + '.jpg')
Beispiel #12
0
    def decode(self, segment):
        """Decode *segment* with the full HMM and split its tokens into fields.

        Returns a 4-tuple: author string, title string, venue string, and a
        de-duplicated list of year tokens.
        """
        observation_sequence, decoded_sequence = self.HMMentire.decode(segment)

        self.observations_raw.append(segment)
        self.observation_sequences.append(observation_sequence)
        self.labels.append(decoded_sequence)

        # Group tokens by decoded label; labels 0/1 pool into the author
        # field, label 2 is dropped, 3/4/5 map to title/venue/year.
        fields = {0: [], 3: [], 4: [], 5: []}
        raw_tokens = Tokens(segment).tokens
        for i, label_i in enumerate(decoded_sequence):
            token_i = raw_tokens[i]
            if label_i in (0, 1):
                fields[0].append(token_i)
            elif label_i in (3, 4, 5):
                fields[label_i].append(token_i)

        return (' '.join(fields[0]), ' '.join(fields[3]),
                ' '.join(fields[4]), list(set(fields[5])))
Beispiel #13
0
 def __init__(self, input_file, reuse):
     """Create empty scope/struct state, a token stream and an allocator."""
     # Bookkeeping starts empty.
     self.scope = {}
     self.structs = []
     self.function = None
     self.loop = None
     # Input and allocation strategy.
     self.tokens = Tokens(input_file)
     self.allocator = Allocator(reuse)
Beispiel #14
0
    def translate(self, readline, result=None, no_imports=None):
        """Translate the token stream produced by *readline*.

        Args:
            readline: a readline-style callable fed to generate_tokens.
            result: passed through to the Tracker.
            no_imports: when exactly True, suppress the import preamble.

        Returns:
            The tracker's accumulated (translated) token list.
        """
        # Tracker to keep track of information as the file is processed
        self.tokens = Tokens(self.default_kls)
        self.tracker = Tracker(result, self.tokens, self.wrapped_setup)

        # Add import stuff at the top of the file
        if self.import_tokens and no_imports is not True:
            self.tracker.add_tokens(self.import_tokens)

        # Looking at all the tokens
        with self.tracker.add_phase() as tracker:
            for tokenum, value, (_, scol), _, _ in generate_tokens(readline):
                self.tracker.next_token(tokenum, value, scol)

        # Add attributes to our Describes so that the plugin can handle some nesting issues
        # Where we have tests in upper level describes being run in lower level describes
        if self.with_describe_attrs:
            self.tracker.add_tokens(self.tracker.make_describe_attrs())

        # If setups should be wrapped, then do this at the bottom
        if self.wrapped_setup:
            self.tracker.add_tokens(self.tracker.wrapped_setups())

        # Add lines to bottom of file to add __testname__ attributes
        self.tracker.add_tokens(self.tracker.make_method_names())

        # Return translated list of tokens
        return self.tracker.result
Beispiel #15
0
    def find_venue_boundary_tokens(self):
        """Count tokens that begin a venue-label (4) span in each segment.

        Returns a dict mapping the lowercased first venue token to its
        occurrence count across all labelled segments.
        (Python 2 syntax: dict.has_key/iteritems and print statements.)
        """
        recorder = {}
        for raw_segment, observation_sequence, label_sequence in zip(
                self.raw_segments, self.observation_sequences,
                self.label_sequences):
            first_target_label_flag = True
            tokens = Tokens(raw_segment).tokens
            for token, feature_vector, label in zip(tokens,
                                                    observation_sequence,
                                                    label_sequence):
                # First meet a VN label
                if label == 4 and first_target_label_flag:
                    key = token.lower()
                    # Skip tokens with no cased characters (islower() is
                    # False for punctuation/digit-only strings).
                    if not key.islower():
                        continue
                    if recorder.has_key(key):
                        recorder[key] += 1
                    else:
                        recorder[key] = 1
                    first_target_label_flag = False

                # Re-arm once the span is left via an author/title label.
                elif (first_target_label_flag is False) and label in [0, 1, 3]:
                    first_target_label_flag = True

        for k, v in recorder.iteritems():
            print k, '\t', v
        return recorder
Beispiel #16
0
def Cryptokitties():
    """Start the live bot: schedule the broadcast jobs and begin polling."""
    print("Cryptokitties online")
    updater = Updater(token=Tokens.bot_token("live"))
    jobs = updater.job_queue
    # Both broadcasts repeat every 150 seconds, first run immediately.
    jobs.run_repeating(Commands.broadcast, 150, 0)
    jobs.run_repeating(Commands.kleongbroadcast, 150, 0)
    updater.start_polling()
    # BUG FIX: 'updater.idle' was referenced but never called, so the
    # function returned immediately instead of blocking for signals.
    updater.idle()
Beispiel #17
0
    def __init__(self):
        """Authenticate a praw.Reddit client from stored credentials."""
        credentials = Tokens()
        self.reddit = praw.Reddit(
            client_id=credentials['client_id'],
            client_secret=credentials['client_secret'],
            user_agent='my-user-agent',
            username=credentials['username'],
            password=credentials['password'],
        )
    def get_prev_token(self):
        """Step back two positions and re-lex the token pair found there.

        Raises InterpreterException when the pair does not form a valid token.
        """
        self.pos -= 2
        token = Tokens(self.code[self.pos] + " " + self.code[self.pos + 1])
        if token.type == TokensType.INVALID:
            raise InterpreterException(f"Invalid token at position : {self.pos}")
        return token
Beispiel #19
0
 def __init__(self, input_file, reuse, initialize_memory):
     """Initialise per-instance state.

     Symbol/scope bookkeeping starts empty; a token stream is opened over
     *input_file* and an allocator is created honouring *reuse*.
     *initialize_memory* is stored for later use.
     """
     # Symbol and scope bookkeeping.
     self.symbols_defined_in_current_scope = {}
     self.symbols_defined_stack = []
     self.scope = {}
     self.scope_stack = []

     # Current compilation context.
     self.function = None
     self.loop = None
     self.structs = []

     # Input handling and allocation strategy.
     self.tokens = Tokens(input_file)
     self.allocator = Allocator(reuse)
     self.initialize_memory = initialize_memory
def getX(wordCorpus, tokenType, groupIndices):
    """Concatenate token lists for each group.

    Returns (XAll, indexList) where XAll is an object array of per-document
    token arrays and indexList maps class name -> {'start', 'end'} row range.
    """
    XAll = []
    indexList = {}
    start = 0
    for groupIndex in groupIndices:
        X, className = Tokens(wordCorpus).getTokens(tokenType, groupIndex)
        indexList[className] = {'start': start, 'end': start + len(X)}
        XAll.extend(X)
        start += len(X)
    #   rows: Docs. columns: words
    return np.array([np.array(doc) for doc in XAll]), indexList
Beispiel #21
0
	def cancel(self,bot,update):
		"""Handler for /cancel: delete the user's rows and end the conversation."""
		try:
			with closing(self.conn.cursor()) as cur:
				uid = update.message.from_user.id
				update.message.reply_text("And here I was, thinking we could be friends :(",parse_mode='HTML')
				# Remove the user and all of their cattributes.
				cur.execute("""DELETE FROM User WHERE telegram_id = %s""",(uid,))
				cur.execute("""DELETE FROM Attribute WHERE telegram_id = %s""",(uid,))
				return ConversationHandler.END
		except Exception as e:
			# Forward the traceback to the configured error channel.
			bot.sendMessage(chat_id=Tokens().error_channel(),text=traceback.format_exc(),parse_mode='HTML')
Beispiel #22
0
def token_interface():
    """Issue a queue token on POST; otherwise render the request form."""
    if request.method == 'POST':
        form = request.form
        phone_number = form['phone_number']
        dept_id = form['departments']
        stream_id = form['streams']
        token_day_number = _get_current_token()
        attending = 1
        # Persist the newly issued token.
        token = Tokens(token_day_number=token_day_number,
                       phone_number=phone_number,
                       department=dept_id,
                       stream=stream_id)
        db.session.add(token)
        db.session.commit()
        dept_name = Departments.query.filter_by(id=dept_id).first().name
        stream_name = Streams.query.filter_by(id=stream_id).first().name
        return render_template('generated_token.html',
                               dept_name=dept_name,
                               token_number=token_day_number,
                               attending=attending,
                               token=token,
                               stream_name=stream_name)
    return render_template('token_interface.html',
                           departments=Departments.query.all(),
                           streams=Streams.query.all())
Beispiel #23
0
 def run(self):
     """Re-decode every raw segment with the new HMM and print a comparison.

     For each segment: decode with self.hmm_new, store the new labels, then
     print old vs. new label for every token while feeding
     (feature_vector, old_label, token) into self.feature_entity_list.
     (Python 2 print-statement syntax.)
     """
     i = 0
     self.new_labels = []
     for raw_segment, label_sequence in zip(self.raw_segments,
                                            self.label_sequences):
         # decode() returns a pair; index 1 holds the label sequence.
         new_labels = self.hmm_new.decode(raw_segment)[1]
         self.new_labels.append(new_labels)
         tokens = Tokens(raw_segment).tokens
         feature_vectors = FeatureGenerator(raw_segment).features
         print i, ':  ', raw_segment
         for token, old_label, new_label, feature_vector in zip(
                 tokens, label_sequence, new_labels, feature_vectors):
             print to_label(old_label), '\t', to_label(
                 new_label), '\t', token
             self.feature_entity_list.add_entity(
                 feature_vector, old_label, token)  #???? Old label first
         print '\n'
         i += 1
Beispiel #24
0
	def forget(self,bot,update):
		"""Telegram handler for /forget: delete the calling user's stored data.

		Removes the user's row from User and all rows from Attribute, then
		confirms -- or reports that nothing was stored.
		"""
		try:
			with closing(self.conn.cursor()) as cur:
				uid = update.message.from_user.id
				cur.execute("""SELECT * FROM User WHERE telegram_id = %s""",(uid,))
				if cur.rowcount == 0:
					message = "Can't delete what doesn't exist, man"
					update.message.reply_text(message,parse_mode='HTML')
				else:
					cur.execute("""DELETE FROM User WHERE telegram_id = %s""",(uid,))
					cur.execute("""DELETE FROM Attribute WHERE telegram_id = %s""",(uid,))
					message = "Oh, I'll tell you all about it when I see you again"
					update.message.reply_text(message,parse_mode='HTML')


		except Exception as e:
			# Forward the traceback to the configured error channel.
			catcherror = traceback.format_exc()
			bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
Beispiel #25
0
	def list_cattributes(self,bot,update):
		"""Handler for /listcattributes: reply with the user's saved cattributes."""
		try:
			with closing(self.conn.cursor()) as cur:
				uid = update.message.from_user.id
				cur.execute("""SELECT * FROM Attribute WHERE telegram_id = %s""",(uid,))
				if cur.rowcount == 0:
					update.message.reply_text("You have no cattributes listed!",parse_mode='HTML')
				else:
					# Column 1 of each row holds the attribute name.
					names = [row[1] for row in cur.fetchall()]
					reply = "".join(['Your current cattributes are: ',(", ".join(names))])
					update.message.reply_text(reply,parse_mode='HTML')
		except Exception as e:
			# Forward the traceback to the configured error channel.
			bot.sendMessage(chat_id=Tokens().error_channel(),text=traceback.format_exc(),parse_mode='HTML')
def getX(wordCorpus, tokenType, listOfClasses):
    """Concatenate token lists for the comma-separated classes in *listOfClasses*.

    Returns (XAll, indexList) where XAll is an object array of per-document
    token arrays and indexList maps class name -> {'start', 'end'} row range.
    """
    XAll = []
    indexList = {}
    start = 0
    for groupIndex in listOfClasses.split(','):
        X, className = Tokens(wordCorpus).getTokens(tokenType, groupIndex)
        end = start + len(X)
        indexList[className] = {'start': start, 'end': end}
        logger.info('True Group Index {}, classname: {}'.format(
            groupIndex, className))
        logger.info('Count {}, start - End Indices  {} , {}'.format(
            len(X), start, end))
        XAll.extend(X)
        start = end
    #   rows: Docs. columns: words
    result = np.array([np.array(doc) for doc in XAll])
    logger.info('indexList{}'.format(indexList))
    return result, indexList
Beispiel #27
0
def Cryptokitties():
	"""Start the bot: register all handlers and jobs, then poll for updates."""
	print("Cryptokitties online")
	updater = Updater(token=Tokens().bot_token())
	dispatcher = updater.dispatcher
	# Registration conversation: generation -> cooldown -> offsets -> attributes.
	conv_handler = ConversationHandler(
		entry_points=[CommandHandler('register', Commands().register)],

		states={
			GENERATION:[MessageHandler(Filters.text,Commands().generation)],
			COOLDOWN:[MessageHandler(Filters.text,Commands().cooldown)],
			OFFSTART:[MessageHandler(Filters.text,Commands().offstart)],
			OFFEND:[MessageHandler(Filters.text,Commands().offend)],
			ATTLIST: [MessageHandler(Filters.text,Commands().attribute_list)]
		},

		fallbacks=[CommandHandler('cancel', Commands().cancel)],
		# NOTE(review): '******' looks like a scrubbed value; per_user is
		# normally a boolean -- confirm the intended setting. (Any truthy
		# string behaves like True here.)
		per_user = '******'
	)
	dispatcher.add_handler(conv_handler,1)

	dispatcher.add_handler(CommandHandler('forget', Commands().forget))
	dispatcher.add_handler(CommandHandler('alert',Commands().alert))
	dispatcher.add_handler(CommandHandler('listcattributes',Commands().list_cattributes))
	dispatcher.add_handler(CommandHandler('rmcattributes',Commands().remove_cattributes, pass_args=True))
	dispatcher.add_handler(CommandHandler('addcattributes',Commands().add_cattributes, pass_args=True))
	########################################################
	#				Alert jobs
	########################################################
	updater.job_queue.run_repeating(Commands().user_broadcast,600,0)
	updater.start_polling()
	# BUG FIX: 'updater.idle' was referenced but never called, so the
	# function returned immediately instead of blocking for signals.
	updater.idle()
Beispiel #28
0
def refresh_tokens(tokens: Tokens) -> Tokens:
    """Exchange the refresh token for a fresh token pair.

    Returns new Tokens on HTTP 200; otherwise logs the error and returns
    the original *tokens* unchanged.
    """
    logging.debug("Try to refresh tokens")
    payload = {
        "grant_type": "refresh_token",
        "refresh_token": tokens.refresh_token,
    }

    response = requests.post(url=URI_HH_OAUTH_TOKEN,
                             headers=make_auth_http_headers(),
                             data=payload)
    logging.info(response.text)
    if response.status_code != 200:
        logging.error("status_code: %s; response: %s",
                      response.status_code, response.text)
        return tokens

    body = response.json()
    refreshed = Tokens(access_token=body["access_token"],
                       refresh_token=body["refresh_token"])
    logging.debug("Tokens: %s", refreshed)
    return refreshed
Beispiel #29
0
	def attribute_list(self,bot,update):
		"""Conversation step: collect cattributes one message at a time.

		Replying "end" finishes registration; any other text is stored as a
		new attribute and the conversation stays in the ATTLIST state.
		"""
		try:
			with closing(self.conn.cursor()) as cur:
				uid = update.message.from_user.id
				if str.lower(update.message.text) == "end":
					message = "Thanks for registering =)\n"
					# BUG FIX: this line previously used '=' instead of '+=',
					# discarding the thank-you text above.
					message += "If you want to toggle 10 minute scans, please do a /alert"
					update.message.reply_text(message,parse_mode='HTML')
					return ConversationHandler.END
				else:
					cur.execute("""INSERT INTO Attribute VALUES(%s,%s)""",(uid,update.message.text,))
					message = update.message.text
					message += " has been added as an attribute. Please enter the next attribute \n"
					message += "If you're done with adding your cattributes, please reply with end"
					update.message.reply_text(message,parse_mode='HTML')
					return ATTLIST

		except Exception as e:
			# Forward the traceback to the configured error channel.
			catcherror = traceback.format_exc()
			bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
Beispiel #30
0
	def offend(self,bot,update):
		"""Conversation step OFFEND: store the scan offset end for the user.

		Re-prompts (staying in OFFEND) on non-integer input; on success
		advances the conversation to the ATTLIST state.
		"""
		try:
			with closing(self.conn.cursor()) as cur:
				uid = update.message.from_user.id
				try:
					# Validate only; the raw text is what gets stored below.
					int(update.message.text)
				except ValueError:
					message = "Please send me an integer :)"
					update.message.reply_text(message,parse_mode='HTML')
					return OFFEND
				else:
					cur.execute("""UPDATE User SET offset_end = %s WHERE telegram_id =  %s""",(update.message.text,uid,))
					message = "Thank you, now, please key in a cattribute(one cattribute at a time only!)"
					message += "This bot will match the cattributes you are looking for"
					update.message.reply_text(message,parse_mode='HTML')
					return ATTLIST

		except Exception as e:
			# Forward the traceback to the configured error channel.
			catcherror = traceback.format_exc()
			bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
Beispiel #31
0
	def cooldown(self,bot,update):
		"""Conversation step COOLDOWN: store the cooldown index for the user.

		Re-prompts (staying in COOLDOWN) on non-integer input; on success
		advances the conversation to the OFFSTART state.
		"""
		try:
			with closing(self.conn.cursor()) as cur:
				uid = update.message.from_user.id
				try:
					# Validate only; the raw text is what gets stored below.
					int(update.message.text)
				except ValueError:
					message = "Please send me an integer :)"
					update.message.reply_text(message,parse_mode='HTML')
					return COOLDOWN
				else:
					cur.execute("""UPDATE User SET cooldown_index = %s WHERE telegram_id =  %s""",(update.message.text,uid,))
					message = "Fantastic. Now, may I please have the offset starting point?"
					message += "This bot will scan the api starting at the offset given. We recomend starting at 0"
					update.message.reply_text(message,parse_mode='HTML')
					return OFFSTART


		except Exception as e:
			# Forward the traceback to the configured error channel.
			catcherror = traceback.format_exc()
			bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
Beispiel #32
0
	def generation(self,bot,update):
		"""Conversation step GENERATION: store the generation index for the user.

		Re-prompts (staying in GENERATION) on non-integer input; on success
		advances the conversation to the COOLDOWN state.
		"""
		try:
			with closing(self.conn.cursor()) as cur:
				uid = update.message.from_user.id
				try:
					# Validate only; the raw text is what gets stored below.
					int(update.message.text)
				except ValueError:
					message = "Please send me an integer :)"
					update.message.reply_text(message,parse_mode='HTML')
					return GENERATION
				else:
					cur.execute("""UPDATE User SET generation_index = %s WHERE telegram_id =  %s""",(update.message.text,uid,))
					message = "Fantastic. Now, may I please have a cooldown index?"
					message += "This bot will scan for the cooldown index less than the number that you input"
					update.message.reply_text(message,parse_mode='HTML')
					return COOLDOWN


		except Exception as e:
			# Forward the traceback to the configured error channel.
			catcherror = traceback.format_exc()
			bot.sendMessage(chat_id=Tokens().error_channel(),text=catcherror,parse_mode='HTML')
Beispiel #33
0
    def __init__(self, record):
        """Tokenize *record* and set up venue-feature resources.

        Args:
            record: raw text or a list of tokens, passed straight to Tokens.
        """
        super(VenueFeatureBuilder, self).__init__()
        self.record = record
        self.tokens = Tokens(record).tokens
        self.num_tokens = len(self.tokens)
        self.features = None

        # Raw strings so the regex backslashes are not Python escapes.
        self.NUM_REGEX = re.compile(r'\d')
        self.CHAR_DIGIT_MIX_REGEX = re.compile(
            r'((^[a-zA-Z]+\d{4}$)|(^[a-zA-Z]+\d{2}$))|((^\d{4}[a-zA-Z]+$)|(^\d{2}[a-zA-Z]+$))',
            re.MULTILINE)

        self.DELIMITERS = [
            ',',
            '.',
            ';',
        ]
        # BUG FIX: load the gazetteers with context managers instead of
        # open(...).readlines(), which leaked the file handles.
        with open('data/venue.lst', 'r') as fh:
            self.VENUE_LIST = [line.strip() for line in fh]
        with open('data/ordinal.lst', 'r') as fh:
            self.ORDINAL_LIST = [line.strip() for line in fh]

        self.pipeline = [
            'f_is_capitalized',
            'f_is_all_upper',
            'f_is_english',
            'f_has_both_char_and_digit',
            'f_is_ordinal',
            'f_is_punctuation',
            'f_has_digit',
            'f_is_all_digit',
            'f_is_in_venuelist',
            'f_is_preceeded_by_delimiter',
            'f_is_followed_by_delimiter',
            'f_is_followed_by_year',
        ]

        self.build()
Beispiel #34
0
class FeatureGenerator(object):
    """
        Turns a citation record into one binary (0/1) feature vector per token.

        @param:
            record -> piece of raw_text, or a list of tokens
    """

    # Title-notion patterns, compiled once at class-creation time
    # (previously recompiled on every f_is_possible_new_notion call).
    CAMEL_TWO_REGEX = re.compile(r'^[A-Z][a-z0-9]+[A-Z][a-z0-9]+$', re.MULTILINE)
    CAPWORD_REGEX = re.compile(r'^[A-Z][a-z0-9]+$', re.MULTILINE)
    CAMEL_THREE_REGEX = re.compile(r'^[A-Z][a-z0-9]+[A-Z][a-z0-9]+[A-Z][a-z0-9]+$', re.MULTILINE)
    LOWERWORD_REGEX = re.compile(r'^[a-z0-9]+$', re.MULTILINE)
    HYPHEN_TERM_REGEX = re.compile(r'[A-Z]*[A-Za-z]+-[A-Za-z]+')    # specific terminology, e.g. content-aware; Group-By

    def __init__(self, feature_for_separate_model=False):
        """
            @param:
                feature_for_separate_model -> True selects the reduced
                PARTIAL_PIPELINE; False (the default) the STANDARD_PIPELINE.
        """
        super(FeatureGenerator, self).__init__()
        self.dictionary = enchant.Dict('en_US')
        self.token_generator = Tokens()     # Connection established!
        self.record = None
        self.tokens = []
        # list of list of features for every name; e.g. [[1,1,1,1],[...], ...]
        self.features = None

        # Regex setup.  Raw strings so '\d' etc. are regex classes rather
        # than (invalid) string escapes.
        self.NUM_REGEX = re.compile(r'\d')
        self.CHAR_DIGIT_MIX_REGEX = re.compile(
            r'((^[a-zA-Z]+\d{4}$)|(^[a-zA-Z]+\d{2}$))|((^\d{4}[a-zA-Z]+$)|(^\d{2}[a-zA-Z]+$))',
            re.MULTILINE)
        # C.P.; C.-C.; O'Reilly
        self.NAME_ABBREV_REGEX = re.compile(r"([A-Z]\.-[A-Z]\.)|([A-Z]\.-[A-Z])|([A-Z]\.-)|(([A-Z]\.)+)|(O'[A-Z][a-z]+)")
        self.PAGE_NO_REGEX = re.compile(r'\d+-\d+')

        # Gazette setup
        self.DELIMITERS = [',', '.', ]
        self.LBRACKET = ['(', '[', '{', '<', ]
        self.RBRACKET = [')', ']', '}', '>', ]
        self.APOSTROPHES = ["'s", "'re", "'d", ]
        self.QUOTATIONS = ['"', "''", "``", ]
        # 'January' was misspelled as 'Janurary', so f_is_month could never
        # match the first month of the year.
        self.MONTHS = ['January', 'February', 'March', 'April', 'May', 'June',
                       'July', 'August', 'September', 'October', 'November', 'December']
        self.NAME_LIST = self._load_gazette('data/name.lst')
        self.VENUE_LIST = self._load_gazette('data/venue.lst')
        self.ORDINAL_LIST = self._load_gazette('data/ordinal.lst')
        self.COUNTRY_LIST = self._load_gazette('data/countries.lst')

        if feature_for_separate_model:
            self.pipeline = PARTIAL_PIPELINE
        else:
            self.pipeline = STANDARD_PIPELINE

    @staticmethod
    def _load_gazette(path):
        # One entry per line; 'with' closes the handle instead of leaking it.
        with open(path, 'r') as list_file:
            return [item.strip() for item in list_file.readlines()]

    def close_connection(self):
        """Release the tokenizer's underlying connection."""
        self.token_generator.close_connection()

    def build(self, record):
        """
            Compute the feature matrix for *record*.

            @param:
                record -> raw text (tokenized via the token generator) or an
                          already-tokenized list of tokens
            Returns a list with one sub-list of 0/1 features per token; the
            result is also stored in self.features.
        """
        self.record = record

        # Already-tokenized input is used as-is; raw text goes through the
        # tokenizer first.
        if isinstance(self.record, list):
            self.tokens = self.record
        else:
            response_obj = self.token_generator.tokenize(self.record)
            self.tokens = response_obj['tokens']

        self.num_tokens = len(self.tokens)  # count how many tokens are there in this piece of text.

        features = []
        for i in range(self.num_tokens):
            # Apply every feature function, looked up by name, to token i.
            features.append([getattr(self, pipe)(i) for pipe in self.pipeline])
        self.features = features

        return features

    def token_length(self, record):
        """Number of tokens the tokenizer produces for *record*."""
        return self.token_generator.token_length(record)

    def print_features(self):
        """Dump feature-vector/token pairs for debugging (call after build())."""
        for i in range(self.num_tokens):
            # Parenthesized single-argument form prints the same bytes as the
            # old Python 2 statement but also runs under Python 3.
            print('%s \t\t %s' % (self.features[i], self.tokens[i]))

    ################################### Feature functions ###################################
    # Feature output format:
    # [
    #   [([1,0,0,1], 1), ([1,1,1,1], 0), (...)...], <-- One piece of training sample (x, y) where x=x1x2x3...xm, y=y1y2y3...ym <-- a sentence representation in feature vectors, in sequence
    #   [.......................],  <-- another sentence, parallel with the previous sentence, independent processed
    #   ...
    # ]
    # Assume segment is space-delimited, so it's a feature for the segment; challenge will be tokenizing
    ################################### Local Features #####################################

    def f_is_name_abbrev(self, idx):
        """1 if the token looks like an abbreviated name: C.P., C.-C., O'Reilly."""
        return int(self.NAME_ABBREV_REGEX.match(self.tokens[idx]) is not None)

    def f_is_apostrophes(self, idx):
        """1 if the token is a contraction suffix ('s, 're, 'd)."""
        return int(self.tokens[idx] in self.APOSTROPHES)

    def f_is_capitalized(self, idx):
        """1 if the token starts with an uppercase letter."""
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token[0].isupper())

    def f_is_all_upper(self, idx):
        """1 if the token is all uppercase; tokens of <= 2 chars (initials) don't count."""
        token = self.tokens[idx]
        if len(token) <= 2:
            return 0
        return int(token.isupper())

    def f_is_english(self, idx):
        """1 if the (lowercased, multi-char) token is an English dictionary word."""
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(self.dictionary.check(token.lower()) and len(token) > 1)

    def f_has_both_char_and_digit(self, idx):
        """1 for letters+2/4 digits mixes such as CIKM2015 or 15WWW."""
        return int(self.CHAR_DIGIT_MIX_REGEX.search(self.tokens[idx]) is not None)

    def f_is_delimiter(self, idx):
        """1 if the token is a single-character field delimiter (',' or '.')."""
        token = self.tokens[idx]
        return int(len(token) == 1 and token in self.DELIMITERS)

    def f_is_quotation(self, idx):
        """1 if the token is a quotation mark token."""
        return int(self.tokens[idx] in self.QUOTATIONS)

    def f_is_punctuation(self, idx):
        """1 if the token is a single punctuation character."""
        token = self.tokens[idx]
        return int(len(token) == 1 and token in punctuation)

    def f_has_digit(self, idx):
        """1 if the token contains at least one digit."""
        return int(self.NUM_REGEX.search(self.tokens[idx]) is not None)

    def f_is_all_digit(self, idx):
        """1 if the token consists only of digits."""
        return int(self.tokens[idx].isdigit())

    def f_is_possible_page_number(self, idx):
        """1 for digit-dash-digit tokens such as 123-145 (page ranges)."""
        return int(self.PAGE_NO_REGEX.match(self.tokens[idx]) is not None)

    def f_is_month(self, idx):
        """1 if the token is a full English month name."""
        return int(self.tokens[idx] in self.MONTHS)

    def f_is_possible_year(self, idx):
        """1 for a 4-digit number between 1980 and the current year."""
        token = self.tokens[idx]
        return int(token.isdigit() and len(token) == 4
                   and 1980 <= int(token) <= datetime.now().year)

    ################################### Dictionary Features ################################
    def f_is_in_namelist(self, idx):
        """1 if the (ASCII-folded, lowercased) token is in the name gazette."""
        token = self.tokens[idx].encode('ascii', 'ignore')
        if len(token) == 0:
            return 0
        return int(token.lower().strip() in self.NAME_LIST)

    def f_is_ordinal(self, idx):
        """1 if the token is an ordinal word/abbreviation (1st, second, ...)."""
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token.lower().strip() in self.ORDINAL_LIST)

    # Also handles some of the common venue tokens that are also common in English.
    # TODO: more delicate
    def f_is_in_venuelist(self, idx):
        """1 if the token is a venue/ordinal/country gazette hit, or an
        'In/Appears ...' marker right after a delimiter."""
        token = self.tokens[idx].encode('ascii', 'ignore')
        if len(token) == 0:
            return 0
        prev_token = self.tokens[idx - 1] if idx - 1 >= 0 else ''

        # Special case handling: "In", "Appears" etc. start a venue segment
        # only when they follow a delimiter.
        if token.strip() in ['In', 'Appear', 'Appears', 'Appeared', ] and len(prev_token) > 0 and prev_token in ['.', ',', ';', '(', ]:
            return 1

        return int(token.lower().strip() in (self.VENUE_LIST + self.ORDINAL_LIST + self.COUNTRY_LIST))

    ################################### Global Features ####################################

    def f_has_lbracket_before(self, idx):
        """1 if the previous token is an opening bracket."""
        if len(self.tokens[idx]) == 0 or idx - 1 < 0:
            return 0
        return int(self.tokens[idx - 1] in self.LBRACKET)

    def f_has_rbracket_after(self, idx):
        """1 if the next token is a closing bracket."""
        if idx + 1 >= self.num_tokens:
            return 0
        return int(self.tokens[idx + 1] in self.RBRACKET)

    def f_has_quotation_before(self, idx):
        """1 if the previous token is a quotation mark."""
        if len(self.tokens[idx]) == 0 or idx - 1 < 0:
            return 0
        return int(self.tokens[idx - 1] in self.QUOTATIONS)

    def f_has_quotation_after(self, idx):
        """1 if the next token is a quotation mark."""
        if idx + 1 >= self.num_tokens:
            return 0
        return int(self.tokens[idx + 1] in self.QUOTATIONS)

    def f_is_possible_volume(self, idx):
        """1 for an all-digit token enclosed in brackets, e.g. ( 4 )."""
        if idx - 1 < 0 or idx + 1 >= self.num_tokens:
            return 0
        prev_token = self.tokens[idx - 1]
        next_token = self.tokens[idx + 1]
        return int(prev_token in self.LBRACKET and next_token in self.RBRACKET and self.tokens[idx].isdigit())

    def f_is_at_second_half_of_string(self, idx):
        """1 if the token sits in the second half of the record (venue/date bias)."""
        return int(idx > self.num_tokens / 2)

    def f_has_delimiter_before(self, idx):
        """1 if the previous token is a single-char delimiter."""
        if len(self.tokens[idx]) == 0 or idx - 1 < 0:
            return 0
        prev_token = self.tokens[idx - 1]
        return int(len(prev_token) == 1 and prev_token in self.DELIMITERS)

    def f_has_delimiter_after(self, idx):
        """1 if the next token is a single-char delimiter."""
        if idx + 1 >= self.num_tokens:
            return 0
        next_token = self.tokens[idx + 1]
        return int(len(next_token) == 1 and next_token in self.DELIMITERS)

    def f_is_an_and_between_two_names(self, idx):
        """1 for an 'and' preceded by a capitalized non-English token
        (likely sitting inside an author list)."""
        token = self.tokens[idx]
        if idx + 1 >= self.num_tokens or idx - 1 < 0:
            return 0
        return int(token.strip().lower() == 'and' and self.f_is_capitalized(idx - 1) and (self.f_is_english(idx - 1) == 0))

    def f_is_followed_by_year(self, idx):
        """1 if a non-numeric token is directly followed by a 2/4-digit number."""
        token = self.tokens[idx]
        if idx + 1 >= self.num_tokens:
            return 0
        next_token = self.tokens[idx + 1]
        return int((len(next_token) == 2 or len(next_token) == 4) and next_token.isdigit() and not token.isdigit())

    # Addressing the possible new notions in the title of publications
    def f_is_possible_new_notion(self, idx):
        """1 if the token looks like a coined system/notion name in a title.

        Recognized shapes (mostly followed by ':'): XXXX, XxxxXxxx, Xxxx,
        XxxxXxxxXxxx, 'Xxxx Xxxx :', 'Xxxx xxxx :', and hyphenated terms.
        """
        token = self.tokens[idx]
        if idx + 2 >= self.num_tokens:
            return 0
        next_token = self.tokens[idx + 1]
        next_next_token = self.tokens[idx + 2]

        followed_by_colon = next_token == ':'
        pattern_1 = token.isupper() and followed_by_colon
        pattern_2 = (self.CAMEL_TWO_REGEX.match(token) is not None) and followed_by_colon
        pattern_3 = (self.CAPWORD_REGEX.match(token) is not None) and followed_by_colon
        pattern_4 = (self.CAMEL_THREE_REGEX.match(token) is not None) and followed_by_colon
        pattern_5 = (self.CAPWORD_REGEX.match(token) is not None) and (self.CAPWORD_REGEX.match(next_token) is not None) and next_next_token == ':'
        pattern_6 = (self.CAPWORD_REGEX.match(token) is not None) and (self.LOWERWORD_REGEX.match(next_token) is not None) and next_next_token == ':'
        pattern_7 = self.HYPHEN_TERM_REGEX.match(token) is not None

        return int(pattern_1 or pattern_2 or pattern_3 or pattern_4 or pattern_5 or pattern_6 or pattern_7)

    def f_is_possible_boundary(self, idx):
        """1 if the token looks like a sentence/field boundary: a '.' (or a
        token ending in '.') between a lowercase word and a capitalized one.
        NOTE(review): assumes the token is non-empty when it has neighbors —
        an empty token here would raise IndexError, as in the original."""
        token = self.tokens[idx]
        if idx + 1 >= self.num_tokens or idx - 1 < 0:
            return 0
        next_token = self.tokens[idx + 1]
        prev_token = self.tokens[idx - 1]

        return int((token == '.' and prev_token.islower() and next_token[0].isupper()) or
                   (token[-1] == '.' and token[0].islower() and next_token[0].isupper())
                   )
def get_training_samples(url):
    log_err('\tGetting Training sample')
    raw_results = router(url)
    log_err('\tData retrieved. Preprocessing...')
    observation_list = []
    label_list = []
    records = []

    feature_generator = FeatureGenerator()
    token_generator = Tokens()

    for raw_result in raw_results:
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []

        authors = raw_result['authors']
        title = raw_result['title']        
        title_copy = raw_result['title']

        try:
            venue = raw_result['conference name']
            venue_copy = raw_result['conference name']
        except:
            venue = ''
            venue_copy = ''
        try:
            venue = raw_result['journal name']
            venue_copy = raw_result['journal name']
        except:
            venue = ''
            venue_copy = ''

        if len(venue) > 0:
            try:
                volume = raw_result['volume']
            except:
                volume = ''
            try:
                issue = raw_result['issue']
            except:
                issue = ''
            try:
                page = raw_result['page']
            except:
                page = ''

            venue += ' ' + volume + ' ' + issue + ' ' + page
            venue_copy += ' ' + volume + ' ' + issue + ' ' + page


        date = raw_result['publication date'][:4]

        # FN: 0
        # LN: 1
        # DL: 2
        # TI: 3
        # VN: 4
        # DT: 5

        # Author -> Title -> ...
        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author += ' , '
            tmp_record += author
            tmp_label_list += [0] * (feature_generator.token_length(author)-2)
            tmp_label_list += [1,2]
                
        # title
        title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)

        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        # title
        # title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author += ' , '
            tmp_record += author
            tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
            tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))



        #=================================Variations of authors=================================
        # Changing order, inserting dot, and probably insert comma as delimiter inside of names
        # This part of variations is very sensitive to what sample source to choose from,
        # for example, Google scholar is the current source of samples, and on gscholar, 
        # most names are in format of JW Han.  <-- Prior knowledge
        # Read more Learn more Change the Globe !!!
        log_err('\tGenerating multiple cases for name variations... ')
        # ================================A. B
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        # authors
        for author in authors:
            if len(author) == 0:
                continue

            #???? BUG!!!! split() doesn't mean tokenization
            author_tokens = token_generator.tokenize(author)['tokens']  # Split the author in order tokens
            if len(author_tokens) == 1:     # Cannot change order or anything, so leave this name alone, and pass to the next name
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Insert dot
                author = author_tokens[0] + '.' + author_tokens[1] + ' , '  # A. B
                tmp_token_length = token_generator.token_length(author)
                tmp_record += author
                tmp_label_list += [0]*(tmp_token_length-2) + [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        # title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']  # Split the author in order to
            if len(author_tokens) == 1:     # Cannot change order or anything, so leave this name alone, and pass to the next name
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Insert dot
                author = author_tokens[0] + '.' + author_tokens[1] + ' , '  # A. B
                tmp_token_length = token_generator.token_length(author)
                tmp_record += author
                tmp_label_list += [0]*(tmp_token_length-2) + [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # ================================B, 
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Only keep lastname
                author = author_tokens[1] + ' , '  # B
                tmp_record += author
                tmp_label_list += [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        # title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Only keep lastname
                author = author_tokens[1] + ' , '  # B
                tmp_record += author
                tmp_label_list += [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))



        # ================================B A., 
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '.,'  # B A.,
                tmp_record += author
                tmp_label_list += [1,0,0,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        # title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '.,'  # B A.,
                tmp_record += author
                tmp_label_list += [1,0,0,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # ================================B A.
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '. '  # B A.
                tmp_record += author
                tmp_label_list += [1,0,0]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        # title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '. '  # B A.
                tmp_record += author
                tmp_label_list += [1,0,0]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))






        #============================================================================================================================================
        #============================================================================================================================================
        #============================================================================================================================================
        #============================================================================================================================================
        #============================================================================================================================================
        #============================================================================================================================================
        #============================================================================================================================================
        #============================================================================================================================================
        #============================================================================================================================================
        # Period Case!!!
        log_err('\tGenerating multiple cases for period as DL... ')
        # Author -> Title -> ...
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author += ' , '
            tmp_record += author
            tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
            tmp_label_list += [1,2]
                
        # title
        title = title_copy + ' . '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            venue = venue_copy + ' . '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))



        # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author += ' , '
            tmp_record += author
            tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
            tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # ================================A. B
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        # authors
        for author in authors:
            if len(author) == 0:
                continue

            author_tokens = token_generator.tokenize(author)['tokens']  # Split the author in order tokens
            if len(author_tokens) == 1:     # Cannot change order or anything, so leave this name alone, and pass to the next name
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Insert dot
                author = author_tokens[0] + '.' + author_tokens[1] + ' , '  # A. B
                tmp_token_length = token_generator.token_length(author)
                tmp_record += author
                tmp_label_list += [0]*(tmp_token_length-2) + [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']  # Split the author in order to
            if len(author_tokens) == 1:     # Cannot change order or anything, so leave this name alone, and pass to the next name
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Insert dot
                author = author_tokens[0] + '.' + author_tokens[1] + ' , '  # A. B
                tmp_token_length = token_generator.token_length(author)
                tmp_record += author
                tmp_label_list += [0]*(tmp_token_length-2) + [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # ================================B, 
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Only keep lastname
                author = author_tokens[1] + ' , '  # B
                tmp_record += author
                tmp_label_list += [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Only keep lastname
                author = author_tokens[1] + ' , '  # B
                tmp_record += author
                tmp_label_list += [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))



        # ================================B A., 
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '.,'  # B A.,
                tmp_record += author
                tmp_label_list += [1,0,0,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '.,'  # B A.,
                tmp_record += author
                tmp_label_list += [1,0,0,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

       
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # ================================B A.
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '. '  # B A.
                tmp_record += author
                tmp_label_list += [1,0,0]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '. '  # B A.
                tmp_record += author
                tmp_label_list += [1,0,0]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))





    # =============================================================================Verbose: Print the training set
    for record, observation, label in zip(records, observation_list, label_list):
        for rr, oo, ll in zip(token_generator.tokenize(record)['tokens'], observation, label):
            if ll == 0:
                ll = 'FN'
            elif ll == 1:
                ll = 'LN'
            elif ll == 2:
                ll = 'DL'
            elif ll == 3:
                ll = 'TI'
            elif ll == 4:
                ll = 'VN'
            elif ll == 5:
                ll = 'DT'
            print oo, '\t', ll.encode('utf-8'), '\t', rr.encode('utf-8')
        print '\n\n'

    return observation_list, label_list
Beispiel #36
0
    def to_str(self, rels):
        """Render extracted relations as one human-readable string.

        Each relation dict carries "v" (verb) and "p" (phrase) entries,
        and optionally "s" (subject); each entry is a dict with "word"
        and "tag" keys.  Output per relation looks like
        "dog (NN) - chases (VB) - cat (NN) | " (subject part only when
        present).  Returns "" for an empty relation list.
        """
        parts = []
        for r in rels:
            # Subject is optional; emit it first when present.
            if r.get("s", False):
                parts.append("{0} ({1}) - ".format(r["s"]["word"],
                                                   r["s"]["tag"]))
            parts.append("{0} ({1}) - {2} ({3}) | ".format(r["v"]["word"],
                                                           r["v"]["tag"],
                                                           r["p"]["word"],
                                                           r["p"]["tag"]))
        # join instead of repeated += avoids quadratic string building
        return "".join(parts)



if __name__ == "__main__":
    from tokens import Tokens
    from pos_tags import PosTags
    import sys

    # Input text file; processed one line at a time.
    source_path = sys.argv[1]

    tokenizer = Tokens()
    tagger = PosTags()
    relations = Relations()

    # Pipeline per line: tokenize -> POS-tag -> extract relations -> print.
    with open(source_path) as handle:
        for line in handle:
            tagged = tagger.tag(tokenizer.tokenize(line))
            print(relations.to_str(relations.find(tagged)))
Beispiel #37
0
import os
import requests
import json
import datetime
from tokens import Tokens

# Environment variables must be set with your tokens
# NOTE(review): Tokens() presumably reads them from the environment - confirm.
tokens = Tokens()
# Slack user API token, fetched once at import time and shared module-wide.
USER_TOKEN_STRING =  tokens.get_user_token()

class User:

    def __init__(self, user_id):
        """Hold one Slack user's identity and workout bookkeeping."""
        # Slack identifier for this user
        self.id = user_id
        # Handle (@username) and real name, resolved via the Slack API
        # NOTE(review): fetchNames is defined elsewhere in this module.
        self.username, self.real_name = self.fetchNames()
        # Every exercise the user has done, in order
        self.exercise_history = []
        # Accumulated quantity per exercise
        self.exercises = {}
        # How many times each exercise was performed
        self.exercise_counts = {}
        # Record of past runs/workouts
        self.past_workouts = {}
Beispiel #38
0
class Parser:

    """Turn the C input file into a tree of expressions and statements."""

    def __init__(self, input_file, reuse, initialize_memory):
        """Create a parser over *input_file*.

        *reuse* is handed to the Allocator; *initialize_memory* is stored
        and later passed to array declarations.
        """
        # Scope bookkeeping: names visible now, plus stacks for nesting.
        self.symbols_defined_in_current_scope = {}
        self.symbols_defined_stack = []
        self.scope = {}
        self.scope_stack = []
        # Parser state while walking the token stream.
        self.function = None
        self.loop = None
        self.structs = []
        self.initialize_memory = initialize_memory
        # Token source and storage allocator for the whole parse.
        self.tokens = Tokens(input_file)
        self.allocator = Allocator(reuse)
        
    def add_to_scope(self, name, obj):
        #if the name has already been defined in the current scope, error out.
        if name in self.symbols_defined_in_current_scope:
            self.tokens.error("%s is already defined"%name)
        self.symbols_defined_in_current_scope[name]=obj
        self.scope[name]=obj
        
    def enter_scope(self):
        self.symbols_defined_stack.append(self.symbols_defined_in_current_scope) #stack holds everything that has been defined in a underlying scope
        self.scope_stack.append(copy(self.scope))
        self.symbols_defined_in_current_scope = {}
    
    def leave_scope(self):
        self.symbols_defined_in_current_scope = self.symbols_defined_stack.pop()
        self.scope = self.scope_stack.pop()

    def parse_process(self):
        """Parse the entire token stream into a single Process.

        Struct definitions/typedefs register types as a side effect;
        everything else is parsed as functions (or globals, handled
        inside parse_function).  `main` is the last function parsed.
        """
        process = Process()
        process.allocator = self.allocator
        process.inputs = []
        process.outputs = []
        process.functions = []
        while not self.tokens.end():
            next_token = self.tokens.peek()
            if next_token == "struct":
                self.parse_define_struct()
            elif next_token == "typedef":
                self.parse_typedef_struct()
            else:
                parsed = self.parse_function()
                # parse_function returns None for re-declarations
                if parsed is not None:
                    process.functions.append(parsed)
        process.main = self.main
        return process

    def parse_type_specifier(self):
        """Consume a run of type/storage specifier tokens.

        Returns (type_, size, signed, const).
        NOTE(review): size defaults to 2 and becomes 4 for long/float,
        which suggests 2-byte units - confirm against the allocator.
        Later checks deliberately override earlier ones: struct names
        override int, float overrides struct, void overrides everything.
        """
        type_specifiers = []

        # Collect every leading token that names a builtin type, a known
        # struct, or a storage specifier.
        while self.tokens.peek() in types + self.structs + storage_specifiers:
            type_specifiers.append(self.tokens.get())

        signed = True
        if "unsigned" in type_specifiers:
            signed = False
            if "signed" in type_specifiers:
                self.tokens.error("Cannot be signed and unsigned")

        size = 2
        if "long" in type_specifiers:
            if "short" in type_specifiers:
                self.tokens.error("Cannot be long and short")
            size = 4

        # Default to int; a struct name in the specifiers overrides it.
        type_ = "int"
        for i in type_specifiers:
            if i in self.structs:
                type_ = i
                size = 2
                signed = False

        # float overrides any struct/int chosen above.
        if "float" in type_specifiers:
            if "short" in type_specifiers:
                self.tokens.error("Float cannot be short")
            if "long" in type_specifiers:
                self.tokens.error("Float cannot be long (but double can)")
            if "unsigned" in type_specifiers:
                self.tokens.error("Float cannot be unsigned")
            type_ = "float"
            size = 4
            signed = True

        const = False
        if "const" in type_specifiers:
            const = True

        # void wins over everything else.
        if "void" in type_specifiers:
            type_ = "void"
            size = 2
            signed = False


        return type_, size, signed, const

    def parse_argument(self):
        """Parse one function-argument declaration.

        Returns (argument_name, declaration_instance); the caller decides
        when (or whether) to add it to a scope, since names may differ
        between declaration and definition.
        """
        type_, size, signed, const = self.parse_type_specifier()

        if type_ in ["void"]:
            self.tokens.error("argument cannot be void")
        else:
            argument = self.tokens.get()
            if type_ in self.structs:
                # struct arguments reuse the struct's declaration from scope
                declaration = self.scope[type_]
            else:
                if self.tokens.peek() == "[":
                    # array argument: consume "[]"; element count unknown (None)
                    self.tokens.expect("[")
                    self.tokens.expect("]")
                    declaration = ArrayDeclaration(
                        self.allocator,
                        2,
                        type_+"[]",
                        type_,
                        size,
                        signed, 
                        None,
                        self.initialize_memory)
                else:
                    declaration = VariableDeclaration(
                        self.allocator,
                        None,
                        argument,
                        type_,
                        size,
                        signed,
                        const)
        # NOTE(review): if type_ was "void", tokens.error presumably raises;
        # otherwise `argument`/`declaration` would be unbound here - confirm.
        instance = declaration.instance()
        return (argument, instance)
        #self.add_to_scope(argument,instance)
        #return instance.reference()

    def parse_function(self):
        """Parse a function, or a global declaration when no "(" follows
        the name.

        Handles forward declarations: the first mention allocates the
        return address/value and the argument variables; later mentions
        only re-check that the signature matches.  Returns the Function
        object on first mention, the global declaration object for
        globals, or None for a re-declaration (so no code is generated
        twice).
        """
        
        type_, size, signed, const = self.parse_type_specifier()
        name = self.tokens.get()

        #check if it is a global declaration
        if self.tokens.peek() != "(":
            return self.parse_global_declaration(type_, size, signed, const, name)

        #otherwise continue parsing a function
        
        function_was_already_declared = False
        
        if name in self.scope:
            #something already has the same name
            function_was_already_declared = True
            function = self.scope[name]
            if not isinstance(function, Function):
                self.tokens.error("%s was already mentioned, but was not a function"%name)
            #check if return type matches the earlier declaration exactly
            if function.type_ != type_:
                self.tokens.error("return type of %s does not match the previously declared type; is %s, should be %s"%(name, type_, function.type_))
            if function.size != size:
                self.tokens.error("size of return type of %s does not match the previously declared size; is %s, should be %s"%(name, size, function.size))
            if function.signed != signed:
                self.tokens.error("signedness of return type of %s does not match the previously declared signedness; is %s, should be %s"%(name, signed, function.signed))
            if function.const != const:
                self.tokens.error("constness of return type of %s does not match the previously declared constness; is %s, should be %s"%(name, const, function.const))
        else:
            #first time this name is seen

        
            function = Function()
            function.allocator = self.allocator
        
            function.name = name
            function.type_ = type_
            function.size = size
            function.signed = signed
            function.const = const

            #reserve the return address now so callers can refer to it
            function.return_address = self.allocator.new(2, 
                function.name+" return address")

        if type_ != "void":

            if type_ in self.structs:
                declaration = self.scope[type_]
            else:
                if self.tokens.peek() == "[":
                    self.tokens.error(
                        "Functions cannot return arrays")
                    #if functions are changed to allow returning arrays, then check here if it matches the forward declaration, if any.
                else:
                    declaration = VariableDeclaration(
                        self.allocator,
                        None,
                        function.name+" return value",
                        type_,
                        size,
                        signed,
                        const)

            #NOTE(review): this also runs for re-declarations, replacing the
            #previously allocated return-value instance - confirm intended.
            function.return_value = declaration.instance().reference()
        
        
        self.tokens.expect("(")

        #arguments must be allocated the first time the function is mentioned.
        #because when other functions call this function, they need to use the allocated variables
        #so don't allocate argument variables any time except the first time
        #the next time you encounter this function, only check if the arguments are the correct type
        
        #Also, don't add the argument variables to the current scope unless you have an argument body, because the names can change.
        
        if not function_was_already_declared:
            function.arguments = []
            function.argument_names = [] #Gets overwritten if the names are changed
            while self.tokens.peek() != ")":
                (arg_name, instance) = self.parse_argument()
                function.arguments.append(instance.reference())
                function.argument_names.append(arg_name)
                if self.tokens.peek() == ",":
                    self.tokens.expect(",")
                else:
                    break
        else:
            #function was already declared
            #check if arg types match
            function.argument_names = [] #Gets overwritten if the names are changed
            for index, argumentVarRef in enumerate(function.arguments):
                if self.tokens.peek() != ")":
                    #next section is ugly
                    #a better way would be a function to compare 2 types for exact equality
                    argumentInst = argumentVarRef.instance
                    arg_type, arg_size, arg_signed, arg_const = self.parse_type_specifier()
                    arg_name = self.tokens.get()
                    #print "%s: type %s, size %s, signed %s, const %s"%(arg_name, arg_type, arg_size, arg_signed, arg_const)

                    function.argument_names.append(arg_name)
                    is_array = False
                    if self.tokens.peek() == "[":
                        self.tokens.expect("[")
                        self.tokens.expect("]")
                        is_array = True
                        arg_type = arg_type + "[]"
                    if arg_type != argumentInst.type_():
                        self.tokens.error("Function %s, argument %d, was previously declared to have type %s, but here, it is %s"%(name, index+1, argumentInst.type_(), arg_type))
                    if not is_array:
                        if arg_size != argumentInst.size():
                            self.tokens.error("Function %s, argument %d, was previously declared to have size %s, but here, it is %s"%(name, index+1, argumentInst.size(), arg_size))
                        if arg_const != argumentInst.const():
                            self.tokens.error("Function %s, argument %d, was previously declared to have constness %s, but here, it is %s"%(name, index+1, argumentInst.const(), arg_const))
                        if arg_signed != argumentInst.signed():
                            self.tokens.error("Function %s, argument %d, was previously declared to have signedness %s, but here, it is %s"%(name, index+1, argumentInst.signed(), arg_signed))
                    else:
                        #arrays compare element size/signedness instead
                        if arg_size != argumentInst.element_size:
                            self.tokens.error("Function %s, argument %d, was previously declared to have element size %s, but here, it is %s"%(name, index+1, argumentInst.element_size, arg_size))
                        if arg_signed != argumentInst.element_signed:
                            self.tokens.error("Function %s, argument %d, was previously declared to have element signedness %s, but here, it is %s"%(name, index+1, argumentInst.element_signed, arg_signed))
                        #array element constness?
                    if self.tokens.peek() == ",":
                        self.tokens.expect(",")
                else:
                    self.tokens.error("Function %s was previously declared to have %d arguments, but here, only %d are present"%(name, len(function.arguments),  index))
            
            if self.tokens.peek() != ")":
                self.tokens.error("Function %s was previously declared to have %d arguments, but here, more are present"%(name, len(function.arguments) ))
        
        #print function.arguments
        
        self.tokens.expect(")")
        
        
        if self.tokens.peek() == ";":
            #declaration only, no body
            self.tokens.expect(";")
        else:
            self.enter_scope()
            self.function = function
            #body attached - add the argument variables to the new scope
            for (arg_name, argumentVarRef) in zip(function.argument_names, function.arguments):
                self.add_to_scope(arg_name, argumentVarRef.instance)
            
            if function.statement is not None:
                self.tokens.error("A function body was already defined for %s, can't use another"%name)
            function.statement = self.parse_block()
            #parse_return records return_statement on the function as it parses
            if type_ != "void" and not hasattr(function, "return_statement"):
                self.tokens.error("Function must have a return statement")
            self.function = None
            self.leave_scope() #now we are done parsing the function, restore the previous scope
            
        if not function_was_already_declared:
            self.add_to_scope(function.name,function)
            
        #main thread is last function
        self.main = function
        
        if function_was_already_declared:
            return None #The function object is returned upon the function's first mention, so here, return something that will not generate any code
        return function

    def parse_break(self):
        """Parse a ``break;`` statement and attach the enclosing loop."""
        statement = Break()
        statement.loop = self.loop
        self.tokens.expect("break")
        self.tokens.expect(";")
        return statement

    def parse_continue(self):
        """Parse a ``continue;`` statement and attach the enclosing loop."""
        statement = Continue()
        statement.loop = self.loop
        self.tokens.expect("continue")
        self.tokens.expect(";")
        return statement

    def parse_return(self):
        """Parse a ``return`` statement, coercing the returned value to the
        function's declared type where int/float conversion applies."""
        node = Return()
        node.function = self.function
        node.allocator = self.allocator
        self.function.return_statement = node
        self.tokens.expect("return")

        # Only non-void functions have a return_value register allocated,
        # so only they take a value expression here.
        if hasattr(self.function, "return_value"):
            value = self.parse_expression()
            expected = self.function.type_
            actual = value.type_()
            if expected == "int" and actual == "float":
                value = FloatToInt(value)
            elif expected == "float" and actual == "int":
                value = IntToFloat(value)
            elif expected != actual:
                self.tokens.error(
                    "type mismatch in return statement expected: %s actual: %s"%(
                        expected,
                        actual))
            node.expression = value

        self.tokens.expect(";")
        return node

    def parse_assert(self):
        """Parse ``assert(expression);`` recording the source location for
        run-time diagnostics."""
        node = Assert()
        node.allocator = self.allocator
        self.tokens.expect("assert")
        self.tokens.expect("(")
        node.expression = self.parse_expression()
        self.tokens.expect(")")
        self.tokens.expect(";")
        node.line = self.tokens.lineno
        node.filename = self.tokens.filename
        return node

    def parse_report(self):
        """Parse ``report(expression);`` recording the source location for
        run-time diagnostics."""
        node = Report()
        node.allocator = self.allocator
        self.tokens.expect("report")
        self.tokens.expect("(")
        node.expression = self.parse_expression()
        self.tokens.expect(")")
        self.tokens.expect(";")
        node.line = self.tokens.lineno
        node.filename = self.tokens.filename
        return node

    def parse_wait_clocks(self):
        """Parse ``wait_clocks(expression);``.

        Records the source line and filename so diagnostics can point at
        the statement, matching parse_assert/parse_report.
        """
        wait_clocks = WaitClocks()
        wait_clocks.allocator = self.allocator
        self.tokens.expect("wait_clocks")
        self.tokens.expect("(")
        wait_clocks.expression = self.parse_expression()
        self.tokens.expect(")")
        self.tokens.expect(";")
        wait_clocks.line = self.tokens.lineno
        # Consistency fix: the other diagnostic statements (assert/report)
        # also record the filename; this one previously did not.
        wait_clocks.filename = self.tokens.filename
        return wait_clocks

    def parse_statement(self):
        """Dispatch to the parser for the statement at the current token."""
        token = self.tokens.peek()
        if token in numeric_types + self.structs + storage_specifiers:
            return self.parse_compound_declaration()
        handlers = {
            "struct": self.parse_struct_declaration,
            "if": self.parse_if,
            "while": self.parse_while,
            "for": self.parse_for,
            "return": self.parse_return,
            "break": self.parse_break,
            "continue": self.parse_continue,
            "{": self.parse_block,
            "assert": self.parse_assert,
            "report": self.parse_report,
            "switch": self.parse_switch,
            "case": self.parse_case,
            "default": self.parse_default,
            "wait_clocks": self.parse_wait_clocks,
            "goto": self.parse_goto,
        }
        if token in handlers:
            return handlers[token]()
        if self.tokens.peek(1) == ":":
            return self.parse_labeled_statement()
        # Anything else is an expression statement; its value is discarded.
        expression = self.parse_discard()
        self.tokens.expect(";")
        return expression

    def parse_discard(self):
        """Parse an expression whose result is discarded (expression statement)."""
        expression = self.parse_expression()
        return DiscardExpression(expression, self.allocator)

    def parse_labeled_statement(self):
        """Parse ``name: statement`` and register the label with the current
        function; duplicate labels are an error."""
        label_name = self.tokens.get()
        self.tokens.expect(":")
        label = Label(label_name, self.parse_statement())
        if label_name in self.function.labels_in_scope:
            self.tokens.error(
                    "label %s was already declared in this function"%label_name)
        self.function.labels_in_scope[label_name] = label
        return label

    def parse_goto(self):
        """Parse ``goto name;``; the target label is resolved later."""
        self.tokens.expect("goto")
        target = self.tokens.get()
        self.tokens.expect(";")
        return Goto(target, self.function, self.tokens.filename, self.tokens.lineno)

    def parse_assignment(self):
        """Parse an (optionally compound) assignment, or fall through to a
        plain ternary expression when no assignment operator follows."""
        assignment_operators = [
            "=", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>="
        ]
        lvalue = self.parse_ternary_expression()
        if self.tokens.peek() not in assignment_operators:
            return lvalue

        if lvalue.const():

            self.tokens.error(
                "left hand operand of assignment is not modifiable")

        operator = self.tokens.get()
        expression = self.parse_ternary_expression()
        if operator != "=":
            # Compound assignment: a op= b is treated as a = a op b.
            left, expression = self.coerce_types(lvalue, expression)
            expression = Binary(operator[:-1], left, expression)

        # Coerce the right hand side to the type of the lvalue.
        if expression.type_() != lvalue.type_():
            if expression.type_() == "int" and lvalue.type_() == "float":
                expression = IntToFloat(expression)
            elif expression.type_() == "float" and lvalue.type_() == "int":
                expression = FloatToInt(expression)
            else:
                self.tokens.error(
                    "type mismatch in assignment expected: %s actual: %s"%(
                        lvalue.type_(),
                        expression.type_()))

        return Assignment(lvalue, expression, self.allocator)

    def parse_if(self):
        """Parse an if/else statement; the condition must be integer-like."""
        node = If()
        node.allocator = self.allocator
        self.tokens.expect("if")
        self.tokens.expect("(")
        node.expression = self.parse_expression()
        if node.expression.type_() not in ["unsigned", "int", "short", "long", "char"]:
            self.tokens.error(
                "if statement conditional must be an integer like expression")
        self.tokens.expect(")")
        node.true_statement = self.parse_statement()
        node.false_statement = None
        if self.tokens.peek() == "else":
            self.tokens.expect("else")
            node.false_statement = self.parse_statement()
        return node

    def parse_switch(self):
        """Parse a switch statement; case/default register via self.loop."""
        switch = Switch()
        switch.cases = {}
        self.tokens.expect("switch")
        self.tokens.expect("(")
        condition = self.parse_expression()
        if condition.type_() not in ["unsigned", "int", "short", "long", "char"]:
            self.tokens.error(
                "switch statement expression must be an integer like expression")
        self.tokens.expect(")")
        # The switch acts as the innermost "loop" while its body is parsed
        # so that break/case/default statements bind to it.
        outer_loop, self.loop = self.loop, switch
        body = self.parse_statement()
        self.loop = outer_loop
        switch.expression = condition
        switch.allocator = self.allocator
        switch.statement = body
        return switch

    def parse_case(self):
        """Parse ``case <constant>:`` and register it with the enclosing
        switch.

        The case value must be a constant integer expression; registration
        uses self.loop, which parse_switch points at the Switch node while
        its body is parsed.
        """
        self.tokens.expect("case")
        expression = self.parse_expression()
        if expression.type_() not in ["int"]:
            self.tokens.error(
                "case expression must be an integer like expression")
        self.tokens.expect(":")
        try:
            expression = expression.value()
            case = Case()
            self.loop.cases[expression] = case
        except NotConstant:
            self.tokens.error("case expression must be constant")
        except AttributeError:
            # self.loop has no "cases" attribute => we are not in a switch.
            # (message fixed: was "may only be use ... switch statment")
            self.tokens.error(
                "case statements may only be used inside a switch statement")
        return case

    def parse_default(self):
        """Parse ``default:`` and register it with the enclosing switch.

        Only one default is allowed per switch; outside a switch it is an
        error.
        """
        self.tokens.expect("default")
        self.tokens.expect(":")
        default = Default()
        if not hasattr(self.loop, "cases"):
            # message fixed: was "switch statment"
            self.tokens.error(
                "default statements may only be used inside a switch statement")
        if hasattr(self.loop, "default"):
            self.tokens.error(
                "A switch statement may only have one default statement")
        self.loop.default=default
        return default

    def parse_while(self):
        """Parse a while loop.

        Desugared to Loop{ If(cond){ body } else { break } }.
        """
        loop = Loop()
        self.tokens.expect("while")
        self.tokens.expect("(")
        condition = self.parse_expression()
        self.tokens.expect(")")
        outer_loop, self.loop = self.loop, loop
        body = self.parse_statement()
        self.loop = outer_loop

        if condition.type_() not in ["int"]:
            self.tokens.error(
                "while statement conditional must be an integer like expression")
        break_ = Break()
        break_.loop = loop
        conditional = If()
        conditional.allocator = self.allocator
        conditional.expression = condition
        conditional.true_statement = body
        conditional.false_statement = break_
        loop.statement = conditional

        return loop

    def parse_for(self):
        """Parse ``for (init; cond; step) body``; each of the three header
        clauses may be omitted."""
        node = For()
        node.allocator = self.allocator
        self.tokens.expect("for")
        self.tokens.expect("(")
        if self.tokens.peek() != ";":
            node.statement1 = self.parse_discard()
        self.tokens.expect(";")
        if self.tokens.peek() != ";":
            node.expression = self.parse_expression()
            integer_like = ["unsigned", "int", "short", "long", "char"]
            if node.expression.type_() not in integer_like:
                self.tokens.error(
            "For statement conditional must be an integer like expression")
        self.tokens.expect(";")
        if self.tokens.peek() != ")":
            node.statement2 = self.parse_discard()
        self.tokens.expect(")")
        # Body binds break/continue to this for loop.
        outer_loop, self.loop = self.loop, node
        node.statement3 = self.parse_statement()
        self.loop = outer_loop
        return node

    def parse_block(self):
        """Parse ``{ ... }`` introducing a new variable scope."""
        block = Block()
        self.enter_scope()
        self.tokens.expect("{")
        statements = []
        while self.tokens.peek() != "}":
            statements.append(self.parse_statement())
        block.statements = statements
        self.tokens.expect("}")
        self.leave_scope()  # restore the scope active before the block
        return block

    def parse_struct_body(self):
        """Parse the member declarations between the braces of a struct,
        returning a dict mapping member name to its declaration."""
        self.tokens.expect("{")
        members = {}
        while self.tokens.peek() != "}":
            type_, size, signed, const = self.parse_type_specifier()
            member_name = self.tokens.get()
            members[member_name] = self.parse_declaration(
                type_,
                size,
                signed,
                const,
                member_name)
            self.tokens.expect(";")
        self.tokens.expect("}")
        return members

    def parse_typedef_struct(self):
        """Parse ``typedef struct {...} name;`` registering a new struct type."""
        self.tokens.expect("typedef")
        self.tokens.expect("struct")
        declaration = StructDeclaration(self.parse_struct_body())
        type_name = self.tokens.get()
        self.tokens.expect(";")
        self.add_to_scope(type_name, declaration)
        # Remember the typedef name so it is recognised as a type specifier.
        self.structs.append(type_name)

    def parse_define_struct(self):
        """Parse ``struct name {...};`` registering the declaration by name."""
        self.tokens.expect("struct")
        struct_name = self.tokens.get()
        declaration = StructDeclaration(self.parse_struct_body())
        self.tokens.expect(";")
        self.add_to_scope(struct_name, declaration)

    def parse_struct_declaration(self):
        """Parse ``struct type name;`` creating and scoping a new instance
        of the named struct type."""
        self.tokens.expect("struct")
        type_name = self.tokens.get()
        variable_name = self.tokens.get()
        self.tokens.expect(";")
        instance = self.scope[type_name].instance()
        self.add_to_scope(variable_name, instance)
        return instance

    def parse_global_declaration(self, type_, size, signed, const, name):
        """Parse one or more comma separated global declarations that share
        a single type specifier, e.g. ``int a, b;``."""
        instances = []
        while True:
            declaration = self.parse_declaration(
                type_, size, signed, const, name)
            instance = declaration.instance()
            self.add_to_scope(name, instance)
            instances.append(instance)
            if self.tokens.peek() != ",":
                break
            self.tokens.expect(",")
            name = self.tokens.get()
        self.tokens.expect(";")
        return CompoundDeclaration(instances)

    def parse_compound_declaration(self):
        """Parse a local declaration list, e.g. ``int a = 1, b;``."""
        type_, size, signed, const = self.parse_type_specifier()
        instances = []
        while True:
            name = self.tokens.get()
            declaration = self.parse_declaration(
                type_, size, signed, const, name)
            instance = declaration.instance()
            self.add_to_scope(name, instance)
            instances.append(instance)
            if self.tokens.peek() != ",":
                break
            self.tokens.expect(",")
        self.tokens.expect(";")
        return CompoundDeclaration(instances)

    def parse_declaration(self, type_, size, signed, const, name):
        """Parse the remainder of a single declaration, after the type
        specifier and name have been consumed.

        Returns a declaration object providing ``instance()``:

        - a known struct type returns its registered StructDeclaration
        - ``type[...]`` forms return an ArrayDeclaration
        - plain ``int``/``float`` return a VariableDeclaration

        NOTE(review): if *type_* is neither a known struct nor int/float,
        ``declaration`` is never bound and the final return raises — callers
        appear to validate the type first; confirm before relying on this.
        """
        #struct declaration
        if type_ in self.structs:
            declaration = self.scope[type_]
        elif type_ in ["int", "float"]:
            #array declaration
            if self.tokens.peek() == "[":
                array_size = None
                self.tokens.expect("[")
                if self.tokens.peek() != "]":
                    size_expression = self.parse_ternary_expression()
                    if size_expression.type_() != "int":
                        self.tokens.error("Array size must be an integer like expression")
                    try:
                        array_size = size_expression.value()
                    except NotConstant:
                        self.tokens.error("Array size must be constant")

                self.tokens.expect("]")
                initializer = None
                if self.tokens.peek() == "=":
                    self.tokens.expect("=")
                    #string initializer: unescape and NUL terminate
                    initializer = self.tokens.get()
                    initializer = [ord(i) for i in initializer.strip('"').decode("string_escape")] + [0]
                    array_size = len(initializer)
                if array_size is None:

                    self.tokens.error(
                        "array size must be specified if not initialized")

                array_type=type_+"[]"
                declaration = ArrayDeclaration(
                    self.allocator,
                    array_size,
                    array_type,
                    type_,
                    size,
                    signed,
                    initializer,
                    self.initialize_memory)

            #simple variable declaration
            else:
                if self.tokens.peek() == "=":
                    self.tokens.expect("=")
                    initializer = self.parse_ternary_expression()
                else:
                    initializer = Constant(0, type_, size, signed)

                if type_ != initializer.type_():

                    if type_ == "int" and initializer.type_() == "float":
                        initializer = FloatToInt(initializer)
                    elif type_ == "float" and initializer.type_() == "int":
                        initializer = IntToFloat(initializer)
                    else:
                        #bug fix: this path raised NameError ("intitializer")
                        self.tokens.error(
                            "type mismatch in initializer expected: %s actual: %s"%(
                                type_,
                                initializer.type_()))
                declaration = VariableDeclaration(
                    self.allocator,
                    initializer,
                    name,
                    type_,
                    size,
                    signed,
                    const
                )

        return declaration

    def parse_expression(self):
        """Top level expression entry point (lowest precedence: assignment)."""
        return self.parse_assignment()

    def parse_ternary_expression(self):
        """Parse ``cond ? a : b`` chains, constant folding each operand.

        The ternary is lowered to OR(AND(cond, a), b).
        """
        expression = constant_fold(self.parse_or_expression())
        while self.tokens.peek() == "?":
            self.tokens.expect("?")
            if_true = constant_fold(self.parse_or_expression())
            self.tokens.expect(":")
            if_false = constant_fold(self.parse_or_expression())
            expression = OR(AND(expression, if_true), if_false)
        return expression

    def parse_or_expression(self):
        """Parse left associative ``a || b`` chains."""
        expression = self.parse_and_expression()
        while self.tokens.peek() == "||":
            self.tokens.expect("||")
            expression = OR(expression, self.parse_and_expression())
        return expression

    def parse_and_expression(self):
        """Parse left associative ``a && b`` chains; operands are
        bitwise-or expressions."""
        expression = self.parse_binary_expression(["|"])
        while self.tokens.peek() == "&&":
            self.tokens.expect("&&")
            expression = AND(expression, self.parse_binary_expression(["|"]))
        return expression

    def substitute_function(self, binary_expression):

        """Replace operations that have no hardware implementation with a
        call to the equivalent built-in library function.

        The operation is looked up by a signature string of the form
        ``signed,left_type,right_type,size,operator``; expressions whose
        signature is not in the table are returned unchanged.
        """

        functions = {
           "False,int,int,4,/" : "long_unsigned_divide_xxxx",
           "True,int,int,4,/" : "long_divide_xxxx",
           "False,int,int,2,/" : "unsigned_divide_xxxx",
           "True,int,int,2,/" : "divide_xxxx",
           "False,int,int,4,%" : "long_unsigned_modulo_xxxx",
           "True,int,int,4,%" : "long_modulo_xxxx",
           "False,int,int,2,%" : "unsigned_modulo_xxxx",
           "True,int,int,2,%" : "modulo_xxxx",
           "True,float,float,4,==" : "float_equal_xxxx",
           "True,float,float,4,!=" : "float_ne_xxxx",
           "True,float,float,4,<" : "float_lt_xxxx",
           "True,float,float,4,>" : "float_gt_xxxx",
           "True,float,float,4,<=" : "float_le_xxxx",
           "True,float,float,4,>=" : "float_ge_xxxx",
        }

        signature = ",".join([
            str(binary_expression.signed()),
            binary_expression.left.type_(),
            binary_expression.right.type_(),
            str(binary_expression.size()),
            binary_expression.operator])

        if signature not in functions:
            return binary_expression

        # Substitute a call to the library routine for the expression.
        function = self.scope[functions[signature]]
        call = FunctionCall(function)
        call.arguments = [binary_expression.left, binary_expression.right]
        return call

    def coerce_types(self, left, right):

        """Insert int-to-float conversions so both operands share a type.

        A mixed int/float pair promotes the int side to float; any other
        mismatch is reported as an error.
        """

        left_type = left.type_()
        right_type = right.type_()
        if left_type != right_type:
            if left_type == "float" and right_type == "int":
                return left, IntToFloat(right)
            if left_type == "int" and right_type == "float":
                return IntToFloat(left), right
            self.tokens.error("Incompatible types : %s %s"%(
                left_type,
                right_type))

        return left, right

    def parse_binary_expression(self, operators):
        """Precedence climbing parser for binary operators.

        *operators* is a group of equal precedence operators to parse at
        this level.  Operands are parsed at the next tighter precedence
        level from the table, or as unary expressions at the bottom.
        The two former near-identical loops are merged: only the operand
        parser differs between the branches.
        """
        operator_precedence = {
                "|": ["^"],
                "^": ["&"],
                "&": ["==", "!="],
                "==": ["<", ">", "<=", ">="],
                "<": ["<<", ">>"],
                "<<": ["+", "-"],
                "+": ["*", "/", "%"],
        }
        if operators[0] in operator_precedence:
            next_operators = operator_precedence[operators[0]]
            def parse_operand():
                return self.parse_binary_expression(next_operators)
        else:
            # Tightest precedence level: operands are unary expressions.
            parse_operand = self.parse_unary_expression
        left = parse_operand()
        while self.tokens.peek() in operators:
            operator = self.tokens.get()
            right = parse_operand()
            left, right = self.coerce_types(left, right)
            left = self.substitute_function(Binary(operator, left, right))
        return left

    def parse_unary_expression(self):
        """Parse prefix operators ``!``, unary ``-``, ``~`` and ``sizeof``,
        falling through to a postfix expression otherwise."""
        token = self.tokens.peek()

        if token == "!":
            self.tokens.get()
            expression = self.parse_postfix_expression()
            if expression.type_() not in ["int"]:
                self.tokens.error(
                    "! is only valid for integer like expressions")
            # Logical not is lowered to a comparison with zero.
            return Binary("==", expression, Constant(0))

        if token == "-":
            self.tokens.get()
            expression = self.parse_postfix_expression()
            # Negation is lowered to subtraction from a zero of the same
            # type, size and signedness.
            zero = Constant(
                0,
                expression.type_(),
                expression.size(),
                expression.signed())
            return Binary("-", zero, expression)

        if token == "~":
            self.tokens.get()
            expression = self.parse_postfix_expression()
            if expression.type_() not in ["int"]:
                self.tokens.error(
                    "~ is only valid for integer like expressions")
            return Unary("~", expression)

        if token == "sizeof":
            self.tokens.get()
            return SizeOf(self.parse_unary_expression())

        return self.parse_postfix_expression()

    def parse_postfix_expression(self):
        """Parse postfix ``++``/``--`` applied to a primary expression."""
        expression = self.parse_paren_expression()
        while self.tokens.peek() in ["++", "--"]:
            operator = self.tokens.get()
            # Strip the trailing character to get the underlying "+" or "-".
            expression = PostIncrement(operator[:-1], expression, self.allocator)
        return expression

    def parse_paren_expression(self):
        """Parse a parenthesised expression or a primary term."""
        if self.tokens.peek() != "(":
            return self.parse_number_or_variable()
        self.tokens.expect("(")
        expression = self.parse_expression()
        self.tokens.expect(")")
        return expression

    def parse_number_or_variable(self):
        """Parse an identifier (variable reference or call) or a literal.

        Tokens starting with a letter are identifiers; anything else is
        treated as a number/character/string literal.
        """
        if not self.tokens.peek()[0].isalpha():
            return self.parse_number()
        name = self.tokens.get()
        if self.tokens.peek() == "(":
            return self.parse_function_call(name)
        return self.parse_variable(name)

    def parse_file_read(self):
        """Parse the argument list of the ``file_read`` built-in."""
        self.tokens.expect("(")
        file_name = self.tokens.get()
        # Strip the quotes and process escape sequences in the literal.
        file_name = file_name.strip('"').decode("string_escape")
        self.tokens.expect(")")
        return FileRead(file_name)

    def parse_file_write(self):
        """Parse the argument list of the ``file_write`` built-in."""
        self.tokens.expect("(")
        expression = self.parse_expression()
        self.tokens.expect(",")
        file_name = self.tokens.get()
        # Strip the quotes and process escape sequences in the literal.
        file_name = file_name.strip('"').decode("string_escape")
        self.tokens.expect(")")
        return FileWrite(file_name, expression)

    def parse_input(self, name):
        """Parse an ``input_<name>([type])`` built-in call.

        The optional string argument names the element type; it defaults
        to "int" and must be one of the numeric types.
        """
        input_name = name.replace("input_", "")
        self.tokens.expect("(")
        type_ = "int"
        if self.tokens.peek() != ")":
            type_ = self.tokens.get().strip('"').decode("string_escape")
        if type_ not in numeric_types:
            self.tokens.error("%s is not a numeric type"%type_)
        self.tokens.expect(")")
        return Input(input_name, type_)

    def parse_ready(self, name):
        """Parse a ``ready_<name>()`` built-in call."""
        self.tokens.expect("(")
        self.tokens.expect(")")
        return Ready(name.replace("ready_", ""))

    def parse_output(self, name):
        """Parse an ``output_<name>(expression)`` built-in call."""
        output_name = name.replace("output_", "")
        self.tokens.expect("(")
        expression = self.parse_expression()
        self.tokens.expect(")")
        return Output(output_name, expression)

    def parse_function_call(self, name):
        """Parse a call to *name*.

        Names with the special prefixes input_/ready_/output_ and the
        built-ins file_read/file_write dispatch to their own parsers;
        anything else must be a function already in scope.  int arguments
        are converted to float (and vice versa) where the declared
        parameter type requires it; other mismatches are errors.
        """
        if name.startswith("input_"):
            return self.parse_input(name)
        if name.startswith("ready_"):
            return self.parse_ready(name)
        if name.startswith("output_"):
            return self.parse_output(name)
        if name == "file_read":
            return self.parse_file_read()
        if name == "file_write":
            return self.parse_file_write()
        if name not in self.scope:
            self.tokens.error("Unknown function: %s"%name)
        function = self.scope[name]
        function_call = FunctionCall(function)
        function_call.arguments = []
        self.tokens.expect("(")
        while self.tokens.peek() != ")":
            function_call.arguments.append(self.parse_expression())
            if self.tokens.peek() == ",":
                self.tokens.expect(",")
            else:
                break
        self.tokens.expect(")")

        required = function_call.function.arguments
        actual = function_call.arguments
        if len(required) != len(actual):
            self.tokens.error("Function %s takes %s arguments, %s given."%(
                name,
                len(required),
                len(actual)))
        corrected_arguments = []
        for required_arg, actual_arg in zip(required, actual):
            if not compatible(required_arg, actual_arg):
                if actual_arg.type_() == "int" and required_arg.type_() == "float":
                    actual_arg = IntToFloat(actual_arg)
                elif actual_arg.type_() == "float" and required_arg.type_() == "int":
                    actual_arg = FloatToInt(actual_arg)
                else:
                    # bug fix: message previously said "in assignment",
                    # copy-pasted from parse_assignment.
                    self.tokens.error(
                        "type mismatch in function call expected: %s, actual: %s"%(
                            required_arg.type_(),
                            actual_arg.type_()))
            corrected_arguments.append(actual_arg)
        function_call.arguments = corrected_arguments

        return function_call

    def parse_number(self):
        """Parse a literal token: character, string, float or integer.

        Character, integer and float literals produce a Constant; a string
        literal is lowered to a NUL terminated constant int array and
        produces a ConstArray.  Fixes: removed the unused locals
        ``initialize_memory`` and ``byte_value``.
        """
        token = self.tokens.get()
        type_ = "int"
        size = 2
        signed = True
        if token.startswith("'"):
            # character literal e.g. 'a' (token text comes from the source
            # being compiled, so eval here operates on compiler input)
            try:
                token = eval(token)
                value = ord(token)
            except SyntaxError:
                self.tokens.error("%s is not a character literal"%token)
        elif token.startswith('"'):
            # string literal: stored as a zero terminated array of ints
            try:
                initializer = [ord(i) for i in token.strip('"').decode("string_escape")] + [0]
                size = len(initializer)
                declaration = ArrayDeclaration(
                    self.allocator,
                    size,
                    "int[]",
                    "int",
                    2,
                    False,
                    initializer,
                    self.initialize_memory)
                return ConstArray(declaration.instance())
            except SyntaxError:
                self.tokens.error("%s is not a character literal"%token)
        elif "." in token:
            # float literal: 4 bytes, signed; F/L suffixes are discarded
            try:
                type_ = "float"
                signed = True
                size = 4
                token = token.upper().replace("F", "")
                token = token.upper().replace("L", "")
                value = float(eval(token))

                # probe that the value fits in a 32 bit float
                try:
                    struct.pack(">f", value)
                except OverflowError:
                    self.tokens.error("value too large")

            except SyntaxError:
                self.tokens.error("%s is not a floating point literal"%token)
        else:
            # integer literal: U suffix -> unsigned, L suffix -> 4 bytes
            try:
                if "U" in token.upper():
                    signed = False
                if "L" in token.upper():
                    size = 4
                token = token.upper().replace("U", "")
                value = int(eval(token))

                # range check against the deduced size/signedness
                if signed:
                    if value > 2**((size * 8)-1) - 1:
                        self.tokens.error("value too large")
                    if value < -(2**((size * 8)-1)):
                        self.tokens.error("value too small")
                else:
                    if value > 2**(size * 8) - 1:
                        self.tokens.error("value too large")
                    if value < 0:
                        self.tokens.error("value too small")

            except SyntaxError:
                self.tokens.error("%s is not an integer literal"%token)

        return Constant(value, type_, size, signed)

    def parse_variable(self, name):
        """Look up *name* in the current scope and parse any trailing
        array subscripts or struct member accesses."""
        if name not in self.scope:
            self.tokens.error("Unknown variable: %s"%name)
        return self.parse_variable_array_struct(self.scope[name])

    def parse_variable_array_struct(self, instance):
        """Turn a scope entry into an expression node.

        Numeric instances become Variable; array instances become
        ArrayIndex (when subscripted) or Array; struct instances become
        Struct, recursing through ``.member`` accesses.
        """
        type_ = instance.type_()

        if type_ in numeric_types:
            if not hasattr(instance, "reference"):
                self.tokens.error(
                    "Not an expression")
            return Variable(instance)

        if type_.endswith("[]"):
            if self.tokens.peek() != "[":
                return Array(instance)
            self.tokens.expect("[")
            index_expression = self.parse_expression()
            self.tokens.expect("]")
            if index_expression.type_() not in ["int"]:
                self.tokens.error(
                    "Array indices must be an integer like expression")
            return ArrayIndex(instance, index_expression)

        if type_.startswith("struct"):
            if self.tokens.peek() != ".":
                return Struct(instance)
            self.tokens.expect(".")
            member = self.tokens.get()
            return self.parse_variable_array_struct(instance.members[member])
Beispiel #39
0
class Parser:

    """Turn the C input file into a tree of expressions and statements."""

    def __init__(self, input_file, reuse):
        """Set up the lexer, allocator and empty parser state."""
        # lexer over the input file, then the register allocator
        self.tokens = Tokens(input_file)
        self.allocator = Allocator(reuse)
        # parser state: visible names, known struct types,
        # and the function/loop currently being parsed
        self.scope = {}
        self.structs = []
        self.function = None
        self.loop = None

    def parse_process(self):
        """Parse the whole translation unit into a Process object."""
        process = Process()
        process.allocator = self.allocator
        process.inputs = []
        process.outputs = []
        process.functions = []
        while not self.tokens.end():
            token = self.tokens.peek()
            if token == "struct":
                self.parse_define_struct()
            elif token == "typedef":
                self.parse_typedef_struct()
            else:
                process.functions.append(self.parse_function())
        # the last function parsed becomes the main thread
        process.main = self.main
        return process

    def parse_function(self):
        """Parse a function definition (or delegate a global declaration).

        If the name is not followed by "(", the construct is really a
        global declaration.  The last function parsed becomes the main
        thread (see parse_process).
        """
        function = Function()
        function.allocator = self.allocator
        stored_scope = self.scope
        type_ = self.tokens.get()
        name = self.tokens.get()

        #check if it is a global declaration
        if self.tokens.peek() != "(":
            if type_ not in ["int", "short", "long", "char"] + self.structs:
                self.tokens.error("unknown type")
            return self.parse_global_declaration(type_, name)

        #otherwise continue parsing a function
        self.tokens.expect("(")
        function.name = name
        function.type_ = type_
        function.return_address = self.allocator.new(function.name+" return address")
        if type_ not in ["int", "short", "long", "char", "void"]:
            self.tokens.error("unknown type")
        if type_ != "void":
            function.return_value = self.allocator.new(function.name+" return value")
        function.arguments = []
        while self.tokens.peek() != ")":
            #NOTE: type_ is reused for argument types below, so the return
            #type is only reliable in function.type_ from here on
            type_ = self.tokens.get()
            if type_ not in ["int", "short", "long", "char"]:
                self.tokens.error("unknown type")
            argument = self.tokens.get()
            if self.tokens.peek() == "[":
                self.tokens.expect("[")
                self.tokens.expect("]")
                type_+="[]"
            function.arguments.append(Argument(argument, type_, self))
            if self.tokens.peek() == ",":
                self.tokens.expect(",")
            else:
                break
        self.tokens.expect(")")
        self.function = function
        function.statement = self.parse_statement()
        #BUG FIX: check the stored return type, not the local type_ which now
        #holds the last argument's type (e.g. "void f(int x)" was wrongly
        #required to have a return statement).
        if function.type_ != "void" and not hasattr(function, "return_statement"):
            self.tokens.error("Function must have a return statement")
        self.function = None
        self.scope = stored_scope
        self.scope[function.name] = function
        #main thread is last function
        self.main = function
        return function

    def parse_break(self):
        """Parse "break;" and bind it to the innermost loop/switch."""
        self.tokens.expect("break")
        self.tokens.expect(";")
        node = Break()
        node.loop = self.loop
        return node

    def parse_continue(self):
        """Parse "continue;" and bind it to the innermost loop."""
        self.tokens.expect("continue")
        self.tokens.expect(";")
        node = Continue()
        node.loop = self.loop
        return node

    def parse_return(self):
        """Parse "return [expr];" inside the current function."""
        node = Return()
        node.function = self.function
        #record the return on the function so parse_function can verify
        #that non-void functions return something
        self.function.return_statement = node
        self.tokens.expect("return")
        #only non-void functions carry a return value
        if hasattr(self.function, "return_value"):
            node.expression = self.parse_expression()
        self.tokens.expect(";")
        return node

    def parse_assert(self):
        """Parse "assert(<expression>);"."""
        self.tokens.expect("assert")
        self.tokens.expect("(")
        node = Assert()
        node.allocator = self.allocator
        node.expression = self.parse_expression()
        self.tokens.expect(")")
        self.tokens.expect(";")
        #record the source position for run-time diagnostics
        node.line = self.tokens.lineno
        node.filename = self.tokens.filename
        return node

    def parse_report(self):
        """Parse "report(<expression>);"."""
        self.tokens.expect("report")
        self.tokens.expect("(")
        node = Report()
        node.allocator = self.allocator
        node.expression = self.parse_expression()
        self.tokens.expect(")")
        self.tokens.expect(";")
        #record the source position for run-time diagnostics
        node.line = self.tokens.lineno
        node.filename = self.tokens.filename
        return node

    def parse_wait_clocks(self):
        """Parse "wait_clocks(<expression>);"."""
        self.tokens.expect("wait_clocks")
        self.tokens.expect("(")
        node = WaitClocks()
        node.allocator = self.allocator
        node.expression = self.parse_expression()
        self.tokens.expect(")")
        self.tokens.expect(";")
        node.line = self.tokens.lineno
        return node

    def parse_statement(self):
        """Dispatch on the next token to the matching statement parser."""
        token = self.tokens.peek()
        #declarations start with a type name
        if token in ["int", "short", "long", "char"] + self.structs:
            return self.parse_compound_declaration()
        handlers = {
            "struct": self.parse_struct_declaration,
            "if": self.parse_if,
            "while": self.parse_while,
            "for": self.parse_for,
            "return": self.parse_return,
            "break": self.parse_break,
            "continue": self.parse_continue,
            "{": self.parse_block,
            "assert": self.parse_assert,
            "report": self.parse_report,
            "switch": self.parse_switch,
            "case": self.parse_case,
            "default": self.parse_default,
            "wait_clocks": self.parse_wait_clocks,
        }
        if token in handlers:
            return handlers[token]()
        #anything else is an expression evaluated for its side effects
        expression = self.parse_discard()
        self.tokens.expect(";")
        return expression

    def parse_discard(self):
        """Parse an expression whose value is evaluated then discarded."""
        expression = self.parse_expression()
        return DiscardExpression(expression, self.allocator)

    def parse_assignment(self):
        """Parse an (optionally augmented) assignment expression.

        The left hand side is parsed first; if an assignment operator
        follows, augmented forms (x += y) and increments (x++/x--) are
        desugared into a Binary expression before the Assignment is built.
        """
        assignment_operators = [
            "=", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "<<=", ">>=",
            "++", "--"
        ]
        lvalue = self.parse_ternary_expression()
        if self.tokens.peek() in assignment_operators:
            #only named objects (with a declaration) can be assigned to
            if not hasattr(lvalue, "declaration"):
                self.tokens.error(
                    "left hand operand of assignment is not modifiable"
                )
            operator = self.tokens.get()
            if operator == "=":
                expression = self.parse_ternary_expression()
            elif operator in ["++", "--"]:
                #x++ / x--  ==>  x + 1 / x - 1
                expression = Binary(
                    operator[:-1],
                    lvalue,
                    Constant(1),
                    self.allocator
                )
            else:
                #x op= y  ==>  x op y
                expression = Binary(
                    operator[:-1],
                    lvalue,
                    self.parse_ternary_expression(),
                    self.allocator
                )
            if lvalue.type_ != expression.type_:
                self.tokens.error(
                    "type mismatch in assignment"
                )
            return Assignment(lvalue, expression, self.allocator)
        else:
            return lvalue

    def parse_if(self):
        """Parse an if statement with an optional else clause."""
        self.tokens.expect("if")
        self.tokens.expect("(")
        if_ = If()
        if_.allocator = self.allocator
        if_.expression = self.parse_expression()
        if if_.expression.type_ not in ["int", "short", "long", "char"]:
            self.tokens.error(
                "if statement conditional must be an integer like expression"
            )
        self.tokens.expect(")")
        if_.true_statement = self.parse_statement()
        if_.false_statement = None
        if self.tokens.peek() == "else":
            self.tokens.expect("else")
            if_.false_statement = self.parse_statement()
        return if_

    def parse_switch(self):
        """Parse a switch statement; case/default register themselves."""
        switch = Switch()
        switch.cases = {}
        self.tokens.expect("switch")
        self.tokens.expect("(")
        expression = self.parse_expression()
        if expression.type_ not in ["int", "short", "long", "char"]:
            self.tokens.error(
                "switch statement expression must be an integer like expression"
            )
        self.tokens.expect(")")
        #the body is parsed with the switch as the enclosing "loop" so that
        #break/case/default statements can find it via self.loop
        stored_loop = self.loop
        self.loop = switch
        switch.statement = self.parse_statement()
        self.loop = stored_loop
        switch.expression = expression
        switch.allocator = self.allocator
        return switch

    def parse_case(self):
        """Parse "case <constant>:" inside the enclosing switch.

        The constant value is registered in the switch's case table.
        """
        self.tokens.expect("case")
        expression = self.parse_expression()
        if expression.type_ not in ["int", "short", "long", "char"]:
            self.tokens.error(
                "case expression must be an integer like expression"
            )
        self.tokens.expect(":")
        try:
            expression = value(expression)
            case = Case()
            self.loop.cases[expression] = case
        except NotConstant:
            self.tokens.error("case expression must be constant")
        except AttributeError:
            #self.loop has no "cases" table: we are not inside a switch
            #BUG FIX: corrected message typos ("be use", "statment")
            self.tokens.error(
                "case statements may only be used inside a switch statement"
            )
        return case

    def parse_default(self):
        """Parse "default:" inside the enclosing switch."""
        self.tokens.expect("default")
        self.tokens.expect(":")
        default = Default()
        if not hasattr(self.loop, "cases"):
            #BUG FIX: corrected message typo ("statment")
            self.tokens.error(
                "default statements may only be used inside a switch statement"
            )
        if hasattr(self.loop, "default"):
            self.tokens.error(
                "A switch statement may only have one default statement"
            )
        self.loop.default = default
        return default

    def parse_while(self):
        """Parse a while loop.

        The loop is desugared into Loop{ If(cond) {body} else {break} }.
        """
        loop = Loop()
        self.tokens.expect("while")
        self.tokens.expect("(")
        expression = self.parse_expression()
        self.tokens.expect(")")
        stored_loop = self.loop
        self.loop = loop
        statement = self.parse_statement()
        self.loop = stored_loop

        if_ = If()
        loop.statement = if_
        break_ = Break()
        break_.loop = loop
        if_.allocator = self.allocator
        if expression.type_ not in ["int", "short", "long", "char"]:
            #BUG FIX: this message previously said "if statement"
            self.tokens.error(
                "while statement conditional must be an integer like expression"
            )
        if_.expression = expression
        if_.false_statement = break_
        if_.true_statement = statement

        return loop

    def parse_for(self):
        """Parse for(init; cond; step) body.

        Absent clauses leave the corresponding attribute unset.
        """
        for_ = For()
        for_.allocator = self.allocator
        self.tokens.expect("for")
        self.tokens.expect("(")
        if self.tokens.peek() != ";":
            for_.statement1 = self.parse_discard()
        self.tokens.expect(";")
        if self.tokens.peek() != ";":
            condition = self.parse_expression()
            for_.expression = condition
            if condition.type_ not in ["int", "short", "long", "char"]:
                self.tokens.error(
                    "for statement conditional must be an integer like expression"
                )
        self.tokens.expect(";")
        if self.tokens.peek() != ")":
            for_.statement2 = self.parse_discard()
        self.tokens.expect(")")
        #the body is parsed with this loop as the break/continue target
        stored_loop = self.loop
        self.loop = for_
        for_.statement3 = self.parse_statement()
        self.loop = stored_loop
        return for_

    def parse_block(self):
        """Parse { ... }; names declared inside go out of scope at "}"."""
        stored_scope = self.scope
        block = Block()
        block.statements = []
        self.tokens.expect("{")
        while self.tokens.peek() != "}":
            block.statements.append(self.parse_statement())
        self.tokens.expect("}")
        self.scope = stored_scope
        return block

    def parse_struct_body(self):
        """Parse "{ type name; ... }" and return the member declarations."""
        members = {}
        self.tokens.expect("{")
        while self.tokens.peek() != "}":
            member_type = self.tokens.get()
            member_name = self.tokens.get()
            members[member_name] = self.parse_declaration(member_type, member_name)
            self.tokens.expect(";")
        self.tokens.expect("}")
        return members

    def parse_typedef_struct(self):
        """Parse "typedef struct { ... } name;" registering a new type."""
        self.tokens.expect("typedef")
        self.tokens.expect("struct")
        body = self.parse_struct_body()
        name = self.tokens.get()
        self.tokens.expect(";")
        self.scope[name] = StructDeclaration(body)
        #typedef'd structs become usable as plain type names
        self.structs.append(name)

    def parse_define_struct(self):
        """Parse "struct name { ... };" at file scope (a struct tag)."""
        self.tokens.expect("struct")
        name = self.tokens.get()
        body = self.parse_struct_body()
        self.tokens.expect(";")
        self.scope[name] = StructDeclaration(body)

    def parse_struct_declaration(self):
        """Parse "struct tag name;" declaring an instance of a struct tag."""
        self.tokens.expect("struct")
        struct_name = self.tokens.get()
        name = self.tokens.get()
        self.tokens.expect(";")
        #ROBUSTNESS: report an undeclared struct tag as a parse error
        #instead of crashing with a raw KeyError
        if struct_name not in self.scope:
            self.tokens.error("Unknown struct: %s"%struct_name)
        instance = self.scope[struct_name].instance()
        self.scope[name] = instance
        return instance

    def parse_global_declaration(self, type_, name):
        """Parse a comma separated list of global declarations."""
        instances = []
        while True:
            instance = self.parse_declaration(type_, name).instance()
            self.scope[name] = instance
            instances.append(instance)
            if self.tokens.peek() != ",":
                break
            self.tokens.expect(",")
            name = self.tokens.get()
        self.tokens.expect(";")
        return CompoundDeclaration(instances)

    def parse_compound_declaration(self):
        """Parse "type name [, name ...];" inside a block."""
        type_ = self.tokens.get()
        instances = []
        while True:
            name = self.tokens.get()
            instance = self.parse_declaration(type_, name).instance()
            self.scope[name] = instance
            instances.append(instance)
            if self.tokens.peek() != ",":
                break
            self.tokens.expect(",")
        self.tokens.expect(";")
        return CompoundDeclaration(instances)

    def parse_declaration(self, type_, name):
        """Return a declaration object for *type_* *name*.

        Handles struct types, arrays ("type name[size]") and simple
        variables with an optional "= initializer".
        """
        #struct declaration
        if type_ in self.structs:
            return self.scope[type_]

        #ROBUSTNESS: an unrecognised type previously fell through and
        #raised UnboundLocalError on "declaration"; report it instead
        if type_ not in ["int", "short", "long", "char"]:
            self.tokens.error("unknown type")

        #array declaration
        if self.tokens.peek() == "[":
            self.tokens.expect("[")
            size = self.tokens.get()
            self.tokens.expect("]")
            type_+="[]"
            return ArrayDeclaration(self.allocator, size, type_)

        #simple variable declaration
        if self.tokens.peek() == "=":
            self.tokens.expect("=")
            initializer = self.parse_ternary_expression()
        else:
            initializer = Constant(0)
        return VariableDeclaration(
            self.allocator,
            initializer,
            name,
            type_
        )

    def parse_expression(self):
        """Top level of the expression grammar (assignment)."""
        return self.parse_assignment()

    def parse_ternary_expression(self):
        """Parse "cond ? a : b", folding constants as it goes."""
        expression = constant_fold(self.parse_or_expression())
        while self.tokens.peek() == "?":
            self.tokens.expect("?")
            on_true = constant_fold(self.parse_or_expression())
            self.tokens.expect(":")
            on_false = constant_fold(self.parse_or_expression())
            #cond ? a : b  ==>  (cond && a) || b
            expression = OR(AND(expression, on_true), on_false)
        return expression

    def parse_or_expression(self):
        """Parse a left associative chain of || operators."""
        expression = self.parse_and_expression()
        while self.tokens.peek() == "||":
            self.tokens.expect("||")
            expression = OR(expression, self.parse_and_expression())
        return expression

    def parse_and_expression(self):
        """Parse a left associative chain of && operators."""
        expression = self.parse_binary_expression(["|"])
        while self.tokens.peek() == "&&":
            self.tokens.expect("&&")
            expression = AND(expression, self.parse_binary_expression(["|"]))
        return expression

    def parse_binary_expression(self, operators):
        """Parse left associative binary operators by precedence climbing.

        *operators* lists the operators handled at this level; the table
        maps each level's first operator to the next tighter binding level.
        """
        operator_precedence = {
                "|": ["^"],
                "^": ["&"],
                "&": ["==", "!="],
                "==": ["<", ">", "<=", ">="],
                "<": ["<<", ">>"],
                "<<": ["+", "-"],
                "+": ["*", "/", "%"],
        }
        tighter = operator_precedence.get(operators[0])
        if tighter is None:
            #innermost level: operands are unary expressions
            parse_operand = self.parse_unary_expression
        else:
            parse_operand = lambda: self.parse_binary_expression(tighter)
        expression = parse_operand()
        while self.tokens.peek() in operators:
            expression = Binary(
                self.tokens.get(),
                expression,
                parse_operand(),
                self.allocator
            )
        return expression

    def parse_unary_expression(self):
        """Parse the prefix operators !, - and ~."""
        token = self.tokens.peek()
        if token not in ["!", "-", "~"]:
            return self.parse_paren_expression()
        self.tokens.get()
        operand = self.parse_paren_expression()
        if token == "!":
            #!x is lowered to (x == 0)
            return Binary("==", operand, Constant(0), self.allocator)
        if token == "-":
            #-x is lowered to (0 - x)
            return Binary("-", Constant(0), operand, self.allocator)
        return Unary("~", operand)

    def parse_paren_expression(self):
        """Parse "( expression )" or fall through to a primary."""
        if self.tokens.peek() != "(":
            return self.parse_number_or_variable()
        self.tokens.expect("(")
        expression = self.parse_expression()
        self.tokens.expect(")")
        return expression

    def parse_number_or_variable(self):
        """Parse a primary: an identifier (call or variable) or a literal."""
        if not self.tokens.peek()[0].isalpha():
            return self.parse_number()
        name = self.tokens.get()
        if self.tokens.peek() == "(":
            return self.parse_function_call(name)
        return self.parse_variable(name)

    def parse_input(self, name):
        """Parse a call to the built-in input_<port>() function."""
        #BUG FIX: strip only the leading "input_" prefix; str.replace would
        #also mangle any later occurrence inside the port name
        input_name = name[len("input_"):]
        self.tokens.expect("(")
        self.tokens.expect(")")
        return Input(input_name)

    def parse_ready(self, name):
        """Parse a call to the built-in ready_<port>() function."""
        #BUG FIX: strip only the leading "ready_" prefix; str.replace would
        #also mangle any later occurrence inside the port name
        input_name = name[len("ready_"):]
        self.tokens.expect("(")
        self.tokens.expect(")")
        return Ready(input_name)

    def parse_output(self, name):
        """Parse a call to the built-in output_<port>(expression) function."""
        #BUG FIX: strip only the leading "output_" prefix; str.replace would
        #also mangle any later occurrence inside the port name
        output_name = name[len("output_"):]
        self.tokens.expect("(")
        expression = self.parse_expression()
        self.tokens.expect(")")
        return Output(output_name, expression)

    def parse_function_call(self, name):
        """Parse "name(arg, ...)".

        Calls to the built-in input_*/ready_*/output_* functions are routed
        to dedicated parsers; otherwise the callee is looked up in scope and
        the argument count and types are checked against its declaration.
        """
        if name.startswith("input_"):
            return self.parse_input(name)
        if name.startswith("ready_"):
            return self.parse_ready(name)
        if name.startswith("output_"):
            return self.parse_output(name)
        function_call = FunctionCall()
        function_call.arguments = []
        self.tokens.expect("(")
        while self.tokens.peek() != ")":
            function_call.arguments.append(self.parse_expression())
            if self.tokens.peek() == ",":
                self.tokens.expect(",")
            else:
                break
        self.tokens.expect(")")

        if name not in self.scope:
            self.tokens.error("Unknown function: %s"%name)

        function_call.function = self.scope[name]
        function_call.type_ = function_call.function.type_
        #arity check
        required_arguments = len(function_call.function.arguments)
        actual_arguments = len(function_call.arguments)
        if required_arguments != actual_arguments:
            self.tokens.error("Function %s takes %s arguments %s given."%(
                name,
                len(function_call.function.arguments),
                len(function_call.arguments)
            ))
        #pairwise type check of formal vs actual arguments
        required_arguments = function_call.function.arguments
        actual_arguments = function_call.arguments
        for required, actual in zip(required_arguments, actual_arguments):
            if required.type_ != actual.type_:
                self.tokens.error("Type mismatch expected type : %s got: %s."%(
                    required.type_,
                    actual.type_
                ))


        return function_call

    def parse_number(self):
        """Parse a character or integer literal into a Constant.

        SECURITY/ROBUSTNESS: uses ast.literal_eval instead of eval so that
        expressions in the source being compiled cannot execute arbitrary
        code, and also catches ValueError/TypeError (e.g. '' or 'ab' in a
        character literal) which previously escaped as raw exceptions.
        """
        import ast
        token = self.tokens.get()
        if token.startswith("'"):
            try:
                value = ord(ast.literal_eval(token))
            except (SyntaxError, ValueError, TypeError):
                self.tokens.error("%s is not a character literal"%token)
        else:
            try:
                value = int(ast.literal_eval(token))
            except (SyntaxError, ValueError, TypeError):
                self.tokens.error("%s is not an integer literal"%token)
        return Constant(value)

    def parse_variable(self, name):
        """Parse a reference to a previously declared variable."""
        if name not in self.scope:
            self.tokens.error("Unknown variable: %s"%name)
        return self.parse_variable_array_struct(self.scope[name])
 
    def parse_variable_array_struct(self, instance):
        """Parse any array index or struct member access following a name."""
        #simple numeric variable
        if instance.type_ in ["int", "short", "long", "char"]:
            return Variable(instance, self.allocator)
        #array: either indexed access or a whole-array reference
        elif instance.type_.endswith("[]"):
            if self.tokens.peek() == "[":
                self.tokens.expect("[")
                index_expression = self.parse_expression()
                self.tokens.expect("]")
                if index_expression.type_ not in ["int", "short", "long", "char"]:
                    self.tokens.error(
                        "array indices must be an integer like expression"
                    )
                return ArrayIndex(instance, index_expression, self.allocator)
            else:
                return Array(instance, self.allocator)
        #struct: member access recurses so chained accesses resolve fully
        elif instance.type_ == "struct":
            self.tokens.expect(".")
            member = self.tokens.get()
            instance = instance.members[member]
            return self.parse_variable_array_struct(instance)