class ClueParser:
    """Turn clues into ``relation:entity`` parse strings.

    The relation comes from a ``NaiveBayes`` classifier fed with the word
    features of the clue. The entity is extracted with the regular
    expressions ``location_finder``, ``entity_finder`` and ``capital_finder``;
    those, together with ``capital_remover`` and ``NaiveBayes``, are defined
    outside this class.
    """

    def __init__(self):
        self.classifier = NaiveBayes()

    def findEntities(self, clue):
        """Return all entity candidates found in ``clue``.

        Matches of ``location_finder`` come first, each rendered as
        ``"<group 1>, <group 2>"``; matches of ``entity_finder`` follow
        unchanged. The list is empty when neither expression matches.
        """
        entities = [
            match[0] + ", " + match[1]
            for match in re.findall(location_finder, clue)
        ]
        entities.extend(re.findall(entity_finder, clue))
        return entities

    def findCapEntity(self, clue):
        """Return the ``capital_finder`` matches joined by spaces ("" if none)."""
        return " ".join(re.findall(capital_finder, clue))

    def parseClues(self, clues):
        """Parse each clue and return a list of parses, one for each clue."""
        parses = []
        for clue in clues:
            klass = self.classifier.classify(self.findFeatures(clue))
            entities = self.findEntities(clue)
            # The last candidate wins; with no candidate at all, fall back
            # to whatever capital_finder picks up.
            entity = entities[-1] if entities else self.findCapEntity(clue)
            parses.append(klass + ":" + entity)
        return parses

    def findFeatures(self, clue):
        """Return the list of words used as classifier features for ``clue``.

        The clue is first rebuilt from the first group of every
        ``capital_remover`` match, then cut into words at single spaces.
        """
        text = "".join(match[0] for match in re.findall(capital_remover, clue))
        words = []
        start = 0
        last = len(text) - 1
        for i, char in enumerate(text):
            # NOTE(review): when the text does not end in a space, the slice
            # below stops *before* the final character, so the last word
            # loses one character (and a one-character last word is
            # skipped). Kept as-is because it may be what strips trailing
            # punctuation -- confirm against capital_remover before changing.
            if (char == " " or i == last) and i != start:
                word = text[start:i]
                if word[0] == " ":
                    # A doubled space leaves one leading blank on the word.
                    word = word[1:]
                words.append(word)
                start = i + 1
        return words

    def train(self, clues, parsed_clues):
        """Trains the model on clues paired with gold standard parses.

        The class label of each gold parse is the text before its first
        ":". Raises ValueError if a gold parse contains no ":".
        """
        klasses = [answer[:answer.index(":")] for answer in parsed_clues]
        features = [self.findFeatures(clue) for clue in clues]
        self.classifier.addExamples(features, klasses)

    def evaluate(self, parsed_clues, gold_parsed_clues):
        """Shows how the ClueParser model will score on the training/development data.

        Prints three lines: the number of correct relations, the number of
        fully correct parses (relation and entity), and their sum. An entity
        also counts as correct when it equals the gold entity prefixed with
        "The " or "the ". Returns nothing.
        """
        correct_relations = 0
        correct_parses = 0
        # zip() stops at the shorter sequence and works on Python 2 and 3.
        for parsed_clue, gold_parsed_clue in zip(parsed_clues, gold_parsed_clues):
            parsed_fields = parsed_clue.split(":")
            gold_fields = gold_parsed_clue.split(":")
            if parsed_fields[0] != gold_fields[0]:
                continue
            correct_relations += 1
            if len(parsed_fields) < 2 or len(gold_fields) < 2:
                # A parse without ":entity" has nothing to compare; count it
                # as a wrong entity instead of failing with IndexError.
                continue
            entity = parsed_fields[1]
            gold_entity = gold_fields[1]
            if entity in (gold_entity, "The " + gold_entity, "the " + gold_entity):
                correct_parses += 1
        total = len(gold_parsed_clues)
        print("Correct Relations: %d/%d" % (correct_relations, total))
        print("Correct Full Parses: %d/%d" % (correct_parses, total))
        print("Total Score: %d/%d" % (correct_relations + correct_parses, 2 * total))
class ClueParser:
    """Classify clues into a relation and extract the entity they refer to.

    A parse is the string ``relation + ":" + entity``. The relation is
    predicted by a ``NaiveBayes`` classifier (defined outside this class);
    the entity is pulled out of the clue with one regular expression per
    relation, written against inline ``<PERSON>``, ``<ORGANIZATION>`` and
    ``<LOCATION>`` tags.
    """

    # Words whose presence in a clue is turned into a "1"/"0" feature flag.
    _CUE_WORDS = (
        "married", "college", "university", "president", "based", "born",
        "mayor", "located", "year", "died", "birthplace", "birth",
    )

    # Keyword groups captured by the extraction patterns that must never be
    # reported as part of an entity. "founded" is captured by the
    # univ_president_of pattern and therefore has to be listed here too.
    _MATCH_STOPWORDS = frozenset([
        "is", "was", "married", "is married", "president", "leader", "head",
        "founder", "husband", "marriage", "college", "university", "degree",
        "wife", "has been", "in charge of", "based", "headquartered",
        "headquared", "run", "located", "is in", "led", "headed", "founded",
    ])

    def __init__(self):
        # Attribute spelling ("classifer") is kept so existing users of the
        # attribute keep working.
        self.classifer = NaiveBayes()

    def feature_extractor(self, clue):
        """Given a clue represented as a raw string of text, extract features of the clue and return them as a list or set.

        The result is the whitespace-separated words of the clue followed by
        one flag per cue word, in a fixed order: "1" when the cue word
        occurs anywhere in the clue as a substring, "0" otherwise.
        """
        features = clue.split()
        features.extend("1" if word in clue else "0" for word in self._CUE_WORDS)
        return features

    def train(self, clues, parsed_clues):
        """Trains the model on clues paired with gold standard parses.

        The relation label of a gold parse is the text before its first ":".
        A gold parse without ":" is used whole as the label.
        """
        featuresList = [self.feature_extractor(clue) for clue in clues]
        # partition() avoids the find() == -1 trap, which would silently
        # chop the last character off a label that has no colon.
        relations = [parsed.partition(":")[0] for parsed in parsed_clues]
        self.classifer.addExamples(featuresList, relations)

    def getMatch(self, clue, pattern, relation):
        """Return the entity text that ``pattern`` extracts from ``clue``.

        ``relation`` is not used; it is kept for call compatibility.

        * If ``re.findall`` yields plain strings (pattern with at most one
          group), the first match is returned as-is.
        * Otherwise every non-empty captured group that is not a stop word
          is collected across all matches; the result is
          ``"first, second"`` when the first two collected values differ,
          else the first value alone.
        * ``'no match'`` is returned when nothing usable was captured.
        """
        words = []
        for match in re.findall(pattern, clue):
            if not isinstance(match, tuple):
                return match
            words.extend(
                group for group in match
                if group and group not in self._MATCH_STOPWORDS
            )
        if len(words) > 1 and words[0] != words[1]:
            return words[0] + ", " + words[1]
        if words:
            return words[0]
        return 'no match'

    def _patternFor(self, relation, clue):
        """Return the extraction regex for ``relation`` ("" if unknown).

        Some relations switch to a simpler pattern depending on how many
        times the tag name occurs in ``clue``.
        """
        # NOTE(review): the patterns are reproduced verbatim (only turned
        # into raw strings). Several details look like typos -- digit ranges
        # written "1-9" (no 0), "parent \sa-zA-Z]*" without its opening
        # bracket, "ORGANIZATION>" without "<" -- but correcting them changes
        # which clues match, so confirm against the data first.
        if relation == "wife_of":
            return (
                r"married [\sa-zA-Z]*<PERSON>([\.A-Za-z\s\']*)</PERSON>"
                r"|wife of [\sa-zA-Z]*<PERSON>([\.A-Za-z\s\']*)</PERSON>"
                r"|<PERSON>([.A-Za-z\s]*)</PERSON> married"
                r"|wife is [\sA-Za-z]*<PERSON>([\.\'A-Za-z\s]*)</PERSON>"
                r"|<PERSON>([\.\'A-Za-z\s]*)</PERSON>\'s [\sA-Za-z]*(wife|marriage)"
                r"|the wedding between her and [\sA-Za-z]*<PERSON>([\.\'A-Za-z\s]*)</PERSON>"
                r"|<PERSON>([\.\'A-Za-z\s]*)</PERSON> (is married|married)"
            )
        if relation == "husband_of":
            return (
                r"married [\sa-zA-Z<>/]*<PERSON>([\.A-Za-z\s\']*)</PERSON>"
                r"|husband of [\sa-zA-Z<>/]*<PERSON>([\.A-Za-z\s\']*)</PERSON>"
                r"|<PERSON>([.A-Za-z\s]*)</PERSON> married"
                r"|wife is [\sA-Za-z<>/]*<PERSON>([\.\'A-Za-z\s]*)</PERSON>"
                r"|<PERSON>([\.\'A-Za-z\s]*)</PERSON>\'s [\sA-Za-z<>/]*(husband|marriage)"
                r"|the wedding between him and [\sA-Za-z<>/]*<PERSON>([\.\'A-Za-z\s]*)</PERSON>"
                r"|<PERSON>([\.\'A-Za-z\s]*)</PERSON> (is married|married)"
                r"|<PERSON>([\.A-Za-z\s\']*)</PERSON> met this man"
            )
        if relation == "college_of":
            if clue.count("PERSON") > 2:
                return (
                    r"<PERSON>([\.A-Za-z\s\']*)</PERSON>\'s alma mater"
                    r"|<PERSON>([\.A-Za-z\s\']*)</PERSON>[\sa-zA-Z]*(college|university|degree)"
                    r"|alma mater of <PERSON>([\.A-Za-z\s\']*)</PERSON>"
                    r"|to [\sa-zA-Z]*<PERSON>([\.A-Za-z\s\']*)</PERSON>"
                )
            return r"<PERSON>([\.A-Za-z\s\']*)</PERSON>"
        if relation == "univ_president_of":
            if clue.count("ORGANIZATION") > 2:
                return (
                    r"<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> (is|was|has been) [\sa-zA-Z]*(headed|led|founded)"
                    r"|<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION>\'s [\sa-zA-Z]*(president|leader|head|founder)"
                    r"|(president|leader|head|founder)[\sA-Za-z<>/\']*of [\sA-Za-z<>/\']*<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION>"
                    r"|<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> (president|leader|head|founder)"
                    r"|(headed|led|founded|in charge of) <ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION>"
                )
            return r"<ORGANIZATION>([\.A-Za-z\s\'&]*)</ORGANIZATION>"
        if relation == "headquarters_loc":
            # Alternatives shared by both headquarters patterns.
            untagged = (
                r"|([&\.A-Za-z\s\']*) is run"
                r"|([\.A-Za-z\s\'&]*) [\sa-zA-Z]* based"
                r"|([&\.A-Za-z\s\']*) is in"
                r"|([&\.A-Za-z\s\']*)'s [\sa-zA-Z]* office"
                r"|([&\.A-Za-z\s\']*)'s [\sa-zA-Z]* headquarters"
            )
            if clue.count("ORGANIZATION") > 2:
                return (
                    r"<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> [\sa-zA-Z]*(based|headquartered|headquared|run|located|is in)"
                    r"|headquarters of <ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION>"
                ) + untagged
            return r"<ORGANIZATION>([\.A-Za-z\s\']*)</ORGANIZATION>" + untagged
        if relation == "born_in":
            if clue.count("PERSON") > 2:
                return (
                    r"<PERSON>([\.A-Za-z\s\']*)</PERSON>[\sa-zA-Z,<>/]* born [\sa-zA-Z,]*in"
                    r"|birthplace [\sa-zA-Z,<>/]*<PERSON>([\.A-Za-z\s\']*)</PERSON>"
                )
            return r"<PERSON>([\.A-Za-z\s\']*)</PERSON>"
        if relation == "parent_org_of":
            return (
                r"(([A-Z][&\.A-Za-z\s\']*)+) (is|was)[\sa-zA-Z<>/1-9,]*offshoot"
                r"|(([A-Z][&\.A-Za-z\s\']*)+) (is|was) [\sa-zA-Z<>/1-9,]*organization"
                r"|(([A-Z][&\.A-Za-z\s\']*)+) (is|was) [\sa-zA-Z<>/1-9,]*orginazation"
                r"|parent [\sa-zA-Z]*<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION>"
                r"|<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> (is|was) [\sa-zA-Z]*offshoot"
                r"|<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> (is|was) [\sa-zA-Z]*organization"
                r"|parent \sa-zA-Z]*(([A-Z][&\.A-Za-z\s\']*)+)"
                r"|ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> (is|was) [\sa-zA-Z]*orginazation"
                r"|parent [\sa-zA-Z<>/]*(([A-Z][&\.A-Za-z\s\']*)+)"
            )
        if relation == "mayor_of":
            if clue.count("LOCATION") < 4:
                return (
                    r"<LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*)'s mayor"
                    r"|mayor of <LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*)"
                    r"|<LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*)[\sa-zA-Z<>/1-9,]*is[\sa-zA-Z<>/1-9,]*(led|headed|run) by"
                    r"|mayor of ([A-Za-z]*), <LOCATION>([A-Za-z\s]*)</LOCATION>"
                    r"|([A-Za-z]*), <LOCATION>([A-Za-z\s]*)</LOCATION>[\sa-zA-Z<>/1-9,]*is[\sa-zA-Z<>/1-9,]*(led|headed|run) "
                    r"|mayor of ([A-Za-z]*), ([A-Za-z]*)"
                )
            return (
                r"<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> [\sa-zA-Z<>/1-9,]*is led by"
                r"|mayor of <LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>"
                r"|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>'s mayor"
                r"|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>[\sa-zA-Z<>/1-9,]* is [\sa-zA-Z<>/1-9,]*(headed|led|run)"
                r"|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>'s mayor"
            )
        if relation == "univ_in":
            return (
                r"school [\sa-zA-Z<>/1-9,]*in <LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>"
                r"|college [\sa-zA-Z<>/1-9,]*in <LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>"
                r"|university [\sa-zA-Z<>/1-9,]*in <LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>"
                r"|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> [\sa-zA-Z<>/1-9,]*school"
                r"|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> [\sa-zA-Z<>/1-9,]*college"
                r"|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> [\sa-zA-Z<>/1-9,]*university"
                r"|<LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*) is home"
                r"|([A-Za-z]*), <LOCATION>([A-Za-z\s]*)</LOCATION> is home"
                r"|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> in"
                r"|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> is home"
                r"|based in <LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>"
                r"|located in <LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*)"
                r"|in <LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*)"
            )
        if relation == "year_of_birth":
            return (
                r"<PERSON>([\.A-Za-z\s\']*)</PERSON>[\sa-zA-Z<>/1-9,]*(was|is) born"
                r"|<PERSON>([\.A-Za-z\s\']*)</PERSON>'s birthday"
                r"|birthday of [\sa-zA-Z1-9,]*<PERSON>([\.A-Za-z\s\']*)</PERSON>"
            )
        if relation == "year_of_death":
            return (
                r"<PERSON>([\.A-Za-z\s\']*)</PERSON>[\sa-zA-Z<>/1-9,]*died"
                r"|<PERSON>([\.A-Za-z\s\']*)</PERSON>[\sa-zA-Z<>/1-9,]*passed"
            )
        return ""

    def getEntity(self, relation, clue):
        """Return the entity of ``clue`` for the given ``relation``.

        The relation's pattern is tried first. When it finds nothing:

        * for ``mayor_of`` / ``univ_in`` the contents of the first two tags
          are joined as ``"first, second"`` if the clue has more than one
          closing tag, otherwise ``'no match'`` is returned;
        * for every other known relation the text between the first ">"
          and the first "</" is returned.

        Unknown relations yield the hard-coded default ``'Gene Autry'``.
        """
        pattern = self._patternFor(relation, clue)
        if not pattern:
            return 'Gene Autry'

        answer = self.getMatch(clue, pattern, relation)
        if answer != "no match":
            return answer

        if relation not in ("mayor_of", "univ_in"):
            # find() returns -1 when a marker is missing, so an untagged
            # clue yields clue[0:-1]; this mirrors the historical behavior.
            return clue[clue.find(">") + 1:clue.find("</")]

        if clue.count("</") > 1:
            first_stop = clue.find("</")
            second_stop = clue.find("</", first_stop + 2)
            first_start = clue.find(">")
            # Skip the ">" that closes the first tag pair to reach the ">"
            # that ends the second opening tag.
            closing = clue.find(">", first_start + 1)
            second_start = clue.find(">", closing + 1)
            return (clue[first_start + 1:first_stop] + ", "
                    + clue[second_start + 1:second_stop])
        return answer

    def parseClues(self, clues):
        """Parse each clue and return a list of parses, one for each clue."""
        parses = []
        for clue in clues:
            clue_relation = self.classifer.classify(self.feature_extractor(clue))
            clue_entity = self.getEntity(clue_relation, clue)
            parses.append(clue_relation + ':' + clue_entity)
        return parses

    def evaluate(self, parsed_clues, gold_parsed_clues):
        """Shows how the ClueParser model will score on the training/development data.

        Prints three lines: the number of correct relations, the number of
        fully correct parses (relation and entity), and their sum. An entity
        also counts as correct when it equals the gold entity prefixed with
        "The " or "the ". Returns nothing.
        """
        correct_relations = 0
        correct_parses = 0
        # zip() stops at the shorter sequence and works on Python 2 and 3.
        for parsed_clue, gold_parsed_clue in zip(parsed_clues, gold_parsed_clues):
            parsed_fields = parsed_clue.split(":")
            gold_fields = gold_parsed_clue.split(":")
            if parsed_fields[0] != gold_fields[0]:
                continue
            correct_relations += 1
            if len(parsed_fields) < 2 or len(gold_fields) < 2:
                # A parse without ":entity" has nothing to compare; count it
                # as a wrong entity instead of failing with IndexError.
                continue
            entity = parsed_fields[1]
            gold_entity = gold_fields[1]
            if entity in (gold_entity, "The " + gold_entity, "the " + gold_entity):
                correct_parses += 1
        total = len(gold_parsed_clues)
        print("Correct Relations: %d/%d" % (correct_relations, total))
        print("Correct Full Parses: %d/%d" % (correct_parses, total))
        print("Total Score: %d/%d" % (correct_relations + correct_parses, 2 * total))
datapoints = tdatapoints classes = tclasses print "reverting to stateless mode" else: pickle.dump( datapoints, open( "save.p", "wb" ) ) pickle.dump( classes, open( "save.p1", "wb" ) ) print len(datapoints) print len(classes) if('train' in sys.argv[1]): print 'training done model saved to save.p and save.p1' sys.exit (0) nb.addExamples(datapoints,classes) tests=[] tests.append(getFeatures('Your graphs were wrong. Please correct it')) tests.append(getFeatures('thanks john :) I dont get it')) tests.append(getFeatures("thanks for your answer")) tests.append(getFeatures('I appreciate your honest response. thanks')) tests.append(getFeatures('I am not sure how to implement this. Can a TA please clarify? We have been stuck on this problem for quite awhile')) tests.append(getFeatures("I love them. I can't work on it now. I'm totally confused")) tests.append(getFeatures("Is the 3 page length a hard limit. It was not clear in the assignment specification")) tests.append(getFeatures(""" I have no clue about what this class is about. """)) tests.append(getFeatures(