Ejemplo n.º 1
0
    splitQ = q.split(" ")
    splitA = re.split(r'\s+|[,;.-]\s*', a)

    features.append(len(splitQ))
    features.append(len(splitA))
    features.append(splitA[0].lower())
    features.append(splitA[len(splitA) - 1].lower())
    features.append(q)
    features.append(a)

    return features


if __name__ == '__main__':
    # Smoke test: fit a NaiveBayes model on the items returned by
    # corpus.dev_set(), then print the label it assigns to a handful of
    # hand-written question/answer pairs.
    corpus = IqapReader('iqap-data.csv')
    nb = NaiveBayes()
    for item in corpus.dev_set():
        features = item.featurize()
        nb.addExample(item.majority_label(), features)

    demo_pairs = (
        ("Have you ever had any bank accounts in Swiss banks, Mr. Bronston?",
         "The company had an account there for about six months, in Zurich."),
        ("Do you like cheese?", "Yes."),
        ("Do you like cheese?", "No."),
        ("Is the president right to support the new legislation?",
         "The president's wrong on that one."),
        ("Did you see her?", "She was sick."),
    )
    for question, answer in demo_pairs:
        # A parenthesised single-argument print emits exactly what the print
        # statement did.
        print(nb.classify(featurize(question, answer)))
Ejemplo n.º 2
0
class ClueParser:
    """Predict a clue's relation and extract the entity the clue is about.

    Each parse is the string ``"<relation>:<entity>"``.  The relation comes
    from ``self.classifier`` (a ``NaiveBayes`` instance) applied to the
    features built by :meth:`feature_extractor`; the entity is read from the
    clue's inline ``<PERSON>``, ``<ORGANIZATION>`` and ``<LOCATION>`` markup.

    Relations with a dedicated extraction rule are listed in the
    ``_*_RELATIONS`` constants below; every other relation (for example
    ``headquarters_loc``) uses the first tagged span of any type.
    """

    # Tokens dropped from the bag of words.  The check runs on the raw token,
    # before trailing punctuation is stripped, so "you?" is kept (as "you").
    _STOP_WORDS = frozenset([
        "the", "of", "a", "an", "but", "then", "to", "I", "you", ".", "?",
    ])
    _TRAILING_PUNCTUATION = ("?", ".", ",")

    # Words and phrases that get an extra feature of their own when they
    # appear in the clue as consecutive whitespace-separated tokens.
    _KEY_PHRASES = (
        "spouse", "married", "wife", "husband", "college", "university in",
        "president of", "headquarters in", "headquartered in", "born in",
        "parent organization", "parent company of", "mayor of", "born",
        "died",
    )

    _LOCATION_RELATIONS = ("mayor_of", "univ_in")
    _ORGANIZATION_RELATIONS = ("univ_president_of", "parent_org_of")
    _PERSON_RELATIONS = (
        "wife_of", "husband_of", "college_of", "born_in",
        "year_of_birth", "year_of_death",
    )

    # Tag patterns are applied with re.IGNORECASE, as in the original code.
    _LOCATION_TAG = r'<LOCATION>(.*?)</LOCATION>'
    _ORGANIZATION_TAG = r'<ORGANIZATION>(.*?)</ORGANIZATION>'
    _PERSON_TAG = r'<PERSON>(.*?)</PERSON>'
    _ANY_TAGGED_TEXT = r'>([A-Z].*?)<'
    # Applied case-sensitively: only two upper-case letters count as a state.
    _STATE_SUFFIX = r', ([A-Z][A-Z])\b'
    _MARKUP = r'</?[A-Za-z]+>'
    _CAPITALIZED_RUN = r"[A-Z][\w'-]*(?:\s+[A-Z][\w'-]*)*"

    def __init__(self):
        """Create the parser with an untrained NaiveBayes relation classifier."""
        self.classifier = NaiveBayes()

    def feature_extractor(self, clue):
        """Return the list of features for a clue given as a raw string.

        The features are, in order:

        1. every space-separated token that is not a stop word, with one
           trailing ``?``, ``.`` or ``,`` removed from tokens longer than one
           character;
        2. every entry of ``_KEY_PHRASES`` that occurs in the clue as
           consecutive whitespace-separated tokens (so multi-word phrases
           such as "mayor of" can match).
        """
        features = []
        for token in clue.split(' '):
            if token in self._STOP_WORDS:
                continue
            if len(token) > 1 and token.endswith(self._TRAILING_PUNCTUATION):
                token = token[:-1]
            features.append(token)

        tokens = clue.split()
        features.extend(
            phrase for phrase in self._KEY_PHRASES
            if self._contains_phrase(tokens, phrase))
        return features

    @staticmethod
    def _contains_phrase(tokens, phrase):
        """Return True if the words of *phrase* occur as consecutive tokens."""
        words = phrase.split()
        width = len(words)
        return any(tokens[start:start + width] == words
                   for start in range(len(tokens) - width + 1))

    def train(self, clues, parsed_clues):
        """Train the relation classifier on clues paired with gold parses.

        ``parsed_clues[i]`` is the gold ``"<relation>:<entity>"`` string for
        ``clues[i]``; only the relation part (before the first ``:``) is used
        as the training label.

        Raises:
            IndexError: if ``parsed_clues`` is shorter than ``clues``.
        """
        for index, clue in enumerate(clues):
            label = parsed_clues[index].split(":")[0]
            # Features are extracted once per clue and handed straight to the
            # classifier.
            self.classifier.addExample(label, self.feature_extractor(clue))

    def parseClues(self, clues):
        """Parse each clue and return a list of parses, one for each clue."""
        parses = []
        for clue in clues:
            relation = self.classifier.classify(self.feature_extractor(clue))
            parses.append(relation + ':' + self._extract_entity(relation, clue))
        return parses

    def _extract_entity(self, relation, clue):
        """Return the entity string for *clue* under the predicted relation."""
        if relation in self._LOCATION_RELATIONS:
            entity = self._location_entity(clue)
        elif relation in self._ORGANIZATION_RELATIONS:
            entity = self._first_tagged(clue, self._ORGANIZATION_TAG)
        elif relation in self._PERSON_RELATIONS:
            entity = self._first_tagged(clue, self._PERSON_TAG)
        else:
            entity = self._first_tagged(clue, self._ANY_TAGGED_TEXT)

        # None means "no markup of the expected kind"; an empty tag body is a
        # legitimate (empty) entity and is returned as-is.
        if entity is None:
            entity = self._fallback_entity(clue)
        return entity

    @staticmethod
    def _first_tagged(clue, pattern):
        """Return group 1 of the first match of *pattern*, or None."""
        found = re.search(pattern, clue, flags=re.IGNORECASE)
        return found.group(1) if found else None

    def _location_entity(self, clue):
        """Return a location entity built from LOCATION tags, or None.

        Two or more tags give ``"<first>, <second>"``.  A single tag gives
        ``"<location>, <ST>"`` when it is directly followed by a comma and a
        two-letter upper-case abbreviation, otherwise the location alone.
        """
        tagged = list(
            re.finditer(self._LOCATION_TAG, clue, flags=re.IGNORECASE))
        if not tagged:
            return None
        if len(tagged) > 1:
            return tagged[0].group(1) + ", " + tagged[1].group(1)

        only = tagged[0]
        # Matched without IGNORECASE so ordinary words after the comma
        # (", in", ", Massachusetts") are not mistaken for an abbreviation.
        state = re.match(self._STATE_SUFFIX, clue[only.end():])
        if state:
            return only.group(1) + ", " + state.group(1)
        return only.group(1)

    def _fallback_entity(self, clue):
        """Best-effort entity for a clue without the expected markup.

        Returns the first run of capitalised words in the clue with its tags
        removed, or the whole tag-free text (stripped) when there is none.
        Never raises on an empty or letter-free clue.
        """
        text = re.sub(self._MARKUP, '', clue)
        found = re.search(self._CAPITALIZED_RUN, text)
        return found.group(0) if found else text.strip()

    #### You should not need to change anything after this point. ####

    def evaluate(self, parsed_clues, gold_parsed_clues):
        """Shows how the ClueParser model will score on the training/development data.

        Prints three lines: the number of correct relations, the number of
        correct full parses, and their sum out of twice the number of gold
        clues.  A full parse counts only when the relation is correct and the
        entity equals the gold entity, optionally prefixed by "The " or
        "the ".
        """
        correct_relations = 0
        correct_parses = 0
        # Built-in zip and parenthesised print run on both Python 2 and 3.
        for parsed_clue, gold_parsed_clue in zip(parsed_clues,
                                                 gold_parsed_clues):
            split_parsed_clue = parsed_clue.split(":")
            split_gold_parsed_clue = gold_parsed_clue.split(":")
            if split_parsed_clue[0] != split_gold_parsed_clue[0]:
                continue
            correct_relations += 1
            gold_entity = split_gold_parsed_clue[1]
            if split_parsed_clue[1] in (gold_entity,
                                        "The " + gold_entity,
                                        "the " + gold_entity):
                correct_parses += 1
        print("Correct Relations: %d/%d" % (correct_relations,
                                            len(gold_parsed_clues)))
        print("Correct Full Parses: %d/%d" % (correct_parses,
                                              len(gold_parsed_clues)))
        print("Total Score: %d/%d" % (correct_relations + correct_parses,
                                      2 * len(gold_parsed_clues)))