Ejemplo n.º 1
0
def main(argv):
	"""Measure Naive Bayes test accuracy when training on random subsets.

	Command-line arguments are read from ``sys.argv`` (the ``argv``
	parameter is accepted for the caller's convenience but is not consulted):
	``<trainfile> <testfile> <mode>``.

	For each subset size (25, 50, 100) the classifier is trained four times
	on a fresh random sample of the training instances; the number of
	correctly classified test instances is printed per run and averaged per
	size into ``accs`` as ``[size, mean_correct]`` pairs.

	NOTE(review): ``mode`` and ``accs`` are not used in the visible part of
	this function -- confirm whether the remainder of the original was lost.
	"""
	# validate input
	if len(sys.argv) != 4:
		print("incorrect input supplied")
		# A usage error must not look like success to the calling shell.
		sys.exit(1)
	trainfile, testfile, mode = sys.argv[1:4]

	# ingest data
	trainset = readFile(trainfile)
	testset  = readFile(testfile)

	# train on random subsets of the data
	sizes = [25, 50, 100]
	repeats = 4
	accs = []
	for size in sizes:
		tmpAcc = []
		for j in range(repeats):
			tmpSet = random.sample(trainset.instances, size)

			bayes = NaiveBayes(trainset, testset)
			bayes.train(tmpSet)

			preds = bayes.classify(testset.instances)

			# A prediction is correct when its label (first field) equals
			# the instance's class value (last field).
			corCount = sum(
				1 for pred, inst in zip(preds, testset.instances)
				if pred[0] == inst[-1])

			# Formatted explicitly so the output is the same on Python 2 and 3.
			print("%s %s %s" % (size, j, corCount))
			tmpAcc.append(corCount)
		meanAcc = float(sum(tmpAcc)) / len(tmpAcc)
		accs.append([size, meanAcc])
Ejemplo n.º 2
0
    splitQ = q.split(" ")
    splitA = re.split(r'\s+|[,;.-]\s*', a)

    features.append(len(splitQ))
    features.append(len(splitA))
    features.append(splitA[0].lower())
    features.append(splitA[len(splitA) - 1].lower())
    features.append(q)
    features.append(a)

    return features


if __name__ == '__main__':
    # Train on the development split of the IQAP corpus.
    corpus = IqapReader('iqap-data.csv')
    nb = NaiveBayes()
    for example in corpus.dev_set():
        nb.addExample(example.majority_label(), example.featurize())

    # Hand-written question/answer pairs used as a quick smoke test.
    demo_pairs = [
        ("Have you ever had any bank accounts in Swiss banks, Mr. Bronston?",
         "The company had an account there for about six months, in Zurich."),
        ("Do you like cheese?", "Yes."),
        ("Do you like cheese?", "No."),
        ("Is the president right to support the new legislation?",
         "The president\'s wrong on that one."),
        ("Did you see her?", "She was sick."),
    ]
    for question, answer in demo_pairs:
        print(nb.classify(featurize(question, answer)))
Ejemplo n.º 3
0
class ClueParser:
    """Turns raw clues into ``relation:entity`` parses.

    The relation is predicted by a Naive Bayes classifier over bag-of-words
    features; the entity is pulled out of the clue's ``<PERSON>``,
    ``<ORGANIZATION>`` or ``<LOCATION>`` tags with regular expressions.
    """

    # Tokens that are dropped from the bag-of-words features.
    _STOP_WORDS = frozenset([
        "the", "of", "a", "an", "but", "then", "to", "I", "you", ".", "?"
    ])

    # Words/phrases added as an extra feature when they occur in the clue.
    _KEY_PHRASES = (
        "spouse", "married", "wife", "husband", "college", "university in",
        "president of", "headquarters in", "headquartered in", "born in",
        "parent organization", "parent company of", "mayor of", "born",
        "died",
    )

    _LOCATION_RELATIONS = ("mayor_of", "univ_in")
    _ORGANIZATION_RELATIONS = ("univ_president_of", "parent_org_of")
    _PERSON_RELATIONS = ("wife_of", "husband_of", "college_of", "born_in",
                         "year_of_birth", "year_of_death")

    def __init__(self):
        """Create the (untrained) relation classifier."""
        self.classifier = NaiveBayes()

    def feature_extractor(self, clue):
        """Return the list of features for a raw clue string.

        Features are the space-separated tokens of the clue (stop words
        removed, one trailing ``?``/``.``/``,`` stripped) followed by every
        key phrase that occurs in the clue as a run of whole tokens.
        """
        features = []
        for word in clue.split(' '):
            if word in self._STOP_WORDS:
                continue
            if len(word) > 1 and word[-1] in "?.,":
                features.append(word[:-1])
            else:
                features.append(word)

        # Multi-word phrases such as "born in" can never equal a single
        # token, so match them against the space-normalised clue instead.
        # Padding with spaces keeps the match aligned to token boundaries,
        # which is exactly the old behaviour for single-word keys.
        padded = " " + " ".join(clue.split()) + " "
        for phrase in self._KEY_PHRASES:
            if (" " + phrase + " ") in padded:
                features.append(phrase)

        return features

    def train(self, clues, parsed_clues):
        """Train the classifier on clues paired with gold standard parses.

        ``parsed_clues[i]`` is the ``relation:entity`` parse of ``clues[i]``;
        only the relation part is used as the training label.
        """
        for i, clue in enumerate(clues):
            label = parsed_clues[i].split(":")[0]
            self.classifier.addExample(label, self.feature_extractor(clue))

    @staticmethod
    def _first_match(pattern, clue):
        """Return the first case-insensitive match of pattern, or ''."""
        found = re.findall(pattern, clue, flags=re.IGNORECASE)
        return found[0] if found else ''

    def _extract_entity(self, relation, clue):
        """Pick the entity string for a clue given its predicted relation."""
        if relation in self._LOCATION_RELATIONS:
            locations = re.findall('<LOCATION>(.*?)</LOCATION>', clue,
                                   flags=re.IGNORECASE)
            if len(locations) > 1:
                return locations[0] + ", " + locations[1]
            if len(locations) == 1:
                # Only the city is tagged: look for a two-letter code
                # right after it.
                with_state = re.findall(
                    '<LOCATION>(.*?)</LOCATION>, ([A-Z][A-Z])', clue,
                    flags=re.IGNORECASE)
                if with_state:
                    return with_state[0][0] + ", " + with_state[0][1]
            # The original fallback pattern '(.*?)' always yields ''.
            return ''

        if relation in self._ORGANIZATION_RELATIONS:
            tagged = self._first_match('<ORGANIZATION>(.*?)</ORGANIZATION>',
                                       clue)
            return tagged or self._first_match('([A-Z].*?)', clue)

        if relation in self._PERSON_RELATIONS:
            tagged = self._first_match('<PERSON>(.*?)</PERSON>', clue)
            return tagged or self._first_match('([A-Z].*?)', clue)

        # Any other relation: take the content of the first tag that
        # starts with a letter, whatever the tag is.
        return self._first_match('>([A-Z].*?)<', clue)

    def parseClues(self, clues):
        """Parse each clue and return a list of parses, one for each clue."""
        parses = []
        for clue in clues:
            clue_relation = self.classifier.classify(
                self.feature_extractor(clue))
            clue_entity = self._extract_entity(clue_relation, clue)
            parses.append(clue_relation + ':' + clue_entity)
        return parses

    #### You should not need to change anything after this point. ####

    def evaluate(self, parsed_clues, gold_parsed_clues):
        """Shows how the ClueParser model will score on the training/development data."""
        correct_relations = 0
        correct_parses = 0
        # Built-in zip works on both Python 2 and 3 (itertools.izip is 2-only).
        for parsed_clue, gold_parsed_clue in zip(parsed_clues,
                                                 gold_parsed_clues):
            split_parsed_clue = parsed_clue.split(":")
            split_gold_parsed_clue = gold_parsed_clue.split(":")
            if split_parsed_clue[0] == split_gold_parsed_clue[0]:
                correct_relations += 1
                # A leading "The "/"the " on the predicted entity is forgiven.
                if (split_parsed_clue[1] == split_gold_parsed_clue[1]
                        or split_parsed_clue[1]
                        == "The " + split_gold_parsed_clue[1]
                        or split_parsed_clue[1]
                        == "the " + split_gold_parsed_clue[1]):
                    correct_parses += 1
        print("Correct Relations: %d/%d" % (correct_relations,
                                            len(gold_parsed_clues)))
        print("Correct Full Parses: %d/%d" % (correct_parses,
                                              len(gold_parsed_clues)))
        print("Total Score: %d/%d" % (correct_relations + correct_parses,
                                      2 * len(gold_parsed_clues)))
Ejemplo n.º 4
0
from Preprocessing import Preprocessing
from NaiveBayes import NaiveBayes

if __name__ == '__main__':
	# Build the dictionary and class distribution from the corpus files.
	trainer = Preprocessing()
	trainer.convert_all_files()

	vocabulary = trainer.get_dictionary()
	class_priors = trainer.get_class_distribuition()
	classifier = NaiveBayes(vocabulary, class_priors)

	# Classify one already-processed document with a fresh preprocessor.
	document = Preprocessing().execute('processedFiles/ILP-1314Seb105-126.txt')
	print(classifier.classify(document))
Ejemplo n.º 5
0
class ClueParser:
    """Turns raw clues into ``relation:entity`` parses.

    A Naive Bayes classifier predicts the relation; a relation-specific
    regular expression then extracts the entity from the tagged clue text.
    """

    # Captured alternation keywords that must never be reported as entities.
    _SPECIAL_WORDS = frozenset([
        "is", "was", "married", "is married", "president", "leader",
        "head", "founder", "husband", "marriage", "college", "university",
        "degree", "wife", "has been", "in charge of", "based",
        "headquartered", "headquared", "run", "located", "is in", "led",
        "headed"
    ])

    # Each of these contributes one "1"/"0" presence feature, in this order.
    _CUE_WORDS = (
        "married", "college", "university", "president", "based", "born",
        "mayor", "located", "year", "died", "birthplace", "birth"
    )

    def __init__(self):
        """Create the (untrained) relation classifier."""
        # NOTE: the attribute name keeps its historical spelling.
        self.classifer = NaiveBayes()

    def feature_extractor(self, clue):
        """Given a clue represented as a raw string of text, extract features of the clue and return them as a list.

        The features are the whitespace-separated tokens followed by one
        "1"/"0" flag per cue word (substring test against the whole clue).
        """
        features = clue.split()
        features.extend("1" if word in clue else "0"
                        for word in self._CUE_WORDS)
        return features

    def train(self, clues, parsed_clues):
        """Trains the model on clues paired with gold standard parses."""
        featuresList = [self.feature_extractor(clue) for clue in clues]
        # partition() keeps the whole string when there is no ':' instead
        # of silently chopping off its last character like find() == -1 did.
        relations = [parsed.partition(":")[0] for parsed in parsed_clues]
        self.classifer.addExamples(featuresList, relations)

    def getMatch(self, clue, pattern, relation):
        """Return the entity found by ``pattern`` in ``clue``, or 'no match'.

        With a single-group pattern the first match is returned as is.  With
        a multi-group pattern all non-empty captures that are not helper
        keywords are collected; the first two distinct ones are joined as
        "a, b", otherwise the first one is returned.  ``relation`` is
        accepted for interface compatibility and is not used.
        """
        words = []
        for m in re.findall(pattern, clue):
            if not isinstance(m, tuple):
                return m
            words.extend(elem for elem in m
                         if elem and elem not in self._SPECIAL_WORDS)
        if len(words) > 1 and words[0] != words[1]:
            return words[0] + ", " + words[1]
        if words:
            return words[0]
        return 'no match'

    def getEntity(self, relation, clue):
        """Extract the entity for ``relation`` from the tagged ``clue``.

        Unknown relations yield the fixed guess 'Gene Autry'.  When the
        relation's pattern does not match, the text of the first tag (or,
        for the two location relations, the first two tags) is used.
        """
        # All patterns are raw strings so that regex escapes such as \s
        # are not treated as (invalid) Python string escapes.
        pattern = ""
        if relation == "wife_of":
            pattern = r'married [\sa-zA-Z]*<PERSON>([\.A-Za-z\s\']*)</PERSON>|wife of [\sa-zA-Z]*<PERSON>([\.A-Za-z\s\']*)</PERSON>|<PERSON>([.A-Za-z\s]*)</PERSON> married|wife is [\sA-Za-z]*<PERSON>([\.\'A-Za-z\s]*)</PERSON>|<PERSON>([\.\'A-Za-z\s]*)</PERSON>\'s [\sA-Za-z]*(wife|marriage)|the wedding between her and [\sA-Za-z]*<PERSON>([\.\'A-Za-z\s]*)</PERSON>|<PERSON>([\.\'A-Za-z\s]*)</PERSON> (is married|married)'
        elif relation == "husband_of":
            pattern = r'married [\sa-zA-Z<>/]*<PERSON>([\.A-Za-z\s\']*)</PERSON>|husband of [\sa-zA-Z<>/]*<PERSON>([\.A-Za-z\s\']*)</PERSON>|<PERSON>([.A-Za-z\s]*)</PERSON> married|wife is [\sA-Za-z<>/]*<PERSON>([\.\'A-Za-z\s]*)</PERSON>|<PERSON>([\.\'A-Za-z\s]*)</PERSON>\'s [\sA-Za-z<>/]*(husband|marriage)|the wedding between him and [\sA-Za-z<>/]*<PERSON>([\.\'A-Za-z\s]*)</PERSON>|<PERSON>([\.\'A-Za-z\s]*)</PERSON> (is married|married)|<PERSON>([\.A-Za-z\s\']*)</PERSON> met this man'
        elif relation == "college_of":
            if clue.count("PERSON") > 2:
                pattern = r'<PERSON>([\.A-Za-z\s\']*)</PERSON>\'s alma mater|<PERSON>([\.A-Za-z\s\']*)</PERSON>[\sa-zA-Z]*(college|university|degree)|alma mater of <PERSON>([\.A-Za-z\s\']*)</PERSON>|to [\sa-zA-Z]*<PERSON>([\.A-Za-z\s\']*)</PERSON>'
            else:
                pattern = r'<PERSON>([\.A-Za-z\s\']*)</PERSON>'
        elif relation == "univ_president_of":
            if clue.count("ORGANIZATION") > 2:
                pattern = r"<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> (is|was|has been) [\sa-zA-Z]*(headed|led|founded)|<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION>\'s [\sa-zA-Z]*(president|leader|head|founder)|(president|leader|head|founder)[\sA-Za-z<>/\']*of [\sA-Za-z<>/\']*<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION>|<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> (president|leader|head|founder)|(headed|led|founded|in charge of) <ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION>"
            else:
                pattern = r"<ORGANIZATION>([\.A-Za-z\s\'&]*)</ORGANIZATION>"
        elif relation == "headquarters_loc":
            if clue.count("ORGANIZATION") > 2:
                pattern = r"<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> [\sa-zA-Z]*(based|headquartered|headquared|run|located|is in)|headquarters of <ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION>|([&\.A-Za-z\s\']*) is run|([\.A-Za-z\s\'&]*) [\sa-zA-Z]* based|([&\.A-Za-z\s\']*) is in|([&\.A-Za-z\s\']*)'s [\sa-zA-Z]* office|([&\.A-Za-z\s\']*)'s [\sa-zA-Z]* headquarters"
            else:
                pattern = r"<ORGANIZATION>([\.A-Za-z\s\']*)</ORGANIZATION>|([&\.A-Za-z\s\']*) is run|([\.A-Za-z\s\'&]*) [\sa-zA-Z]* based|([&\.A-Za-z\s\']*) is in|([&\.A-Za-z\s\']*)'s [\sa-zA-Z]* office|([&\.A-Za-z\s\']*)'s [\sa-zA-Z]* headquarters"
        elif relation == "born_in":
            if clue.count("PERSON") > 2:
                pattern = r"<PERSON>([\.A-Za-z\s\']*)</PERSON>[\sa-zA-Z,<>/]* born [\sa-zA-Z,]*in|birthplace [\sa-zA-Z,<>/]*<PERSON>([\.A-Za-z\s\']*)</PERSON>"
            else:
                pattern = r"<PERSON>([\.A-Za-z\s\']*)</PERSON>"
        elif relation == "parent_org_of":
            # The seventh alternative used to read "parent \sa-zA-Z]*" (its
            # character class was missing the opening bracket).
            pattern = r"(([A-Z][&\.A-Za-z\s\']*)+) (is|was)[\sa-zA-Z<>/1-9,]*offshoot|(([A-Z][&\.A-Za-z\s\']*)+) (is|was) [\sa-zA-Z<>/1-9,]*organization|(([A-Z][&\.A-Za-z\s\']*)+) (is|was) [\sa-zA-Z<>/1-9,]*orginazation|parent [\sa-zA-Z]*<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION>|<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> (is|was) [\sa-zA-Z]*offshoot|<ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> (is|was) [\sa-zA-Z]*organization|parent [\sa-zA-Z]*(([A-Z][&\.A-Za-z\s\']*)+)|ORGANIZATION>([&\.A-Za-z\s\']*)</ORGANIZATION> (is|was) [\sa-zA-Z]*orginazation|parent [\sa-zA-Z<>/]*(([A-Z][&\.A-Za-z\s\']*)+)"
        elif relation == "mayor_of":
            if clue.count("LOCATION") < 4:
                pattern = r"<LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*)'s mayor|mayor of <LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*)|<LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*)[\sa-zA-Z<>/1-9,]*is[\sa-zA-Z<>/1-9,]*(led|headed|run) by|mayor of ([A-Za-z]*), <LOCATION>([A-Za-z\s]*)</LOCATION>|([A-Za-z]*), <LOCATION>([A-Za-z\s]*)</LOCATION>[\sa-zA-Z<>/1-9,]*is[\sa-zA-Z<>/1-9,]*(led|headed|run) |mayor of ([A-Za-z]*), ([A-Za-z]*)"
            else:
                pattern = r"<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> [\sa-zA-Z<>/1-9,]*is led by|mayor of <LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>'s mayor|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>[\sa-zA-Z<>/1-9,]* is [\sa-zA-Z<>/1-9,]*(headed|led|run)|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>'s mayor"
        elif relation == "univ_in":
            pattern = r"school [\sa-zA-Z<>/1-9,]*in <LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>|college [\sa-zA-Z<>/1-9,]*in <LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>|university [\sa-zA-Z<>/1-9,]*in <LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> [\sa-zA-Z<>/1-9,]*school|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> [\sa-zA-Z<>/1-9,]*college|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> [\sa-zA-Z<>/1-9,]*university|<LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*) is home|([A-Za-z]*), <LOCATION>([A-Za-z\s]*)</LOCATION> is home|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> in|<LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION> is home|based in <LOCATION>([A-Za-z\s]*)</LOCATION>, <LOCATION>([A-Za-z\s]*)</LOCATION>|located in <LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*)|in <LOCATION>([A-Za-z\s]*)</LOCATION>, ([A-Za-z]*)"
        elif relation == "year_of_birth":
            pattern = r"<PERSON>([\.A-Za-z\s\']*)</PERSON>[\sa-zA-Z<>/1-9,]*(was|is) born|<PERSON>([\.A-Za-z\s\']*)</PERSON>'s birthday|birthday of [\sa-zA-Z1-9,]*<PERSON>([\.A-Za-z\s\']*)</PERSON>"
        elif relation == "year_of_death":
            pattern = r"<PERSON>([\.A-Za-z\s\']*)</PERSON>[\sa-zA-Z<>/1-9,]*died|<PERSON>([\.A-Za-z\s\']*)</PERSON>[\sa-zA-Z<>/1-9,]*passed"

        if len(pattern) == 0:
            # Unknown relation: fixed guess kept from the original code.
            return 'Gene Autry'

        answer = self.getMatch(clue, pattern, relation)
        is_location_relation = relation == "mayor_of" or relation == "univ_in"
        if answer == "no match" and not is_location_relation:
            # Fall back to the text inside the first tag of the clue.
            stop_index = clue.find("</")
            start_index = clue.find(">")
            answer = clue[start_index + 1:stop_index]
        if (answer == "no match" and clue.count("</") > 1
                and is_location_relation):
            # Fall back to the text of the first two tags, joined "a, b".
            stop1 = clue.find("</")
            stop2 = clue.find("</", stop1 + 2)
            start1 = clue.find(">")
            # Skip the '>' that ends the first closing tag.
            temp = clue.find(">", start1 + 1)
            start2 = clue.find(">", temp + 1)
            answer = clue[start1 + 1:stop1] + ", " + clue[start2 + 1:stop2]
        return answer

    def parseClues(self, clues):
        """Parse each clue and return a list of parses, one for each clue."""
        parses = []
        for clue in clues:
            feature = self.feature_extractor(clue)
            clue_relation = self.classifer.classify(feature)
            clue_entity = self.getEntity(clue_relation, clue)
            parses.append(clue_relation + ':' + clue_entity)
        return parses

    #### You should not need to change anything after this point. ####

    def evaluate(self, parsed_clues, gold_parsed_clues):
        """Shows how the ClueParser model will score on the training/development data."""
        correct_relations = 0
        correct_parses = 0
        # Built-in zip works on both Python 2 and 3 (itertools.izip is 2-only).
        for parsed_clue, gold_parsed_clue in zip(parsed_clues,
                                                 gold_parsed_clues):
            split_parsed_clue = parsed_clue.split(":")
            split_gold_parsed_clue = gold_parsed_clue.split(":")
            if split_parsed_clue[0] == split_gold_parsed_clue[0]:
                correct_relations += 1
                # A leading "The "/"the " on the predicted entity is forgiven.
                if (split_parsed_clue[1] == split_gold_parsed_clue[1]
                        or split_parsed_clue[1]
                        == "The " + split_gold_parsed_clue[1]
                        or split_parsed_clue[1]
                        == "the " + split_gold_parsed_clue[1]):
                    correct_parses += 1
        print("Correct Relations: %d/%d" % (correct_relations,
                                            len(gold_parsed_clues)))
        print("Correct Full Parses: %d/%d" % (correct_parses,
                                              len(gold_parsed_clues)))
        print("Total Score: %d/%d" % (correct_relations + correct_parses,
                                      2 * len(gold_parsed_clues)))
Ejemplo n.º 6
0
class Naive_Bayes_Clasiffier:
    def __init__(self, top=None):
        '''Configure and populate the toplevel window of the classifier GUI.

           top is the toplevel containing window.  Although it defaults to
           None, it is used unconditionally below, so a real window must be
           passed.

           Creates the title label, the directory path frame with its entry
           and Browse button, the discretization bins frame with its entry,
           the Build and Classify buttons (both start disabled), a warning
           label for invalid bin counts and an empty menu bar.'''
        _bgcolor = '#d9d9d9'  # X11 color: 'gray85'
        _fgcolor = '#000000'  # X11 color: 'black'
        _compcolor = '#d9d9d9'  # X11 color: 'gray85'
        _ana1color = '#d9d9d9'  # X11 color: 'gray85'
        _ana2color = '#ececec'  # Closest X11 color: 'gray92'
        font11 = "-family {Segoe UI} -size 30 -weight bold -slant "  \
            "roman -underline 0 -overstrike 0"
        # Global ttk style shared by all themed widgets.
        self.style = ttk.Style()
        if sys.platform == "win32":
            self.style.theme_use('winnative')
        self.style.configure('.', background=_bgcolor)
        self.style.configure('.', foreground=_fgcolor)
        self.style.configure('.', font="TkDefaultFont")
        self.style.map('.',
                       background=[('selected', _compcolor),
                                   ('active', _ana2color)])

        # Fixed-size window: 500x280 pixels at screen offset (428, 169).
        top.geometry("500x280+428+169")
        top.title("Naive Bayes Classifer")
        top.configure(background="#3e5d93")

        # Title label across the top of the window.
        self.title_naiveBayes = tk.Label(top)
        self.title_naiveBayes.place(relx=0.08, rely=0.0, height=60, width=416)
        self.title_naiveBayes.configure(activebackground="#f0f0f0")
        self.title_naiveBayes.configure(activeforeground="white")
        self.title_naiveBayes.configure(background="#3e5d93")
        self.title_naiveBayes.configure(disabledforeground="#a3a3a3")
        self.title_naiveBayes.configure(font=font11)
        self.title_naiveBayes.configure(foreground="white")
        self.title_naiveBayes.configure(text='''Naive Bayes Classifier''')

        # Frame that visually groups the directory entry and Browse button.
        self.directory_frame = tk.LabelFrame(top)
        self.directory_frame.place(relx=0.06,
                                   rely=0.214,
                                   relheight=0.232,
                                   relwidth=0.88)
        self.directory_frame.configure(relief='groove')
        self.directory_frame.configure(foreground="white")
        self.directory_frame.configure(text='''Directory Path''')
        self.directory_frame.configure(background="#3e5d93")
        self.directory_frame.configure(width=440)

        # Browse button: opens a directory chooser.
        self.Browse_button = ttk.Button(top)
        self.Browse_button.place(relx=0.71, rely=0.304, height=25, width=96)
        self.Browse_button.configure(takefocus="")
        self.Browse_button.configure(text='''Browse''')
        self.Browse_button.configure(width=96)
        self.Browse_button.configure(cursor="fleur")
        self.Browse_button.configure(command=self.folderBrowseAction)

        # Directory entry: validated whenever it loses keyboard focus.
        self.directory_textField = ttk.Entry(top)
        self.directory_textField.place(relx=0.08,
                                       rely=0.304,
                                       relheight=0.075,
                                       relwidth=0.592)
        self.directory_textField.configure(width=296)
        self.directory_textField.configure(takefocus="")
        self.directory_textField.configure(cursor="ibeam")
        self.directory_textField.bind("<FocusOut>",
                                      self.valuesCheckButtonAbillity)

        # Build button: disabled until both inputs have been validated.
        self.Build_button = ttk.Button(top)
        self.Build_button.place(relx=0.61, rely=0.5, height=55, width=166)
        self.Build_button.configure(takefocus="")
        self.Build_button.configure(text='''Build''')
        self.Build_button.configure(width=166)
        self.Build_button.configure(state='disable')
        self.Build_button.configure(command=self.startBuild)

        # Frame that visually groups the bins entry.
        self.Dicritezation_frame = tk.LabelFrame(top)
        self.Dicritezation_frame.place(relx=0.06,
                                       rely=0.5,
                                       relheight=0.232,
                                       relwidth=0.32)
        self.Dicritezation_frame.configure(relief='groove')
        self.Dicritezation_frame.configure(foreground="white")
        self.Dicritezation_frame.configure(text='''Discretization Bins''')
        self.Dicritezation_frame.configure(background="#3e5d93")
        self.Dicritezation_frame.configure(width=160)

        # Bins entry: validated on every key press.
        self.bins_textField = ttk.Entry(top)
        self.bins_textField.place(relx=0.08,
                                  rely=0.589,
                                  relheight=0.075,
                                  relwidth=0.252)
        self.bins_textField.configure(takefocus="")
        self.bins_textField.configure(cursor="ibeam")
        self.bins_textField.bind("<Key>", self.bindListenerVal)

        # Classify button: disabled until a build has succeeded.
        self.Classify_button = ttk.Button(top)
        self.Classify_button.place(relx=0.61, rely=0.714, height=55, width=166)
        self.Classify_button.configure(takefocus="")
        self.Classify_button.configure(text='''Classify''')
        self.Classify_button.configure(width=166)
        self.Classify_button.configure(command=self.startClassify)
        self.Classify_button.configure(state='disable')

        # Warning label for invalid bin counts.  Its foreground starts equal
        # to its background, so the text is invisible until the colour is
        # changed by bindListenerVal.
        self.inputBindAlret = ttk.Label(top)
        self.inputBindAlret.place(relx=0.05, rely=0.75, height=35, width=202)
        self.inputBindAlret.configure(background="#3e5d93")
        self.inputBindAlret.configure(foreground="#3e5d93")
        self.inputBindAlret.configure(relief='flat')
        self.inputBindAlret.configure(
            text=
            '''The bins number is not valid!\nplease enter only positive number'''
        )
        self.inputBindAlret.configure(width=202)

        # Validation state of the two inputs; Build needs both to be True.
        self.bindValOk = False
        self.directoryValOk = False

        # Empty menu bar attached to the window.
        self.menubar = tk.Menu(top,
                               font="TkMenuFont",
                               bg=_bgcolor,
                               fg=_fgcolor)
        top.configure(menu=self.menubar)

    def valuesCheckButtonAbillity(self, event):
        if (os.path.isdir(self.directory_textField.get()) == False):
            self.directoryValOk = False
            if (len(self.directory_textField.get()) != 0):
                messagebox.showerror('oops!',
                                     'Please insert a valid Directory path!')
                self.directory_textField.delete(0, 'end')

        else:
            if ((os.path.exists(self.directory_textField.get() + "/train.csv")
                 == False) or (os.path.exists(self.directory_textField.get() +
                                              "/test.csv") == False)
                    or (os.path.exists(self.directory_textField.get() +
                                       "/Structure.txt") == False)):
                self.directoryValOk = False
                if (len(self.directory_textField.get()) != 0):
                    messagebox.showerror(
                        'oops!',
                        '~~ MISSING FILES ~~\n\nMake sure that the files:\ntrain.csv,\ntest.csv\nStructure.txt \nare exists in this path!'
                    )
            else:
                self.directoryValOk = True
                if (self.bindValOk == True):
                    self.Build_button.configure(state='normal')
                    self.NB = NaiveBayes()

    def startBuild(self):
        """Build the classifier from the chosen directory and bin count.

        On success the Classify button is enabled.  If the training data,
        test data or attribute list came out empty, the user is warned
        instead.  Any error raised while building is reported in a dialog
        rather than propagated.
        """
        try:
            self.NB.build(self.directory_textField.get(),
                          self.bins_textField.get())
            # Compare the lengths themselves: the previous
            # ``str(len(...)) == 0`` compared a string with an int and was
            # therefore never true, so empty inputs were reported as success.
            if (len(self.NB.train_Data) == 0
                    or len(self.NB.test_Data) == 0
                    or len(self.NB.attributes) == 0):
                messagebox.showinfo(
                    "OOPS!",
                    "~~ EMPTY FILES ~~\nOne of the Files is Empty!\nThe algorithm cannot run like this!\nCheck it and click again"
                )
            else:
                messagebox.showinfo(
                    "Update From Build",
                    "Building classifier using train-set is done!")
                self.Classify_button.configure(state='normal')

        # ``Exception`` (not a bare except) so that SystemExit and
        # KeyboardInterrupt are not swallowed by the error dialog.
        except Exception:
            messagebox.showerror(
                "Crash!",
                "Something went worng on the algorithm, please click again! ")

    def startClassify(self):
        """Run the built classifier, tell the user it finished, then exit."""
        self.NB.classify()
        done_title = "All Done"
        done_text = "It's Done! a file added to your directory with the answers! "
        messagebox.showinfo(done_title, done_text)
        # The application ends once the results have been produced.
        sys.exit()

    def folderBrowseAction(self):
        """Let the user pick a directory, show it in the entry and validate it.

        The chosen path replaces whatever the directory entry contained.
        Validation (directory exists, required files present, Build button
        state) is delegated to ``valuesCheckButtonAbillity`` so that the
        typed-in and browsed-to paths follow exactly the same rules.
        """
        # A hidden helper window is created for the duration of the dialog.
        dirWind = tk.Tk()
        dirWind.withdraw()
        path = askdirectory()
        if len(str(self.directory_textField.get())) != 0:
            self.directory_textField.delete(0, 'end')
        self.directory_textField.insert(0, str(path))
        dirWind.destroy()

        # Same checks as the <FocusOut> handler; it ignores its event.
        self.valuesCheckButtonAbillity(None)

    def bindListenerVal(self, event):
        #(not str(event.char).isdigit()
        if (event.keycode == 8):
            #print("reves")
            self.word = (str(self.bins_textField.get()))[:-1]
        else:
            try:
                self.word = str(self.bins_textField.get() + str(event.char))
            except:
                self.word = str(self.bins_textField.get())

        #print("shit:" + self.word)
        #print("shit:" + str(len(self.word)))
        if ((self.word.isdigit() and int(self.word) > 0)
                or (len(self.word)
                    == 0)):  # and len(str(self.bins_textField.get())) == 0)):
            self.inputBindAlret.configure(foreground="#3e5d93")
            if ((self.word.isdigit())):
                self.bindValOk = True
                if (self.directoryValOk == True):
                    self.Build_button.configure(state='normal')
                    self.NB = NaiveBayes()
            else:
                self.bindValOk = False
                self.Build_button.configure(state='disable')

        else:
            self.inputBindAlret.configure(foreground="#ffffffffffff")
            self.bindValOk = False

            self.Build_button.configure(state='disable')
Ejemplo n.º 7
0
class ClueParser:
    """Turn clues into ``"<relation>:<entity>"`` parse strings.

    The relation is predicted by a ``NaiveBayes`` classifier fed with the
    words of the clue; the entity is picked with the module-level patterns
    ``location_finder``, ``entity_finder`` and ``capital_finder``.
    """

    def __init__(self):
        self.classifier = NaiveBayes()

    def findEntities(self, clue):
        """Return the candidate entities found in ``clue``.

        ``location_finder`` matches come first, rendered as
        ``"<group 0>, <group 1>"``, followed by the ``entity_finder`` matches
        unchanged.
        """
        entities = [match[0] + ", " + match[1]
                    for match in re.findall(location_finder, clue)]
        entities.extend(re.findall(entity_finder, clue))
        return entities

    def findCapEntity(self, clue):
        """Return all ``capital_finder`` matches joined by spaces ("" if none)."""
        return " ".join(re.findall(capital_finder, clue))

    def parseClues(self, clues):
        """Parse each clue and return a list of parses, one for each clue.

        The entity is the last result of ``findEntities``; when that list is
        empty the result of ``findCapEntity`` is used instead.
        """
        parses = []
        for clue in clues:
            klass = self.classifier.classify(self.findFeatures(clue))
            entities = self.findEntities(clue)
            entity = entities[-1] if entities else self.findCapEntity(clue)
            parses.append(klass + ":" + entity)
        return parses

    def findFeatures(self, clue):
        """Return the classifier features (a list of words) for ``clue``.

        The clue is first rebuilt by concatenating the first element of every
        ``capital_remover`` match and is then split on single spaces.  The
        rebuilt text is printed whenever a ``<`` character is met.
        """
        clue = "".join(match[0] for match in re.findall(capital_remover, clue))
        words = []
        start = 0
        last = len(clue) - 1
        for i, char in enumerate(clue):
            if char == "<":
                print(clue)
            # NOTE(review): when the text does not end with a space, the final
            # word is cut at ``last`` and therefore loses its last character.
            # Kept as is because training and classification share this
            # tokenisation -- confirm whether it is intentional.
            if (char == " " or i == last) and i != start:
                word = clue[start:i]
                # A run of spaces leaves a leading blank on the next slice.
                if word[0] == " ":
                    word = word[1:]
                words.append(word)
                start = i + 1
        return words

    def train(self, clues, parsed_clues):
        """Trains the model on clues paired with gold standard parses.

        The class of each gold parse is the text before its first ``:``;
        ``str.index`` raises ``ValueError`` for a parse without a colon.
        """
        klasses = [answer[:answer.index(":")] for answer in parsed_clues]
        features = [self.findFeatures(clue) for clue in clues]
        self.classifier.addExamples(features, klasses)

    def evaluate(self, parsed_clues, gold_parsed_clues):
        """Shows how the ClueParser model will score on the training/development data.

        A relation is correct when the text before the first ``:`` matches the
        gold parse.  A full parse is additionally correct when the remainder
        equals the gold entity, optionally prefixed with "The " or "the ".
        Three summary lines are printed; nothing is returned.
        """
        correct_relations = 0
        correct_parses = 0
        for parsed_clue, gold_parsed_clue in zip(parsed_clues, gold_parsed_clues):
            # Split on the first colon only (as train() does) so that entities
            # which themselves contain ':' are compared in full.
            relation, _, entity = parsed_clue.partition(":")
            gold_relation, _, gold_entity = gold_parsed_clue.partition(":")
            if relation != gold_relation:
                continue
            correct_relations += 1
            if entity in (gold_entity,
                          "The " + gold_entity,
                          "the " + gold_entity):
                correct_parses += 1
        total = len(gold_parsed_clues)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Correct Relations: %d/%d" % (correct_relations, total))
        print("Correct Full Parses: %d/%d" % (correct_parses, total))
        print("Total Score: %d/%d" % (correct_relations + correct_parses, 2 * total))
Ejemplo n.º 8
0
  ))
# Two more hand-written sample posts, converted with getFeatures() and queued
# in `tests` for the classification pass at the bottom of the script.
tests.append(getFeatures("""
Are there any plans to provide access to sample or previous exam papers?
 
I would be interested in knowing the format and style of the questions.

"""))
tests.append(getFeatures("""
I'd say, it all depends on your implementation. If your implementation is very tuned to perform well on the dev queries, it may or may not perform well on the other set(as it's tuned for the given queries). If it's not, then it most likely will produce results in the same ball park. I'd suggest to close all loopholes (if there are any) in your code. Try running it for half the queries and tune it, then run it for the whole set and compare and retune.

"""))

# Hand the forum-contents query and the classifier to getdata_test()
# (defined elsewhere in this script).
getdata_test("select body,created_at,forum_uid,sk from EdxForum.contents", nb)

# Classify every queued sample, echoing its features followed by the result.
for sample in tests:
    prediction = nb.classify(sample)
    print(sample)
    print(prediction)






Ejemplo n.º 9
0
def pipeline(fileName, nameForFiles, protectedAttribute, trueLabels, feldman,
             bayes, dataName):
    """Run DI detection, optional Feldman repair, a Bayes classifier and metrics.

    Results are written to ``results/<nameForFiles>.txt``; pickles and CSVs
    of the intermediate data sets are saved under ``pickledObjects/`` and
    ``dataCSVs/`` using the same base name.

    Args:
        fileName: data file handed to ``DataSet.loadData``.
        nameForFiles: base name (no extension) for every output file.
        protectedAttribute: passed through to ``DataSet.loadData``.
        trueLabels: passed through to ``DataSet.loadData``.
        feldman: the string ``"yes"`` enables the Feldman repair step; any
            other value skips it.
        bayes: ``"naive"`` selects ``NaiveBayes``, ``"modified"`` selects
            ``ModifiedBayes``; any other value selects ``TwoBayes``.
        dataName: passed through to ``RepairData.runRepair``.
    """
    # Load data into DataSet
    ds = DataSet()
    ds.loadData(fileName, protectedAttribute, trueLabels)

    # The results file is managed by ``with`` so the handle is closed even if
    # one of the steps below raises.
    with open("results/" + nameForFiles + ".txt", "w") as f:
        print("Starting DI detection")
        DIresult = detectDI(ds)
        f.write("DI results on original data: " + DIresult)

        # Feldman repair algorithm
        currDataSet = ds
        if feldman == "yes":
            print("Starting Feldman")
            repair = RepairData()
            repair.runRepair(ds.fileName,
                             ds.protectedAttribute,
                             ds.trueLabels,
                             dataName,
                             noiseScale=.01)
            # Pickle the Feldman-repaired data
            repair.dataSetCopy.savePickle("pickledObjects/repairedData/" +
                                          nameForFiles)
            repair.dataSetCopy.saveToCsv("dataCSVs/repairedData/" +
                                         nameForFiles + ".csv")
            # Everything downstream works on the repaired copy.
            currDataSet = repair.dataSetCopy

            print("Starting post-Feldman DI detection")
            postFeldmanDIresult = detectDI(repair.dataSetCopy)
            f.write("DI results after Feldman: " + postFeldmanDIresult)

        # Split data into test and training set
        currDataSet.splitIntoTrainTest()
        print("Split into test train")

        # Bayes
        if bayes == "naive":
            print("Starting Naive Bayes")
            bayesObject = NaiveBayes()
            bayesObject.train(currDataSet, bayesObject.model)
            bayesObject.classify(currDataSet, "test")
            print("Completed Naive Bayes")

        elif bayes == "modified":
            bayesObject = ModifiedBayes()
            bayesObject.train(currDataSet, 1)
            bayesObject.classify(currDataSet, "test")

        else:
            bayesObject = TwoBayes()
            bayesObject.train(currDataSet, 1)
            bayesObject.classify(currDataSet, "test")

        currDataSet.savePickle("pickledObjects/classifiedData/" + nameForFiles)
        currDataSet.saveToCsv("dataCSVs/classifiedData/" + nameForFiles +
                              ".csv")

        # Metrics
        print("Starting metrics")
        metrics = Metrics()
        metrics.runAllMetrics(f, currDataSet, bayes, bayesObject)
        print("Completed metrics")
Ejemplo n.º 10
0
def _reportPredictions(preds, instances):
	"""Print one line per prediction and return how many were correct.

	Each line holds the predicted label (``pred[0]``), the actual label (last
	field of the matching instance) and ``pred[1]``, separated by spaces.
	"""
	corCount = 0
	for i, pred in enumerate(preds):
		actual = instances[i][-1]
		print("%s %s %s" % (pred[0], actual, pred[1]))
		if pred[0] == actual:
			corCount += 1
	return corCount


def main(argv):
	"""Train and evaluate a classifier selected on the command line.

	Reads ``sys.argv`` (the ``argv`` parameter itself is not consulted):
	``<train file> <test file> <mode>``.  Mode ``"n"`` runs ``NaiveBayes``,
	mode ``"t"`` runs ``TAN``; any other mode does nothing after loading the
	data.  With a wrong argument count a message is printed and the process
	exits via ``sys.exit()``.
	"""
	# validate input
	if len(sys.argv) != 4:
		print("incorrect input supplied")
		sys.exit()
	trainfile, testfile, mode = sys.argv[1:4]

	# ingest data
	trainset = readFile(trainfile)
	testset = readFile(testfile)

	if mode == "n":
		# Class distribution of the training data.  This tally had been
		# commented out while the prints below still used y1/y2, which made
		# this mode fail with a NameError.
		y1 = 0
		y2 = 0
		for instance in trainset.instances:
			if instance[-1] == trainset.labels[0]:
				y1 += 1
			else:
				y2 += 1

		print(trainset.attributeValues)
		print("%s %s" % (trainset.labels[0], y1))
		print("%s %s" % (trainset.labels[1], y2))

		bayes = NaiveBayes(trainset, testset)
		bayes.train(trainset.instances)

		preds = bayes.classify(testset.instances)
		print(_reportPredictions(preds, testset.instances))

	if mode == "t":
		# NOTE(review): trainset is passed twice here, whereas NaiveBayes
		# above receives (trainset, testset) -- confirm this is intended.
		tan = TAN(trainset, trainset)
		edges = tan.initializeGraph()
		prim = tan.growPrim(edges)

		tan.setParentList(prim[1])
		# One line per attribute: its first listed parent (if any), then 'class'.
		for attrib in trainset.attributes:
			if tan.parentList[attrib]:
				print("%s %s class" % (attrib, tan.parentList[attrib][0]))
			else:
				print("%s class" % (attrib,))
		preds = tan.classify(testset.instances)
		print('')
		corCount = _reportPredictions(preds, testset.instances)

		print('')
		print(corCount)