def __init__(self, text=None, filename=None):
    """Initialize the document from raw text or from a file.

    text and filename are strings. When text is given it is used directly
    and filename is ignored; otherwise the contents of filename are read
    via Util.open_file. Either way the content is passed through
    self.strip before being stored on self.text.

    If the file cannot be opened, "File not found: <filename>" is printed
    and self.text is set to None so callers can test the attribute for
    truth.

    Raises:
        TypeError: if both text and filename are None.
    """
    if text is not None:
        self.text = self.strip(text)
        return

    if filename is None:
        # Previously this case tried to concatenate None onto the
        # "File not found" message and died with an uninformative
        # TypeError. Keep the exception type, but say what is wrong.
        raise TypeError("Document requires either text or filename")

    handle = Util.open_file(filename)
    # Other callers in this file treat a falsy return from Util.open_file
    # as "could not open"; do the same instead of calling .read() on it.
    if not handle:
        # Single-argument print(...) behaves the same as the Python 2
        # print statement and also parses under Python 3.
        print("File not found: " + filename)
        self.text = None
        return

    try:
        self.text = self.strip(handle.read())
    finally:
        # Release the file even if reading or stripping fails.
        handle.close()
def learn(self, dir, easy):
    """Train on a directory of labeled documents, printing running accuracy.

    dir is the path of a directory whose documents are named "n.txt" for
    n from 0 to (number of regular files in dir) - 1. In each document the
    first line is the correct heuristic and the second line is the text
    to classify.

    If easy is True, each document is added under its correct heuristic.
    If easy is False, it is added under the heuristic guessed by
    self.analyze, i.e. the analyzer assumes its own guess is right.

    For every document that could be opened, one progress line of the
    form "File i: Y. correct/total" (or "N") is printed. Documents that
    Util.open_file cannot open are skipped and not counted.
    """
    # NOTE: `dir` shadows the builtin, but it is part of the method's
    # signature, so the name is kept for callers that pass it by keyword.
    num_files = sum(os.path.isfile(path) for path in glob.glob(dir + "/*"))

    total_correct = 0  # correct guesses so far
    total_files = 0    # documents successfully opened so far

    for i in range(num_files):
        fn = dir + "/" + str(i) + ".txt"
        f = Util.open_file(fn)
        if not f:
            # Opening failed; nothing to score for this index.
            continue

        try:
            # Strip only the line terminator. Slicing off the last
            # character unconditionally would eat a real character when
            # the label line has no trailing newline, and would leave a
            # stray "\r" on CRLF files.
            correct_heuristic = f.readline().rstrip("\r\n")
            text = f.readline()
        finally:
            # The handle is not needed past this point; Document below
            # opens the file again by name.
            f.close()

        guessed_heuristic = self.analyze(None, text)

        # Token shown to the user: "Y" for a correct guess, "N" otherwise.
        was_correct = "N"
        if correct_heuristic == guessed_heuristic:
            was_correct = "Y"
            total_correct += 1
        total_files += 1

        # NOTE(review): the Document is built from the file path, not from
        # `text`. If Document stores the whole file, the label line ends
        # up in the training text too -- confirm this is intended.
        if easy:
            self.add(Document(None, fn), correct_heuristic, True)
        else:
            self.add(Document(None, fn), guessed_heuristic, True)

        # Single-argument print(...) produces the same output as the
        # Python 2 print statement and also parses under Python 3.
        print("File " + str(i) + ": " + was_correct + ". "
              + str(total_correct) + "/" + str(total_files))
def __init__(self, seed=None):
    """Build an analyzer from a seed file.

    seed is the path of a seed file made of two-line records: a line
    naming a heuristic, followed by a line holding the number of
    documents for that heuristic. Documents are loaded from
    "<heuristic>/<i>.txt" for i in range(count); documents whose text is
    falsy are left out.

    On success this sets self.dict (heuristic -> list of Document),
    self.word_counts and self.log_values, then calls self.consolidate and
    self.transform for every heuristic.

    Failure handling (unchanged from the previous behavior):
      * seed is None, the seed file cannot be opened, or a count line is
        not an integer: a message is printed and the constructor returns
        early, leaving the attributes above unset.
      * a heuristic line has no count line after it: a message is printed
        and the process is terminated with sys.exit().
    The seed file is now closed on every one of these paths.
    """
    # All print calls use the single-argument parenthesised form, which
    # behaves the same as the Python 2 print statement and also parses
    # under Python 3.
    if seed is None:
        print("Initializing an Analyzer object requires a seed. Please try again.")
        return

    # ---USER FEEDBACK---
    print("\nAttempting to read seed file...")

    seed_file = Util.open_file(seed)
    if not seed_file:
        print("Initialization failed; please verify that the seed exists then try again.")
        return

    # Named so it does not shadow the builtin `dict`.
    docs_by_heuristic = {}

    try:
        while True:
            # First line of a record: the heuristic name.
            line = seed_file.readline()
            if not line:
                break
            current_heuristic = Util.strip(line)

            # ---USER FEEDBACK---
            print("Reading files for heuristic \'" + current_heuristic + "\'...")

            # Second line of a record: how many documents to load.
            next_line = seed_file.readline()
            if not next_line:
                print("Incorrect seed structure. Exiting")
                sys.exit()

            try:
                num_files = int(next_line)
            except ValueError:
                print("Seed file is of incorrect format. Please try again.")
                return

            # Keep only the documents whose text was actually loaded.
            candidates = (
                Document(None, current_heuristic + "/" + str(i) + ".txt")
                for i in range(num_files)
            )
            docs_by_heuristic[current_heuristic] = [
                doc for doc in candidates if doc.text
            ]
    finally:
        # Runs for the normal exit, the early return and sys.exit() alike,
        # so the seed file handle is never leaked.
        seed_file.close()

    self.dict = docs_by_heuristic

    # ---USER FEEDBACK---
    print("Done reading files!\n")

    # Derived tables, filled in per heuristic by consolidate/transform.
    self.word_counts = {}
    self.log_values = {}
    for key in self.dict:
        self.consolidate(key)
        self.transform(key, True)

    # ---USER FEEDBACK---
    print("Analyzer object initialized!\n")