def main():
    """Train the spam/ham classifier and report its accuracy on a test folder.

    Reads three command-line arguments from ``sys.argv``:
      1. the folder of emails to classify,
      2. the folder of spam training emails,
      3. the folder of ham training emails.

    The true label of each test email is derived from its file name: a base
    name containing ``'ham'`` counts as ham, anything else counts as spam.
    Progress (a running index and the file name) is printed for every test
    email, followed by a one-line summary of correct classifications.

    Exits with status 1 after printing the usage message when the number of
    arguments is wrong.
    """
    ### Read arguments
    if len(sys.argv) != 4:
        print(USAGE % sys.argv[0])
        # Without stopping here the code below would either crash on a
        # missing argument or silently run with unexpected ones.
        sys.exit(1)
    testing_folder = sys.argv[1]
    (spam_folder, ham_folder) = sys.argv[2:4]

    ### Learn the distributions
    print("Training...")
    # Order matters: index 0 is spam, index 1 is ham.
    file_lists = [util.get_files_in_folder(folder)
                  for folder in (spam_folder, ham_folder)]
    (log_probabilities_by_category, log_priors_by_category) = \
        learn_distributions(file_lists)

    # Here, columns and rows are indexed by 0 = 'spam' and 1 = 'ham'
    # rows correspond to true label, columns correspond to guessed label
    performance_measures = np.zeros([2, 2])

    ### Classify and measure performance
    print("Testing...")
    test_files = util.get_files_in_folder(testing_folder)
    for idx, filename in enumerate(test_files, 1):
        print(idx)
        print(filename)

        ## Classify
        label = classify_email(filename,
                               log_probabilities_by_category,
                               log_priors_by_category)

        ## Measure performance
        # Use the filename to determine the true label
        base = os.path.basename(filename)
        # Convert to int explicitly: NumPy interprets Python bools in an
        # index as boolean masks, not as the integers 0/1, which would
        # update the wrong cells (or none at all).
        true_index = int('ham' in base)
        guessed_index = int(label == 'ham')
        performance_measures[true_index, guessed_index] += 1

    template = "You correctly classified %d out of %d spam emails, and %d out of %d ham emails."
    # Correct counts are on the diagonal
    correct = np.diag(performance_measures)
    # totals are obtained by summing across guessed labels
    totals = np.sum(performance_measures, 1)
    print(template % (correct[0], totals[0], correct[1], totals[1]))
def SplitMultipageTiff(directory):
    """Split every ``.tif`` file found in *directory* into one file per page.

    For each path returned by ``get_files_in_folder(directory)`` that ends
    with ``.tif``, runs the external ``convert`` command with an output
    pattern of ``<name>-%d.tif`` (the ``%d`` is passed through literally for
    ``convert`` to expand). The exit status of ``convert`` is not checked.
    """
    suffix = '.tif'
    for path in get_files_in_folder(directory):
        if not path.endswith(suffix):
            continue
        # Rewrite only the trailing extension; str.replace would also alter
        # any earlier '.tif' occurring in a directory or file name.
        outfile = path[:-len(suffix)] + '-%d.tif'
        # Pass the argument list directly (no shell): with shell=True on
        # POSIX only 'convert' would be executed and the file names would be
        # handed to the shell itself instead of to convert.
        subprocess.call(['convert', path, outfile])
plt.ylabel('Logistic Regression Cost') plt.show() return theta if __name__ == '__main__': ### Read arguments if len(sys.argv) != 4: print(USAGE % sys.argv[0]) testing_folder = sys.argv[1] (spam_folder, ham_folder) = sys.argv[2:4] file_lists = [] for folder in (spam_folder, ham_folder): if NUM_EXAMPLES > 0: file_lists.append(util.get_files_in_folder(folder)[:NUM_EXAMPLES]) else: file_lists.append(util.get_files_in_folder(folder)) print("Extracting Features and Training...") theta, all_words = train_logistic(file_lists) # # Here, columns and rows are indexed by 0 = 'spam' and 1 = 'ham' # # rows correspond to true label, columns correspond to guessed label performance_measures = np.zeros([2, 2]) ### Classify and measure performance print("Testing...") idx = 1 for filename in (util.get_files_in_folder(testing_folder)): idx += 1 ## Classify
posterior = np.array(log_likelihoods) + np.array(log_prior_by_category) winner = np.argmax(posterior) return names[winner] if __name__ == '__main__': ### Read arguments if len(sys.argv) != 4: print USAGE % sys.argv[0] testing_folder = sys.argv[1] (spam_folder, ham_folder) = sys.argv[2:4] ### Learn the distributions print("Training...") file_lists = [] for folder in (spam_folder, ham_folder): file_lists.append(util.get_files_in_folder(folder)) (log_probabilities_by_category, log_priors_by_category) = \ learn_distributions(file_lists) # Here, columns and rows are indexed by 0 = 'spam' and 1 = 'ham' # rows correspond to true label, columns correspond to guessed label performance_measures = np.zeros([2,2]) ### Classify and measure performance print("Testing...") for filename in (util.get_files_in_folder(testing_folder)): ## Classify label = classify_message(filename, log_probabilities_by_category, log_priors_by_category, ['spam', 'ham'])
sum_log_prob_given_spam += math.log(p_d.get(word, 0.5)) sum_log_prob_given_ham += math.log(q_d.get(word, 0.5)) result = ("spam", "ham")[sum_log_prob_given_spam/sum_log_prob_given_ham > 0.99] return (result, [sum_log_prob_given_spam, sum_log_prob_given_ham]) if __name__ == '__main__': spam_folder = "data/spam" ham_folder = "data/ham" test_folder = "data/testing" # Get training data file_lists = [] for folder in (spam_folder, ham_folder): file_lists.append(util.get_files_in_folder(folder)) # Learn the distributions probabilities_by_category = learn_distributions(file_lists) # prior class distribution priors_by_category = [0.5, 0.5] # Store the classification results performance_measures = np.zeros([2,2]) # Explanation of performance_measures: # columns and rows are indexed by 0 = 'spam' and 1 = 'ham' # rows correspond to true label, columns correspond to guessed label # to be more clear, performance_measures = [[p1 p2] # [p3 p4]]
import sys
import subprocess

from util import get_files_in_folder

# Folder to scan, taken from the first command-line argument.
directory = sys.argv[1]

TIF_SUFFIX = '.tif'

# Split every .tif file in the folder into one output file per page by
# handing it to the external `convert` command with a '-%d' output pattern.
for i in get_files_in_folder(directory):
    if i.endswith(TIF_SUFFIX):
        # Rewrite only the trailing extension; str.replace would also alter
        # any earlier '.tif' occurring in a directory or file name.
        outfile = i[:-len(TIF_SUFFIX)] + '-%d.tif'
        args = ['convert', i, outfile]
        # No shell: with shell=True on POSIX a list runs only args[0] and
        # passes the remaining items to the shell, so convert would receive
        # no file names at all.
        subprocess.call(args)