def test_naive_bayes(hypothises):
    """Classify every row of the test file with the trained Naive Bayes model.

    For each row (first field is the true label, the rest are words) the
    label whose log-prior plus summed log word densities is largest wins.
    Predictions are written via FileProcessor.generate_output.
    """
    processor = FileProcessor(test_filepath, ' ')
    predictions = []
    for line in processor.get_lines_as_array():
        words = line[1:]  # drop the leading label field
        best_score = -float('Inf')
        best_label = -1
        for candidate in label_mappings:
            hypothesis = hypothises[candidate]
            word_densities = hypothesis.get_densities()
            # log-space posterior: sum of per-word log densities plus log prior
            score = sum(log(word_densities[word], 2) for word in words)
            score += log(hypothesis.get_prior(), 2)
            if score > best_score:
                best_score = score
                best_label = candidate
        predictions.append(label_mappings[best_label])
    processor.generate_output(output_filepath, predictions)
def process_data(training_file, testing_file, training_output, testing_output, attribute_descriptors): delimiter = ',' training_file_processor = FileProcessor(training_file, delimiter) testing_file_processor = FileProcessor(testing_file, delimiter) training_lines = training_file_processor.get_lines_as_array() testing_lines = testing_file_processor.get_lines_as_array() all_lines = training_lines + testing_lines knn_processor = KNNProcess(all_lines, attribute_descriptors) imputed_lines = map(lambda line_no: knn_processor.replace_missing_line(line_no), range(0, len(all_lines))) normalized_lines = map(lambda line: knn_processor.normalize_line(line), imputed_lines) for line_no, line in enumerate(normalized_lines[:len(training_lines)]): training_file_processor.set_line(line_no, line) for line_no, line in enumerate(normalized_lines[len(training_lines):]): testing_file_processor.set_line(line_no, line) if training_file_processor.generate_output(training_output) and testing_file_processor.generate_output(testing_output): print 'Success!'
def generate_labels():
    """Extract the first field (the label) of every test row and write them out.

    Returns True on a successful write, False otherwise. The original fell
    off the end and implicitly returned None on failure; callers test the
    result for truthiness, so an explicit bool is backward compatible.
    """
    fp = FileProcessor(testing_data_filepath, ' ')
    expected = [row[0] for row in fp.parse_input_file()]
    return bool(fp.generate_output(labels_output_filepath, expected))
def test_logistic_regression(w): if not generate_labels(): return fp = FileProcessor(testing_data_filepath, ' ') rows = fp.parse_input_file() output = [] expected = [] labels = get_labels() for row in rows: expected.append(row[0]) row = row[1:] sum_val = w[0] for feature in row: feature_id = int(feature.split(':')[0]) sum_val += w[feature_id] if sigmoid(sum_val) >= 0.5: output.append(labels[0]) else: output.append(labels[1]) if fp.generate_output(output_filepath, output): print 'Successfully generated predictions.lr'