Example #1
from math import log

# FileProcessor, test_filepath, label_mappings, and output_filepath are
# defined elsewhere in the project.
def test_naive_bayes(hypotheses):
    fp = FileProcessor(test_filepath, ' ')
    parsed_lines = fp.get_lines_as_array()
    results = []
    for row in parsed_lines:
        exclude_label = row[1:]  # the row minus its label in the first column
        max_sum = -float('Inf')
        max_label = -1

        # Pick the label with the highest log-posterior:
        # log P(label) + sum over words of log P(word | label).
        for label in label_mappings:
            label_instance = hypotheses[label]
            log_prior = log(label_instance.get_prior(), 2)
            densities = label_instance.get_densities()
            log_sum = 0

            for word in exclude_label:
                log_sum += log(densities[word], 2)

            cur_sum = log_sum + log_prior
            if cur_sum > max_sum:
                max_sum = cur_sum
                max_label = label

        results.append(label_mappings[max_label])

    fp.generate_output(output_filepath, results)
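
The function above assumes each entry of hypotheses is a per-label object exposing get_prior() and get_densities(). A minimal sketch of such an object is shown below; the class name and the Laplace-smoothed counting are illustrative assumptions, not the project's actual training code.

class LabelHypothesis:
    # Hypothetical per-label model: a class prior and smoothed per-word densities.
    def __init__(self, prior, word_counts, vocabulary):
        self.prior = prior
        total = sum(word_counts.get(w, 0) for w in vocabulary)
        # Laplace smoothing keeps every density positive, so log(densities[word]) is always defined.
        self.densities = {w: (word_counts.get(w, 0) + 1.0) / (total + len(vocabulary))
                          for w in vocabulary}

    def get_prior(self):
        return self.prior

    def get_densities(self):
        return self.densities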
Example #2
# FileProcessor and KNNProcess are defined elsewhere in the project.
def process_data(training_file, testing_file, training_output, testing_output, attribute_descriptors):
    delimiter = ','
    training_file_processor = FileProcessor(training_file, delimiter)
    testing_file_processor = FileProcessor(testing_file, delimiter)
    training_lines = training_file_processor.get_lines_as_array()
    testing_lines = testing_file_processor.get_lines_as_array()

    # Impute missing values and normalize over the combined training and testing data.
    all_lines = training_lines + testing_lines
    knn_processor = KNNProcess(all_lines, attribute_descriptors)
    imputed_lines = [knn_processor.replace_missing_line(line_no) for line_no in range(len(all_lines))]
    normalized_lines = [knn_processor.normalize_line(line) for line in imputed_lines]

    # Write the processed lines back to their respective file processors.
    for line_no, line in enumerate(normalized_lines[:len(training_lines)]):
        training_file_processor.set_line(line_no, line)
    for line_no, line in enumerate(normalized_lines[len(training_lines):]):
        testing_file_processor.set_line(line_no, line)

    if training_file_processor.generate_output(training_output) and testing_file_processor.generate_output(testing_output):
        print('Success!')
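
KNNProcess is not shown here; normalize_line presumably rescales each attribute after imputation. A minimal sketch of min-max normalization, one common choice, is below; the helper name and the assumption of purely numeric attributes with per-column minima and maxima taken over all_lines are illustrative, not the project's actual API.

def min_max_normalize(line, col_mins, col_maxs):
    # Rescale each numeric attribute to [0, 1]; constant columns map to 0.0 to avoid dividing by zero.
    normalized = []
    for value, lo, hi in zip(line, col_mins, col_maxs):
        if hi == lo:
            normalized.append(0.0)
        else:
            normalized.append((value - lo) / (hi - lo))
    return normalized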
Example #3
# FileProcessor, testing_data_filepath, and labels_output_filepath are
# defined elsewhere in the project.
def generate_labels():
    fp = FileProcessor(testing_data_filepath, ' ')
    rows = fp.parse_input_file()
    # The expected label is the first column of each row.
    expected = [row[0] for row in rows]

    if fp.generate_output(labels_output_filepath, expected):
        return True
    return False
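
Once the expected labels are written out, they can be compared against a model's predictions; the helper below is a small illustrative sketch, not part of the original project.

def accuracy(expected, predicted):
    # Fraction of positions where the predicted label matches the expected one.
    matches = sum(1 for e, p in zip(expected, predicted) if e == p)
    return matches / len(expected) if expected else 0.0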
Example #4
# FileProcessor, sigmoid, get_labels, testing_data_filepath, and
# output_filepath are defined elsewhere in the project.
def test_logistic_regression(w):
    if not generate_labels():
        return
    fp = FileProcessor(testing_data_filepath, ' ')
    rows = fp.parse_input_file()
    output = []
    expected = []
    labels = get_labels()

    for row in rows:
        expected.append(row[0])
        row = row[1:]  # drop the label; the rest are 'feature_id:value' strings

        # Weighted sum: the bias weight w[0] plus the weight of each active feature.
        sum_val = w[0]
        for feature in row:
            feature_id = int(feature.split(':')[0])
            sum_val += w[feature_id]

        # Classify by thresholding the sigmoid of the weighted sum at 0.5.
        if sigmoid(sum_val) >= 0.5:
            output.append(labels[0])
        else:
            output.append(labels[1])

    if fp.generate_output(output_filepath, output):
        print('Successfully generated predictions.lr')
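
The sigmoid used above is not shown in the snippet; the standard logistic function is sigmoid(x) = 1 / (1 + exp(-x)). A minimal, numerically stable version is sketched below, under the assumption that the project defines it the same way.

from math import exp

def sigmoid(x):
    # Standard logistic function; the branch avoids overflow in exp() for large negative x.
    if x >= 0:
        return 1.0 / (1.0 + exp(-x))
    z = exp(x)
    return z / (1.0 + z)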