def main(domainxml, trainingsetcsv, manifold_value, restrictionstxt,
         positive_class="Obama"):
    """Cross-validate on a training set and print a metrics report.

    The training CSV is read in full, parsed through ``dataset.read`` and
    handed to ``sampling.cross_validate``.  The report printed to stdout
    contains, in order: the overall confusion matrix, recall, precision,
    pf and f-measure for ``positive_class``, overall accuracy, accuracy
    averaged over the folds, overall error rate, and error rate averaged
    over the folds.

    Args:
        domainxml: Not used by this function.
        trainingsetcsv: Object exposing ``read()`` that returns the CSV text.
        manifold_value: Forwarded unchanged as the third argument of
            ``sampling.cross_validate`` (presumably the number of folds --
            confirm against that function).
        restrictionstxt: Forwarded to ``dataset.restrictions_from_text``.
        positive_class: Class label for which recall, precision, pf and
            f-measure are reported.  Defaults to ``"Obama"``, the value that
            was previously hard-coded, so existing callers are unaffected.
    """
    restrictions = dataset.restrictions_from_text(restrictionstxt)

    # The literal True sits in the second positional slot of dataset.read
    # (presumably "the CSV has a label column" -- confirm against dataset.read).
    cols, data = dataset.read(trainingsetcsv.read(), True, restrictions)
    expected, actual, expected_hunked, actual_hunked = sampling.cross_validate(
        data, list(cols), manifold_value)

    def fold_average(metric):
        # Mean of a two-argument metric evaluated on each fold's
        # (expected, actual) pair.
        per_fold = [metric(e, a)
                    for e, a in zip(expected_hunked, actual_hunked)]
        return sum(per_fold) / len(expected_hunked)

    print("Overall confusion matrix:")
    print(sampling.confusion_matrix(expected, actual))

    # Metrics that are computed with respect to one class of interest.
    class_metrics = (
        ("recall", sampling.recall),
        ("precision", sampling.precision),
        ("pf", sampling.pf),
        ("f-measure", sampling.f_measure),
    )
    for title, metric in class_metrics:
        print("\nOverall {}:".format(title))
        print(metric(expected, actual, positive_class))

    print("\nOverall accuracy:")
    print(sampling.accuracy(expected, actual))

    print("\nAverage accuracy:")
    print(fold_average(sampling.accuracy))

    print("\nOverall error rate:")
    print(sampling.error_rate(expected, actual))

    print("\nAverage error rate:")
    print(fold_average(sampling.error_rate))
def main(domainxml, trainingsetcsv, restrictionstxt):
    """Induce a decision tree from a training set and write it to stdout.

    The training CSV is read in full and parsed through ``dataset.read``;
    the resulting data is passed to ``c45.run`` and the tree it returns is
    serialized with ``stringify_tree`` and written to the binary stdout
    buffer.

    Args:
        domainxml: Not used by this function.
        trainingsetcsv: Object exposing ``read()`` that returns the CSV text.
        restrictionstxt: Forwarded to ``dataset.restrictions_from_text``.
    """
    restrictions = dataset.restrictions_from_text(restrictionstxt)

    # NOTE(review): other callers of dataset.read pass a has-label flag as
    # the second positional argument; confirm that this two-argument call
    # matches the current dataset.read signature.
    cols, data = dataset.read(trainingsetcsv.read(), restrictions)

    # Per the original author's notes, training expects `data` as a list of
    # ([train data], class) pairs and per-column value sets that do NOT
    # include the class label.  Columns are passed as (index, column) pairs.
    # The trailing 0 is presumably a threshold -- confirm against c45.run.
    tree = c45.run(data, list(enumerate(cols)), 0)

    # Written through sys.stdout.buffer, so stringify_tree must return a
    # bytes-like object rather than str.
    tree_xml = stringify_tree(tree)
    sys.stdout.buffer.write(tree_xml)
def main(to_classify_csv, decision_tree_xml, restrictionstxt, has_label_column):
    """Classify each record of a CSV with a stored decision tree and report.

    The tree is rebuilt from ``decision_tree_xml`` via ``model.build_tree``
    and every record returned by ``dataset.read`` is classified with it.

    When ``has_label_column`` is truthy, a summary is printed: record count,
    number of correct and incorrect predictions, accuracy, error rate and
    the confusion matrix.  Otherwise each record's first element is printed
    next to its predicted class, one record per line.

    Args:
        to_classify_csv: Object exposing ``read()`` that returns the CSV text.
        decision_tree_xml: Object exposing ``read()`` that returns the
            serialized tree.
        restrictionstxt: Forwarded to ``dataset.restrictions_from_text``.
        has_label_column: Forwarded to ``dataset.read`` and used to choose
            between the summary report and the per-record listing.
    """
    # Open question from the original author: could the presence of a label
    # column be inferred (e.g. from the number of unique edge labels in the
    # tree) instead of being supplied by the caller?
    tree = model.build_tree(decision_tree_xml.read())
    restrictions = dataset.restrictions_from_text(restrictionstxt)
    cols, data = dataset.read(
        to_classify_csv.read(), has_label_column, restrictions)

    # Each record is indexed as (features, label).
    predictions = [tree.classify(record[0], cols) for record in data]
    truths = [record[1] for record in data]

    if not has_label_column:
        # No ground truth available: just list what was predicted.
        for record, guess in zip(data, predictions):
            print(record[0], guess)
        return

    print('Records:', len(data))

    outcomes = list(zip(predictions, truths))
    hits = sum(1 for guess, truth in outcomes if guess == truth)
    print('Correctly classified:', hits)

    misses = sum(1 for guess, truth in outcomes if guess != truth)
    print('Incorrectly classified:', misses)

    print('Accuracy:', sampling.accuracy(truths, predictions))
    print('Error:', sampling.error_rate(truths, predictions))
    print('Confusion matrix:')
    print(sampling.confusion_matrix(truths, predictions))