# Build a classification pipeline `P`: two data nodes feed a bank of SVM
# classifiers (one per value of C), whose outputs are scored by one metric node.

# data nodes
# NOTE(review): assumes `test` is a positive int, so `[:-test]` is everything
# except the last `test` samples and `[-test:]` is the last `test` samples.
P.add_nodes(
    # node for training data
    ("Training Data",
     LabeledTrainingDataNode(digits.data[:-test], digits.target[:-test])),
    # node for validation data
    ("Validation Data",
     LabeledTestDataNode(digits.data[-test:], digits.target[-test:])),
)

# create nodes using scikit-learn's support vector classification
# over a logarithmic range of C: range(-1, 3) gives C = 0.1, 1, 10, 100
for exp in range(-1, 3):
    c_value = 10 ** exp
    # use one vs all node for multi-classification
    # note use of node naming: the C value is embedded in the node name
    P.add_node("SVM %s" % c_value, SklearnOneVsRestNode(svm.SVC, C=c_value))

# create node for scoring the classifiers
P.add_node("Metrics", MetricNode([accuracy_score, f1_score], verbose=True))

# construct the DAG of the pipeline by defining layers using lists
# that are connected in full feedforward fashion, note the use of regexes
# in order to find nodes by their name
# (raw string so that `\d` reaches the regex engine without an
# invalid-escape warning; the pattern text itself is unchanged)
P << [["Training Data", "Validation Data"], r"SVM\d*", "Metrics"]
# the validation data is also wired directly into the metric node
P << ["Validation Data", "Metrics"]

# DAG representation:
#
#                   ----> SVM 0.1 ---->
# Training Data     ----> SVM 1   ----> Metrics <----|
# Validation Data   ----> SVM 10  ---->              |
#        |          ----> SVM 100 ---->              |
#        |-------------------------------------------|
# Build a pipeline `P` over the "pca.*" features: data nodes, a bank of
# classifier/estimator "Machine" nodes, a mass evaluator and cross validation.

# NOTE(review): the visible text started mid-call (argument tuples followed by
# a closing parenthesis). The `P.add_nodes(` opener below is reconstructed from
# those arguments -- confirm against the complete notebook.
# NOTE(review): assumes `test` is a positive int, so `[:-test]` is everything
# except the last `test` samples and `[-test:]` is the last `test` samples.
P.add_nodes(
    # node for training data
    ("Training Data",
     LabeledTrainingDataNode(ft["pca.*"][:-test], digits.target[:-test])),
    # node for validation data
    ("Validation Data",
     LabeledTestDataNode(ft["pca.*"][-test:], digits.target[-test:])),
)

# <codecell>

# create nodes using scikit-learn's support vector classification
# over a logarithmic range of C: range(-1, 3) gives C = 0.1, 1, 10, 100
for exp in range(-1, 3):
    c_value = 10 ** exp
    # use one vs all node for multi-classification
    # note use of node naming: the C value is embedded in the node name
    P.add_node("Machine SVM %s" % c_value,
               SklearnOneVsRestNode(svm.SVC, C=c_value))

# scikit's K-nearest neighbors and K-means algorithms
# NOTE(review): `warn_on_equidistant` is forwarded as given; verify that the
# installed scikit-learn version still accepts this keyword.
P.add_node(
    "Machine KNN",
    SklearnOneVsRestNode(neighbors.KNeighborsClassifier,
                         warn_on_equidistant=False),
)
P.add_node("Machine K-means", EstimatorNode(cluster.KMeans, n_clusters=2))

# mass evaluator
P.add_node("Evaluator", MachineEvaluatorNode())

# <codecell>

# create cross validation
# NOTE(review): the original note says "3-fold", but no fold count is passed
# here -- presumably it is the node's default; confirm.
P.add_node(
    "Cross Validation",
    SklearnCrossValidationNode(accuracy_score, score_weight=-1,
                               verbose=True, top_k=4),
)
P.add_node("Visualize CV", CrossValidationVisualizationNode())

# see scores

# final metric