Example #1
def get_data():
    tickets_file = csv.reader(open('2012-10-09.close.csv'))

    time_format = '%Y-%m-%d %H:%M:%S'
    tickets = []
    times = []
    reporters = []
    subjects = []

    for number, created, changetime, closetime, reporter, summary, status, \
            owner, tkt_type, component, description in tickets_file:
        row = []
        created = dt.datetime.strptime(created, time_format)
        closetime = dt.datetime.strptime(closetime, time_format)
        changetime = dt.datetime.strptime(changetime, time_format)
        time_to_fix = closetime - created

        row.append(float(number))
        row.append(float(time.mktime(created.timetuple())))

        tickets.append(row)
        times.append(total_seconds(time_to_fix))
        reporters.append(reporter)
        subjects.append(summary)

    scaler = preprocessing.Scaler().fit(np.array(tickets))
    tickets = sp.csr_matrix(scaler.transform(tickets))
    tickets = sp.hstack((tickets, TfidfTransformer().fit_transform(
        CountVectorizer().fit_transform(reporters))))
    tickets = sp.hstack((tickets, TfidfTransformer().fit_transform(
        CountVectorizer(ngram_range=(1, 3)).fit_transform(subjects))))

    scaler = preprocessing.Scaler(with_mean=False).fit(tickets)
    tickets = scaler.transform(tickets)
    return tickets, times
Example #2
def train_ensemble_adjective_classifier(train_feature_objects, adjective,
                                        classifiers, scalers,
                                        feature_name_list):
    '''
    Given the trained classifiers for the 5 motions, train the single joined classifier.

    Returns a tuple of (classifier, scaler, weights).

    IMPORTANT: the feature vector of probabilities needs to be created in a specific order:
    'tap', 'squeeze', 'thermal_hold', 'slide', 'slide_fast'
    '''

    # Pull out the features
    probability_vector, probability_labels, object_ids, weights = build_ensemble_feature_vector(
        train_feature_objects, adjective, classifiers, scalers,
        feature_name_list)

    # Create scaler for the features
    scaler = preprocessing.Scaler().fit(probability_vector)

    # Train a single SVM
    svm = train_svm(probability_vector, probability_labels, object_ids)

    return (svm, scaler, weights)
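The fixed motion order called out in the docstring matters because each position in the probability vector has to correspond to the same motion at training and prediction time. A minimal sketch of how such an ordered vector could be assembled, assuming per-motion `classifiers` and `scalers` dicts and a `features_by_motion` dict of raw feature rows (all illustrative names), and that each per-motion classifier exposes predict_proba:

import numpy as np

MOTION_ORDER = ['tap', 'squeeze', 'thermal_hold', 'slide', 'slide_fast']

def build_probability_vector(features_by_motion, classifiers, scalers):
    """Stack per-motion positive-class probabilities in the fixed order."""
    columns = []
    for motion in MOTION_ORDER:
        scaled = scalers[motion].transform(features_by_motion[motion])
        columns.append(classifiers[motion].predict_proba(scaled)[:, 1])
    return np.column_stack(columns)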
Example #3
def get_data():
    tickets_file = csv.reader(open('2012-10-09.close.csv'))

    tickets = []
    times = []
    time_format = '%Y-%m-%d %H:%M:%S'

    for number, created, changetime, closetime, reporter, summary, status, \
            owner, tkt_type, component, description in tickets_file:
        row = []
        created = dt.datetime.strptime(created, time_format)
        closetime = dt.datetime.strptime(closetime, time_format)
        changetime = dt.datetime.strptime(changetime, time_format)
        time_to_fix = closetime - created

        row.append(float(number))
        row.append(float(time.mktime(created.timetuple())))

        tickets.append(row)
        times.append(total_seconds(time_to_fix))

    scaler = preprocessing.Scaler().fit(np.array(tickets))
    tickets = scaler.transform(tickets)

    return tickets, times
Example #4
def full_ensemble_train(train_feature_vector_dict, train_adjective_dict,
                        test_feature_vector_dict, test_adjective_dict):
    """
    """

    # Open text file for storing classification reports
    ensemble_report_file = open("Full_Ensemble_Report.txt", "w")

    all_ensemble_classifiers = dict()

    # For all adjectives
    for adj in train_adjective_dict:

        # Create ensemble scaler
        scaler = preprocessing.Scaler().fit(train_feature_vector_dict[adj])

        # Run SVM
        ensemble_svm, ensemble_proba, ensemble_score, ensemble_report = train_svm(
            train_feature_vector_dict[adj],
            train_adjective_dict[adj],
            test_feature_vector_dict[adj],
            test_adjective_dict[adj],
            scaler,
            cv_flag=False)

        all_ensemble_classifiers[adj] = ensemble_svm

        # Write classification reports into text file
        ensemble_report_file.write('Adjective:  ' + adj + '\n')
        ensemble_report_file.write(ensemble_report)
        ensemble_report_file.write('\n\n')

    ensemble_report_file.close()

    return all_ensemble_classifiers
Example #5
def create_and_save_scaler(data):
    """Create a scaler for the given data and save it to the disk."""
    scaler = preprocessing.Scaler().fit(data)
    create_classifier_dir()
    joblib.dump(
        scaler,
        os.path.join(classifiers_dir, scaler.__class__.__name__ + '.pkl'))
    return scaler
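A scaler saved this way can be reloaded and reused later. A minimal sketch, assuming the same classifiers_dir, the 'Scaler.pkl' filename produced by the dump call above, and that joblib here is sklearn.externals.joblib:

import os
from sklearn.externals import joblib

def load_saved_scaler(classifiers_dir):
    # reload the scaler persisted by create_and_save_scaler()
    return joblib.load(os.path.join(classifiers_dir, 'Scaler.pkl'))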
Example #6
def create_scalers(train_feature_vector_dict):
    """
    Takes in the training feature vector dictionary, generates a scaler for each motion,
    and then returns the scalers
    """
    scaler_dict = dict()
    for motion_name in train_feature_vector_dict:
        scaler_dict[motion_name] = preprocessing.Scaler().fit(
            train_feature_vector_dict[motion_name][0])

    return scaler_dict
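A brief usage sketch of the returned dictionary, scaling each motion's training vectors with its own scaler (same train_feature_vector_dict layout as above):

scaler_dict = create_scalers(train_feature_vector_dict)
for motion_name in train_feature_vector_dict:
    scaled_vectors = scaler_dict[motion_name].transform(
        train_feature_vector_dict[motion_name][0])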
Example #7
def init_classifier(self, filename):
    """Unpickle svm training data, train classifier"""
    with open(filename, 'rb') as f:
        svm_data = pickle.load(f)
    labels = svm_data['labels']
    data = svm_data['data']
    scaler = pps.Scaler().fit(data)
    data_scaled = scaler.transform(data)
    classifier = svm.SVC()
    classifier.fit(data_scaled, labels)
    return (scaler, classifier)
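At prediction time the returned pair is applied together. A minimal usage sketch, where detector stands in for an instance of the class defining init_classifier and new_samples for unseen feature rows (both placeholders, not names from the original code):

scaler, classifier = detector.init_classifier('svm_training_data.pkl')
predictions = classifier.predict(scaler.transform(new_samples))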
Example #8
    def est_gradient_decsent(self):
        iris = datasets.load_iris()
        X = iris.data

        scaler = pre.Scaler()
        X = scaler.fit_transform(X)

        y = self.all_to_sparse(iris.target, max(iris.target) + 1)
        X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(
            np.array(X), np.array(y))
        thetas, costs, val_costs = neur.gradient_decent(
            np.array(X), np.array(y), np.array(X_val), np.array(y_val))
Example #9
def train_weak_classifier_adjective(train_feature_objects, adjective,
                                    feature_dictionary):
    '''
    takes in a dictionary of all features
    
    returns a dictionary of weak classifiers for each feature
    '''

    # specify feature to be extracted
    feature_name_list = [
        "pdc_rise_count", "pdc_area", "pdc_max", "pac_energy", "pac_sc",
        "pac_sv", "pac_ss", "pac_sk", "tac_area", "tdc_exp_fit", "gripper_min",
        "gripper_mean", "transform_distance", "electrode_polyfit"
    ]

    # store weak svms
    svm_motion_store = dict()

    # store scalers
    scaler_motion_store = dict()

    # store classifiers
    classifiers = dict()

    # for each motion (slide, squeeze, etc.)
    for motion in train_feature_objects:
        motion_train_set = train_feature_objects[motion]

        # pull out the features specified as a vector
        train_feature_vector, train_label_dict = utilities.feature_obj_2_feature_vector(
            motion_train_set, feature_name_list)

        # create scaler
        scaler_motion_store[motion] = preprocessing.Scaler().fit(
            train_feature_vector)
        train_feature_vector_scaled = scaler_motion_store[motion].transform(
            train_feature_vector)
        params = {
            'n_estimators': 1000,
            'max_depth': 4,
            'min_samples_split': 1,
            'learn_rate': 0.01,
            'loss': 'deviance'
        }

        #params = {'n_estimators': 1, 'max_depth': 1, 'min_samples_split': 1,'learn_rate': 0.1, 'loss': 'deviance'}
        clf = ensemble.GradientBoostingClassifier(**params)
        clf.fit(train_feature_vector_scaled, train_label_dict[1][adjective])
        #clf = train_gradient_boost(train_feature_vector_scaled, train_label_dict[1][adjective], train_label_dict[0])

        classifiers[motion] = clf

    return (classifiers, scaler_motion_store)
Example #10
def __init__(self, product_cluster_center, category_cluster_center,
             product_cluster_50pc_dist, product_cluster_80pc_dist,
             category_cluster_50pc_dist, category_cluster_80pc_dist,
             scaler_mean, scaler_std):
    '''
    Constructor
    '''
    self.clusters_properties = {"prod": [product_cluster_center, product_cluster_50pc_dist, product_cluster_80pc_dist],
                                "cat": [category_cluster_center, category_cluster_50pc_dist, category_cluster_80pc_dist]}

    scaler = preprocessing.Scaler()
    scaler.mean_ = scaler_mean
    scaler.std_ = scaler_std
    self.scaler = scaler
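Restoring mean_ and std_ by hand rebuilds the same standardization the original scaler learned, since Scaler.transform in this scikit-learn generation computes (x - mean_) / std_. A quick sketch of that identity, under the same assumption this constructor relies on:

import numpy as np
from sklearn import preprocessing

mean = np.array([1.0, 10.0])
std = np.array([2.0, 5.0])

scaler = preprocessing.Scaler()
scaler.mean_ = mean
scaler.std_ = std

x = np.array([[3.0, 20.0]])
print scaler.transform(x)   # [[ 1.  2.]]
print (x - mean) / std      # same values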
Example #11
def mode_pca():
    rows, head = load_rows()
    X, Y, x_head, train, test = rows_to_predictor_response(rows, head)
    
    pca = decomposition.PCA(n_components=13)
    re_pipeline = pipeline.Pipeline([ ('scaler',preprocessing.Scaler()), ('pca',pca) ])
    pc = re_pipeline.fit_transform(X[train])
    
    churn = Y[train,0] > 0.5
    for i in range(1,13):
        plt.title('PCA scores')
        plt.xlabel('pc[0]')
        plt.ylabel('pc['+str(i)+']')
        plt.plot(pc[:,0], pc[:,i], 'go')
        plt.plot(pc[churn,0], pc[churn,i], 'ro')
        plt.savefig("out/churn_scores_0_"+str(i)+".png")
        plt.cla()

    loadings = pca.components_
    for i in range(1,13):
        plt.title('PCA loadings')
        plt.xlabel('pc[0]')
        plt.ylabel('pc['+str(i)+']')
        plt.plot(loadings[0], loadings[i], 'go')
        for j,l in enumerate(loadings[[0,i]].T): 
            plt.annotate(x_head[j],l); #-(j%3)*0.02
            #print x_head[i],l
        plt.savefig("out/churn_loadings_0_"+str(i)+".png")
        plt.cla()
    
    y_scaler = preprocessing.Scaler(with_std=False)
    linre = linear_model.LinearRegression()
    linre.fit(X=pc, y=y_scaler.fit_transform(Y[train,0]))
   
    y_pred = y_scaler.inverse_transform( linre.predict( re_pipeline.transform(X[test]) ) )
    
    plot_rocs('pca','b-',Y[test,0],y_pred)
Example #12
def __prepare(self, features_lists):
    new_features_lists = []
    for fl in features_lists:
        nfl = [float(fl[0]), float(fl[1]),
               float(fl[2]), float(fl[3]),
               float(fl[8]), float(fl[9]),
               float(fl[10]), float(fl[11])]
        new_features_lists.append(nfl)

    scaler = preprocessing.Scaler().fit(new_features_lists)
    print "mean", scaler.mean_
    print "std", scaler.std_

    self.scaler = scaler

    return scaler.transform(new_features_lists), scaler.mean_, scaler.std_
Example #13
def train(docs, query2docs, label_map):
  scaler = preprocessing.Scaler().fit([extractFeatures(doc) for doc in docs]) # scaler.transform will standardize the data
  
  X_train = []
  y_train = []
  for query in query2docs:
    qdocs = query2docs[query]
    features = scaler.transform([extractFeatures(doc) for doc in qdocs])
    for i, j in itertools.permutations(range(len(qdocs)), 2):
      doc_i = qdocs[i]
      doc_j = qdocs[j]
      label = cmp(label_map[doc_i.query][doc_i.url], label_map[doc_j.query][doc_j.url])
      if label != 0:
        X_train.append(vec_difference(features[i], features[j]))
        y_train.append(label)
  model = svm.SVC(kernel='linear', C=3.0).fit(X_train, y_train)
  return scaler, model
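vec_difference is not shown in this snippet; for pairwise ranking it is typically the element-wise difference of the two documents' feature vectors, so a plausible stand-in (an assumption, not the original helper) is:

def vec_difference(a, b):
    # element-wise difference of two equal-length feature vectors
    return [ai - bi for ai, bi in zip(a, b)]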
Example #14
def basic_gradient_descent():
    digits = datasets.load_digits()
    # iris = datasets.load_iris()
    X = digits.images.reshape((digits.images.shape[0], -1))

    scaler = pre.Scaler()
    X = scaler.fit_transform(X)

    y = ut.all_to_sparse(digits.target, max(digits.target) + 1)
    X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(
        np.array(X), np.array(y), "basic_grad_descent_digits")
    X_val = np.vstack([X_val, X_test])
    y_val = np.vstack([y_val, y_test])

    thetas, costs, val_costs = neur.gradient_decent_gen(
        izip(neur.mini_batch_generator(X, 10),
             neur.mini_batch_generator(y, 10)),
        #hidden_layer_sz = 11,
        hidden_layer_sz=100,
        iter=1000,
        wd_coef=0.0,
        learning_rate=0.1,
        momentum_multiplier=0.9,
        rand_init_epsilon=0.012,
        do_early_stopping=True,
        #do_dropout = True,
        #dropout_percentage = 0.8,
        #do_learning_adapt = True,
        X_val=np.array(X_val),
        y_val=np.array(y_val))
    h_x, a = neur.forward_prop(X_test, thetas)
    binary_result = ut.map_to_max_binary_result(h_x)
    print "percentage correct predictions: ", ut.percent_equal(
        binary_result, y_test)
    print "training error:", costs[-1:][0]
    print "validation error:", val_costs[-1:][0]
    print "lowest validation error:", min(val_costs)
    plt.plot(costs, label='cost')
    plt.plot(val_costs, label='val cost')
    plt.legend()
    plt.ylabel('error rate')
    plt.show()
Example #15
def basic_gradient_descent():
    digits = datasets.load_digits()
    # iris = datasets.load_iris()
    X = digits.images.reshape((digits.images.shape[0], -1))

    scaler = pre.Scaler()
    X = scaler.fit_transform(X)

    y = ut.all_to_sparse(digits.target, max(digits.target) + 1)
    X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(
        gpu.as_garray(X), gpu.as_garray(y), "digits")
    X_val = gpu.concatenate([X_val, X_test])
    y_val = gpu.concatenate([y_val, y_test])
    thetas, costs, val_costs = neur.gradient_decent(
        gpu.as_garray(X),
        gpu.as_garray(y),
        #hidden_layer_sz = 11,
        hidden_layer_sz=45,
        iter=500,
        wd_coef=0.0,
        learning_rate=0.25,
        momentum_multiplier=0.9,
        rand_init_epsilon=0.012,
        do_early_stopping=True,
        #do_dropout = True,
        dropout_percentage=0.7,
        #do_learning_adapt = True,
        X_val=gpu.as_garray(X_val),
        y_val=gpu.as_garray(y_val))
    h_x, a = neur.forward_prop(X_test, thetas)
    h_x = map(lambda x: x.as_numpy_array(), h_x)
    print "percentage correct predictions: ", ut.percent_equal(
        ut.map_to_max_binary_result(h_x), y_test.as_numpy_array())
    print "training error:", costs[-1:][0]
    print "validation error:", val_costs[-1:][0]
    print "lowest validation error:", min(val_costs)
    plt.plot(costs, label='cost')
    plt.plot(val_costs, label='val cost')
    plt.legend()
    plt.ylabel('error rate')
Example #16
def main():
    dat = pd.read_table('data/train_v2.csv', sep=',')
    print "reading done, train"
    loss = np.asarray(dat.loss)
    dat = dat.drop(['loss', 'id'], 1)
    dat['new1'] = dat['f528'] - dat['f527']  #golden feature 1
    dat['new2'] = dat['f528'] - dat['f274']  #golden feature 2
    dat = np.asarray(dat.values, dtype=float)
    col_med = stats.nanmedian(dat, axis=0)
    print "calculated medians, train"
    inds = np.where(np.isnan(dat))
    dat[inds] = np.take(col_med, inds[1])
    print "median imputation done, train"
    scaler = preprocessing.Scaler().fit(dat)
    dat = scaler.transform(dat)
    print "scaling done, train"
    labels = (loss > 0).astype(int)
    np.save('data/x_train.npy', dat)
    np.save('data/y_train.npy', labels)
    np.save('data/loss.npy', loss)
    print "trainset done"

    dat = pd.read_table('data/test_v2.csv', sep=',')
    print "reading done, test"
    ids = np.asarray(dat.id)
    dat = dat.drop(['id'], 1)
    dat['new1'] = dat['f528'] - dat['f527']  #golden feature 1
    dat['new2'] = dat['f528'] - dat['f274']  #golden feature 2
    dat = np.asarray(dat.values, dtype=float)
    col_med = stats.nanmedian(dat, axis=0)
    print "calculated medians, test"
    inds = np.where(np.isnan(dat))
    dat[inds] = np.take(col_med, inds[1])
    print "imputation done, test"
    dat = scaler.transform(dat)
    print "scaling done, test"
    np.save('data/x_test.npy', dat)
    np.save('data/ids.npy', ids)
    print "testset done"
Example #17
def basic_iris():
    iris = datasets.load_iris()

    scaler = pre.Scaler()
    X = scaler.fit_transform(iris.data)

    y = ut.all_to_sparse(iris.target, max(iris.target) + 1)
    X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(
        np.array(X), np.array(y), "iris")
    X_val = np.vstack([X_val, X_test])
    y_val = np.vstack([y_val, y_test])
    thetas, costs, val_costs = neur.gradient_decent(
        np.array(X),
        np.array(y),
        #hidden_layer_sz = 11,
        hidden_layer_sz=20,
        iter=8000,
        wd_coef=0.0,
        learning_rate=0.07,
        momentum_multiplier=0.3,
        rand_init_epsilon=0.12,
        do_early_stopping=True,
        #do_dropout = True,
        dropout_percentage=0.9,
        do_learning_adapt=True,
        X_val=np.array(X_val),
        y_val=np.array(y_val))
    h_x, a = neur.forward_prop(X_test, thetas)
    print "percentage correct predictions: ", ut.percent_equal(
        ut.map_to_max_binary_result(h_x), y_test)
    print "training error:", costs[-1:][0]
    print "validation error:", val_costs[-1:][0]
    print "lowest validation error:", min(val_costs)
    plt.plot(costs, label='cost')
    plt.plot(val_costs, label='val cost')
    plt.legend()
    plt.ylabel('error rate')
    plt.show()
Example #18
def train_weak_classifier_motion(motion_train_set, adjective,
                                 feature_dictionary):
    '''
    Takes the motion_train_set and trains an SVM for each feature set
    listed in feature_dictionary.

    Returns a tuple of (svm_store, scaler_store), both keyed by feature.
    '''

    # Store SVM for each feature
    svm_store = dict()
    scaler_store = dict()

    import pdb
    pdb.set_trace()
    # For each feature set (pdc, pac, etc.)
    for feature in feature_dictionary:

        # Pull out the list of features
        feature_list = feature_dictionary[feature]

        # Pull out the features specified as a vector
        train_feature_vector, train_label_dict = utilities.feature_obj_2_feature_vector(
            motion_train_set, feature_list)

        # Create scaler
        scaler_store[feature] = preprocessing.Scaler().fit(
            train_feature_vector)
        train_feature_vector_scaled = scaler_store[feature].transform(
            train_feature_vector)

        # Train the SVM
        svm_store[feature] = train_svm(train_feature_vector_scaled,
                                       train_label_dict[1][adjective],
                                       train_label_dict[0])

    return (svm_store, scaler_store)
Example #19
def train_scaler(self, data):
    '''Work out the mean and variance of the samples'''
    from sklearn import preprocessing
    scaler = preprocessing.Scaler().fit(data.images)
    self.transform = scaler.transform
Example #20
def basic_gradient_descent():
    data = np.genfromtxt('./stack_data_wide_val.csv', delimiter=',')
    X = data[:, :-1]
    y = data[:, -1:]

    scaler = pre.Scaler()
    X_val = scaler.fit_transform(X)

    y_val = np.array(map(lambda x: [0, 1] if x == 0 else [1, 0], y.flatten()))

    #X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(np.array(X), np.array(y), "basic_kaggle_data", True)
    #X_val = np.vstack([X_val, X_test])
    #y_val = np.vstack([y_val, y_test])

    hid_layer = 300

    mg = neur.split_xy(neur.mini_batch_gen_from_file(
        'stack_data_wide_train.csv', 40),
                       -1,
                       apply_x=lambda x: scaler.transform(x.astype(float)),
                       apply_y=lambda y: np.array(
                           map(lambda x: [0, 1]
                               if x == 0 else [1, 0], y.flatten())))

    #bm = rbm.RBM(13408, hid_layer)
    #costs = bm.optimize(neur.just_x(mg), 1000, 0.0007, val_set = X_val)

    #first_layer_weights = np.hstack([np.zeros((hid_layer,1)), bm.weights])
    #thetas  = neur.create_initial_thetas([64, hid_layer, 2], 0.12)
    #thetas[0] =  first_layer_weights

    # best so far minibatchsize 40 hidden layer 100 learning rate 0.01

    thetas, costs, val_costs = neur.gradient_decent_gen(
        mg,
        #hidden_layer_sz = 11,
        hidden_layer_sz=hid_layer,
        iter=20000,
        wd_coef=0.0,
        learning_rate=0.01,
        #thetas = thetas,
        momentum_multiplier=0.9,
        rand_init_epsilon=0.0012,
        do_early_stopping=True,
        #do_dropout = True,
        #dropout_percentage = 0.5,
        #do_learning_adapt = True,
        X_val=np.array(X_val),
        y_val=np.array(y_val))
    h_x, a = neur.forward_prop(X_val, thetas)
    binary_result = ut.map_to_max_binary_result(h_x)
    print "percentage correct predictions: ", ut.percent_equal(
        binary_result, y_val)
    print "training error:", costs[-1:][0]
    print "validation error:", val_costs[-1:][0]
    print "lowest validation error:", min(val_costs)
    plt.plot(costs, label='cost')
    plt.plot(val_costs, label='val cost')
    plt.legend()
    plt.ylabel('error rate')
    plt.show()
Example #21
def run_stack(SEED):

	model = "Long-Lat KNN5 - 50 Features"

	print "Running GB, RF, ET stack."

	trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_50.csv", skipFirstLine = False, split = "\t")
	test = csv_io.read_data("PreProcessData/test_PreProcess4_50.csv", skipFirstLine = False, split = "\t")
	weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False)

	
	#random.seed(SEED)
	#random.shuffle(trainBase)
	
	avg = 0
	NumFolds = 5 # 5 is good, but 10 yields a better mean since outliers are less significant. (note, predictions are less reliable when using 10).


	predicted_list = []
	bootstrapLists = []

	# use this for quick runs.
	# note RF with 150 crashes on 30 features
	# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	# GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
	# RandomForestRegressor(n_estimators=100, n_jobs=1),
	#RandomForestRegressor(n_estimators=75, n_jobs=1),
	# clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1),
		# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False)
		# ]	
	#knn 5 at 3.45
	#knn 15 at 3.31
	#knn 25 at 3.30
	#knn 40 at 3.31
	# KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
	# LinearRegression at 3.77
	# Ridge at 3.77
	# SGD 4.23
	#Gauss at 13
	# LinearRegression(fit_intercept=True, normalize=False, copy_X=True),
	# Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001),
	# SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False),
	# GaussianNB()
	# clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),
		 # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2)	
		# ]
		
	# GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ******************
	# clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166),
			# GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166)
			# ]	
			
	# about 1 hour run time, and 3.10 score.		
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.05
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166)
	# about 2 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166)
	# about 4 hours run time at 3.06
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166)	
	
	clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166)
			]		
	
	
		# use this for quick runs.
	# clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80, random_state=441),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80, random_state=331),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80, random_state=221),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120, random_state=91),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120, random_state=81),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120, random_state=71),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160, random_state=61),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160, random_state=51),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160, random_state=41),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200, random_state=31),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200, random_state=21),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200, random_state=10),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200, random_state=19),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240, random_state=18),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240, random_state=17),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240, random_state=16),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280, random_state=15),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280, random_state=14),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280, random_state=13),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320, random_state=12),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320, random_state=11),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'),
			# RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5)]	
	
	
	
	# use this for quick runs.  reduced estimators to 50
	# clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
        # gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True,
        # tol=0.001, verbose=False)
			# ]	
			
	#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
	#ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1)
	
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7)]
			
			
	# full algorithm stack.
	# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8),
			# GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			# GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200),
			# GradientBoostingClassifier(lesarn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280),			
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320),
			# GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4),
			# RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5),
			# RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6),
			# RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7),
			# RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)]
	

	
	print "Data size: ", len(trainBase), len(test)
	dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
	dataset_blend_test = np.zeros((len(test), len(clfs)))
	

	trainNew = []
	trainTestNew = []
	testNew = []
	trainNewSelect = []
	trainTestNewSelect = []
	testNewSelect = []
	
	print "Scaling"
	targetPre = [x[0] for x in trainBase]
	trainPre = [x[1:] for x in trainBase]
	testPre = [x[0:] for x in test]
	#print trainPre[0]
	scaler = preprocessing.Scaler().fit(trainPre)
	trainScaled = scaler.transform(trainPre)
	testScaled = scaler.transform(testPre)	

	#print scaler.mean_
	#print scaler.std_
	print "Begin Training"
	
	
	for ExecutionIndex, clf in enumerate(clfs):
		print str(clf)
		avg = 0
	
		predicted_list = []
			
		dataset_blend_test_set = np.zeros((len(test), NumFolds))

		
		foldCount = 0

		
		#Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))]
		Folds = cross_validation.KFold(len(trainBase), k=NumFolds, indices=True)
		for train_index, test_index in Folds:

			#trainBaseTemp = [trainBase[i] for i in train_index]
			#target = [x[0] for x in trainBaseTemp]
			#train = [x[1:] for x in trainBaseTemp]
	
			#testBaseTemp = [trainBase[i] for i in test_index]
			#targetTest = [x[0] for x in testBaseTemp]
			#trainTest = [x[1:] for x in testBaseTemp]
		
			#test = [x[0:] for x in test]
	
			target = [targetPre[i] for i in train_index]
			train = [trainScaled[i] for i in train_index]
			
			targetTest = [targetPre[i] for i in test_index]	
			trainTest = [trainScaled[i] for i in test_index]	
	
			print
			print "Iteration: ", foldCount
			print "LEN: ", len(train), len(target)
			
			clf.fit(train, target)
			prob = clf.predict(trainTest) 
			
			dataset_blend_train[test_index, ExecutionIndex] = prob



	
			probSum = 0
			weightSum = 0
			# totalOffByHalf = 0
			# totalPositive = 0
			# totalPositiveOffByHalf = 0
			# totalPositivePredictions = 0
			
			for i in range(0, len(prob)):
				probX = prob[i]

				probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX)
				weightSum += weights[test_index[i]][0] 
				#print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX
				
				# log loss cal
				#probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX)
				# if ( math.fabs(probX - int(targetTest[i])) > 0.5 ):
					# totalOffByHalf = totalOffByHalf + 1		
			
				# if ( int(targetTest[i]) == 1 ):
					# totalPositive = totalPositive + 1
				# if ( int(targetTest[i]) == 1 and probX < 0.5):
					# totalPositiveOffByHalf = totalPositiveOffByHalf + 1
				# if (probX > 0.5):
					# totalPositivePredictions = totalPositivePredictions + 1			
			
			# print
			# print "Stats:"
			# print "Total Off By > 0.5 ", totalOffByHalf
			# print "Total Positive ", totalPositive
			# print "Total Positive Off By Half ", totalPositiveOffByHalf
			# print "Total Positive Predictions ", totalPositivePredictions
			#print -probSum/len(prob)
			print "Score: ", probSum/weightSum
 
			avg += 	(probSum/weightSum)/NumFolds

			predicted_probs = clf.predict(testScaled) 	
			#predicted_list.append([x[1] for x in predicted_probs])	
			dataset_blend_test_set[:, foldCount] = predicted_probs #[0]
		
				
			foldCount = foldCount + 1
		
		dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1)  
		
		#print "Saving NP"
		#np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set)
		#np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) )
		#np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test)
		#print "Done Saving NP"
		
		now = datetime.datetime.now()
		#print dataset_blend_test_set.mean(1) 
		csv_io.write_delimited_file_single("../predictions_50/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
		
		csv_io.write_delimited_file_single("../predictions_50/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )		
		
		csv_io.write_delimited_file("../predictions_40/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",")
		
		
		print now
		print "------------------------Average: ", avg

		#np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

	return dataset_blend_train, dataset_blend_test
Example #22
import pickle
import numpy as np
from scipy import interp
import pylab as pl

from sklearn import preprocessing as pps, svm
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold, LeaveOneOut

with open('../data/svm_data.pkl', 'rb') as f:
    svm_data = pickle.load(f)
labels = svm_data['labels']
data = svm_data['data']

scaler = pps.Scaler().fit(data)
print "Mean: ", scaler.mean_
print "Std: ", scaler.std_
data_scaled = scaler.transform(data)

classifier = svm.SVC(probability=True)
classifier.fit(data_scaled, labels)

#print "Support Vectors: \r\n", classifier.support_vectors_
print "SV's per class: \r\n", classifier.n_support_


###############################################################################
## Code below modified from http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html#example-plot-roc-crossval-py
X, y = data_scaled, np.array(labels)
n_samples, n_features = X.shape
Example #23
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--spec', help='training spec yaml file')
    parser.add_argument('--viz', action='store_true', help='just visualize')
    parser.add_argument('--no_normalize',
                        action='store_false',
                        help='do not normalize data by stdev')
    args = parser.parse_args()

    # load spec and feature value matrix
    spec = None
    with open(args.spec, 'r') as specf:
        spec = yaml.load(specf)
    feature_data = {}
    for feature_name in spec['features']:
        filename = osp.join(osp.dirname(args.spec),
                            spec['features'][feature_name]['data'])
        with open(filename, 'r') as feature_data_file:
            feature_data[feature_name] = np.load(feature_data_file)['mat']

    feature_names = sorted(feature_data.keys())

    # create data points with labels
    LABEL_SAME = 1
    LABEL_DIFFERENT = -1
    # data has rows [feature_0 feature_1 label]
    npts = feature_data.values()[0].size
    data = np.zeros((npts, len(feature_names) + 1))
    for fnum, feature_name in enumerate(feature_data):
        data[:, fnum] = feature_data[feature_name].reshape((-1, 1))[:, 0]
    labels = np.empty_like(feature_data.values()[0])
    labels[:, :] = LABEL_DIFFERENT
    start = 0
    for i, cls in enumerate(sorted(spec['classes'].keys())):
        num_in_cls = len(spec['classes'][cls]['examples'])
        labels[start:start + num_in_cls, start:start + num_in_cls] = LABEL_SAME
        start += num_in_cls
    data[:, -1] = labels.reshape((-1, 1))[:, 0]

    if args.viz:
        import pylab as pl
        pl.scatter(data[:, 0],
                   data[:, 1],
                   s=30,
                   c=data[:, 2],
                   cmap=pl.cm.Paired)
        pl.xlabel(feature_data.keys()[0])
        pl.ylabel(feature_data.keys()[1])
        pl.show()

    from sklearn import neighbors, datasets, linear_model, svm, pipeline, preprocessing

    classifiers = dict(
        knn=neighbors.KNeighborsClassifier(),
        logistic=linear_model.LogisticRegression(C=1e5),
        svm=svm.SVC(C=1e5, kernel='linear'),
    )

    X = data[:, 0:2]
    Y = data[:, 2]

    trained = {}
    for name, clf in classifiers.iteritems():
        pclf = pipeline.Pipeline([('scaler', preprocessing.Scaler()),
                                  ('classifier', clf)])
        pclf.fit(X, Y)
        trained[name] = pclf

    filename = osp.join(osp.dirname(args.spec), 'classifiers.pkl')
    print 'Writing classifiers to', filename
    with open(filename, 'wb') as f:
        import cPickle
        cPickle.dump(trained, f)

    if args.viz:
        h = .01  # step size in the mesh
        import pylab as pl
        fignum = 1
        for name, pclf in trained.iteritems():
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, x_max]x[y_min, y_max].
            x_min, x_max = X[:, 0].min(), X[:, 0].max()
            y_min, y_max = X[:, 1].min(), X[:, 1].max()
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                                 np.arange(y_min, y_max, h))
            Z = pclf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            #pl.figure(fignum, figsize=(6, 6))
            pl.figure(fignum)
            pl.pcolormesh(xx, yy, Z, cmap=pl.cm.Paired)

            # Plot also the training points
            pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)
            pl.title(name)
            pl.xlabel(feature_data.keys()[0])
            pl.ylabel(feature_data.keys()[1])
            fignum += 1
        pl.show()
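The pickled classifiers written above can be reloaded elsewhere and used directly, since each entry is a Pipeline that scales and classifies in one call. A minimal sketch, assuming the classifiers.pkl file produced by this script and a two-column feature row as input:

import cPickle

with open('classifiers.pkl', 'rb') as f:
    trained = cPickle.load(f)

# each value is a Pipeline of (Scaler, classifier)
print trained['svm'].predict([[0.1, 0.2]])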
Example #24
# Get data
data = fetch_sdss_sspp(cleaned=True)
X = np.vstack([data['FeH'], data['alphFe']]).T

# truncate dataset for speed
X = X[::5]

#------------------------------------------------------------
# Compute a 2D histogram  of the input
H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], 50)

#------------------------------------------------------------
# Compute the KMeans clustering
n_clusters = 4

scaler = preprocessing.Scaler()
clf = KMeans(n_clusters)
clf.fit(scaler.fit_transform(X))

#------------------------------------------------------------
# Visualize the results
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot()

# plot density
ax = plt.axes()
ax.imshow(H.T,
          origin='lower',
          interpolation='nearest',
          aspect='auto',
          extent=[FeH_bins[0], FeH_bins[-1], alphFe_bins[0], alphFe_bins[-1]],
Example #25
def PreProcess4(N_Features):

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                 skipFirstLine=False,
                                 split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess3.csv",
                            skipFirstLine=False,
                            split="\t")
    shutil.copy2("PreProcessData/DataClassList3.csv",
                 "PreProcessData/DataClassList4.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data("PreProcessData/DataClassList4.csv",
                                     False)

    print "Data len: ", len(train[0])
    print "DataClassList len: ", len(DataClassList)
    #return

    # this seems about optimal, but has not been tuned on latest improvements.
    NumFeatures = N_Features
    # NOTE going from 30 to 20 features on KNN5 set has almost no effect.  Down to 15 is significant loss.
    # for GBM at 6 and 400 30 is 3.01 and 30 3.05.

    print "Scaling"
    term = 5000  #  scaler has memory errors between 5000 and 10000
    #term = len(trainBase)
    targetPre = [x[0] for x in trainBase][0:term]
    trainPre = [x[1:] for x in trainBase][0:term]
    #testPre = [x[0:] for x in test][0:term]
    targetPre = target[0:term]
    #print trainPre[term - 1]
    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)

    #clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True)
    clf = GradientBoostingRegressor(loss='ls',
                                    learn_rate=0.05,
                                    subsample=0.5,
                                    max_depth=6,
                                    n_estimators=400,
                                    random_state=166,
                                    min_samples_leaf=30)

    print "Training"

    clf.fit(trainScaled, targetPre)

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.feature_importances_

    DataClassListNew = []
    for DataIndex, DataClass in enumerate(DataClassList):
        print DataClass[0], importances[DataIndex]
        DataClassListNew.append([DataClass[0], importances[DataIndex]])

    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_" + str(NumFeatures) +
        ".csv", DataClassListNew)

    DataClassListNew_temp = sorted(DataClassListNew,
                                   key=operator.itemgetter(1),
                                   reverse=True)
    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_sorted_" + str(NumFeatures) +
        ".csv", DataClassListNew_temp)

    importancesTemp = sorted(importances, reverse=True)
    print len(importancesTemp), "importances"

    if (len(importancesTemp) > NumFeatures):
        threshold = importancesTemp[NumFeatures]

        print "Importance threshold: ", threshold

        rowIndex = 0
        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (impIndex == 0):
                    newRow.append(target[rowIndex])
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)
            rowIndex += 1

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess4_" +
                                str(NumFeatures) + ".csv",
                                trainNew,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess4_" +
                                str(NumFeatures) + ".csv",
                                testNew,
                                delimiter="\t")
Example #26
def PreProcess4():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv",
                                 skipFirstLine=False,
                                 split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess2.csv",
                            skipFirstLine=False,
                            split="\t")
    shutil.copy2("PreProcessData/DataClassList2.csv",
                 "PreProcessData/DataClassList4Base.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data("PreProcessData/DataClassList4Base.csv",
                                     False)

    print "Data len: ", len(train[0])
    print "DataClassList len: ", len(DataClassList)
    #return

    # this seems about optimal, but has not been tuned on latest improvements.
    NumFeatures = 40
    # NOTE going from 30 to 20 features on KNN5 set has almost no effect.  Down to 15 is significant loss.
    # for GBM at 6 and 400 30 is 3.01 and 30 3.05.

    print "Scaling"
    targetPre = [x[0] for x in trainBase]
    trainPre = [x[1:] for x in trainBase]
    testPre = [x[0:] for x in test]
    #print trainPre[0]
    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)

    #clf = RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True)
    clf = RandomForestRegressor(n_estimators=25,
                                n_jobs=1,
                                compute_importances=True)
    #clf = ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True,compute_importances=True)

    print "Training"
    # producing memory errors, probably too much data.
    # recommend to use linear lasso.
    #est = LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
    #selector = RFE(est, 20, step=10)
    #selector = selector.fit(trainScaled, target)
    #print selector.support_
    #print selector.ranking_
    #return

    #trainPost = selector.transform(trainPre)
    #testPost = selector.transform(testPre)

    clf.fit(trainScaled, target)

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.feature_importances_

    DataClassListNew = []
    for DataIndex, DataClass in enumerate(DataClassList):
        print DataClass[0], importances[DataIndex]
        DataClassListNew.append([DataClass[0], importances[DataIndex]])

    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_Base.csv", DataClassListNew)

    DataClassListNew_temp = sorted(DataClassListNew,
                                   key=operator.itemgetter(1),
                                   reverse=True)
    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_sorted_Base.csv",
        DataClassListNew_temp)

    importancesTemp = sorted(importances, reverse=True)
    print len(importancesTemp), "importances"

    if (len(importancesTemp) > NumFeatures):
        threshold = importancesTemp[NumFeatures]

        print "Importance threshold: ", threshold

        rowIndex = 0
        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (impIndex == 0):
                    newRow.append(target[rowIndex])
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)
            rowIndex += 1

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess4_Base.csv",
                                trainNew,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess4_Base.csv",
                                testNew,
                                delimiter="\t")
Example #27
def PreProcess4():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                 skipFirstLine=False,
                                 split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess3.csv",
                            skipFirstLine=False,
                            split="\t")
    shutil.copy2("PreProcessData/DataClassList3.csv",
                 "PreProcessData/DataClassList5_PCA.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data("PreProcessData/DataClassList5_PCA.csv",
                                     False)

    print "Data len: ", len(train[0])
    print "DataClassList len: ", len(DataClassList)

    NumFeatures = 40

    print "Scaling"
    targetPre = [x[0] for x in trainBase]
    trainPre = [x[1:] for x in trainBase]
    testPre = [x[0:] for x in test]
    #print trainPre[0]
    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)
    testScaled = scaler.transform(testPre)

    #clf = RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True)
    #clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True)
    #clf = ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True,compute_importances=True)

    clf = PCA(n_components=NumFeatures)

    print "Training"
    # producing memory errors, probably too much data.
    # recommend to use linear lasso.
    #est = LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
    #selector = RFE(est, 20, step=10)
    #selector = selector.fit(trainScaled, target)
    #print selector.support_
    #print selector.ranking_
    #return

    #trainPost = selector.transform(trainPre)
    #testPost = selector.transform(testPre)

    clf.fit(trainScaled, target)

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.explained_variance_ratio_

    #DataClassListNew = []
    #for DataIndex, DataClass in enumerate(DataClassList):
    #	print DataClass[0], importances[DataIndex];
    #	DataClassListNew.append([DataClass[0], importances[DataIndex]])

    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_PCA.csv", importances)

    DataClassListNew_temp = sorted(importances, reverse=True)
    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_sorted_PCA.csv",
        DataClassListNew_temp)

    importancesTemp = sorted(importances, reverse=True)
    print len(importancesTemp), "importances"

    trainNew = clf.transform(trainScaled)
    testNew = clf.transform(testScaled)

    #if ( len(importancesTemp) > NumFeatures):
    #	threshold = importancesTemp[NumFeatures]

    #	print "Importance threshold: ", threshold

    #	rowIndex = 0
    #	for row in train:
    #		newRow = []
    #		for impIndex, importance in enumerate(importances):
    #			if ( impIndex == 0):
    #				newRow.append(target[rowIndex])
    #			if ( importance > threshold ):
    #				newRow.append(row[impIndex])
    #		trainNew.append(newRow)
    #		rowIndex += 1

    #	for row in test:
    #		newRow = []
    #		for impIndex, importance in enumerate(importances):
    #			if ( importance > threshold ) :
    #				newRow.append(row[impIndex])
    #		testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess5_PCA.csv",
                                trainNew,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess5_PCA.csv",
                                testNew,
                                delimiter="\t")
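A minimal sketch of the same scale-then-PCA reduction with current scikit-learn names (StandardScaler replaced the deprecated preprocessing.Scaler); trainPre, testPre and NumFeatures = 40 are assumed to be as built above:

from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the columns, then project onto the top 40 principal components.
pca_pipeline = make_pipeline(StandardScaler(), PCA(n_components=40))
trainNew = pca_pipeline.fit_transform(trainPre)   # PCA ignores the target
testNew = pca_pipeline.transform(testPre)
print(pca_pipeline.named_steps["pca"].explained_variance_ratio_)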
Example #28
pp = pprint.PrettyPrinter(indent=4)

y_labels = open("y.txt").read().split("\n")[0:-1]
x_labels = open("x_real.txt").read().split("\n")[0:-1]

data_x = open("DATA_X", "r")
X = cPickle.load(data_x)
data_x.close()

data_y = open("DATA_Y", "r")
Y = cPickle.load(data_y)
data_y.close()

print "Scaling"

scaler = preprocessing.Scaler().fit(X)
X = scaler.transform(X)

print "Normalising"

normaliser = preprocessing.Normalizer().fit(X)
X = normaliser.transform(X)

print "Training"

DELTAS = []

imax = COL_LIMIT if COL_LIMIT < Y.shape[1] else Y.shape[1]

for i in range(imax):
    i = 53
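A minimal sketch of the scaling and row-normalising steps above with the current class name (StandardScaler replaced the deprecated preprocessing.Scaler; Normalizer is unchanged); X is assumed to be the unpickled feature matrix:

from sklearn.preprocessing import Normalizer, StandardScaler

X = StandardScaler().fit_transform(X)   # zero mean, unit variance per column
X = Normalizer().fit_transform(X)       # then unit L2 norm per row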
Example #29
def PreProcess5():

    # Note: 275 represents too much data, and the scaler fails with an exception.

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess5_250.csv",
                                 skipFirstLine=False,
                                 split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess5_250.csv",
                            skipFirstLine=False,
                            split="\t")
    #shutil.copy2("PreProcessData/DataClassList5.csv", "PreProcessData/DataClassList6.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data(
        "PreProcessData/DataClassList_Importances_250.csv", False)

    print "Data len: ", len(train[0])
    print "DataClassList len: ", len(DataClassList)
    #return

    # this seems about optimal, but has not been tuned on latest improvements.
    NumFeatures = 40
    # NOTE: going from 30 to 20 features on the KNN5 set has almost no effect; down to 15 is a significant loss.
    # For GBM at max_depth=6 and n_estimators=400, 30 features scores 3.01 and 20 scores 3.05.

    print "Scaling"
    targetPre = [x[0] for x in trainBase][0:10000]
    print "Scaling1"
    trainPre = [x[1:] for x in trainBase][0:10000]
    #testPre = [x[0:] for x in test]
    print "Scaling2"
    scaler = preprocessing.Scaler().fit(trainPre)
    print "Scaling3"
    trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)

    #clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True)

    #gc.collect()

    print "Prep Classes"

    # prep for usage below...
    DataClassListTemp = []
    for DataIndex, DataClass in enumerate(DataClassList):
        DataClassListTemp.append([DataClass[0], 0])

    DataClassList = DataClassListTemp

    reduceBy = 5
    totalFeatures = len(trainPre[0])

    trainNew = []
    testNew = []

    print "Processing"
    while (totalFeatures > NumFeatures):

        if (totalFeatures - NumFeatures < 40):
            reduceBy = 3
        if (totalFeatures - NumFeatures < 20):
            reduceBy = 2
        if (totalFeatures - NumFeatures < 10):
            reduceBy = 1

        if (totalFeatures - NumFeatures < reduceBy):
            reduceBy = totalFeatures - NumFeatures
            print "Reduce Features: ", reduceBy

        print "Training"
        clf = GradientBoostingRegressor(loss='ls',
                                        learn_rate=0.05,
                                        subsample=0.5,
                                        max_depth=6,
                                        n_estimators=400,
                                        random_state=166,
                                        min_samples_leaf=30)
        clf.fit(trainScaled, targetPre)

        print "Computing Importances"
        importances = clf.feature_importances_
        #print importances
        importancesSorted = sorted(importances, reverse=True)
        #print importancesSorted
        threshold = importancesSorted[len(importancesSorted) - reduceBy]
        print threshold
        #trainScaled = clf.transform(trainScaled, threshold) # only exists in RF

        trainScaledNew = []
        for row in trainScaled:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainScaledNew.append(newRow)

        trainScaled = trainScaledNew

        print "Cols:", len(trainScaled)
        print "Rows:", len(trainScaled[0])

        totalFeatures = totalFeatures - reduceBy
        print "Total Features:", totalFeatures

        trainNew = []
        testNew = []

        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)

        train = trainNew

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            testNew.append(newRow)

        test = testNew

        print "Train Cols:", len(train)
        print "Train Rows:", len(train[0])

        print "Test Cols:", len(test)
        print "Test Rows:", len(test[0])

        DataClassListNew = []
        for Index, importance in enumerate(importances):
            if (importance > threshold):
                print DataClassList[Index][0], importance
                DataClassListNew.append([DataClassList[Index][0], importance])

        DataClassList = DataClassListNew

        print "Data Transform Complete"

    # final steps, save data classes in new set

    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_RFE2_" + str(NumFeatures) +
        ".csv", DataClassListNew)

    DataClassListNew_temp = sorted(DataClassListNew,
                                   key=operator.itemgetter(1),
                                   reverse=True)
    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_RFE2_sorted_" +
        str(NumFeatures) + ".csv", DataClassListNew_temp)

    # prepend the target on each row.
    trainFinal = []

    rowIndex = 0
    for row in train:
        newRow = []
        for Index, val in enumerate(row):
            if (Index == 0):
                newRow.append(target[rowIndex])
            newRow.append(val)
        trainFinal.append(newRow)
        rowIndex += 1

    csv_io.write_delimited_file("PreProcessData/training_PreProcess6_RFE2_" +
                                str(NumFeatures) + ".csv",
                                trainFinal,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess6_RFE2_" +
                                str(NumFeatures) + ".csv",
                                testNew,
                                delimiter="\t")
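The loop above is hand-rolled backward feature elimination: re-fit a GBM, drop the lowest-importance columns, and repeat until 40 features remain. A minimal sketch of the same idea with scikit-learn's RFE wrapper, under these assumptions: trainScaled and targetPre are as built above, test still holds the full-width rows read at the top of the function, and learning_rate is the current spelling of the old learn_rate parameter:

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE

estimator = GradientBoostingRegressor(learning_rate=0.05, subsample=0.5,
                                      max_depth=6, n_estimators=400,
                                      random_state=166, min_samples_leaf=30)
# Drop 5 features per round, ranked by feature_importances_, until 40 remain.
selector = RFE(estimator, n_features_to_select=40, step=5)
trainReduced = selector.fit_transform(trainScaled, targetPre)
testReduced = selector.transform(test)   # drops the same columns from the test rows
print(selector.support_)                 # boolean mask of the surviving columns

Unlike the loop above, RFE removes a constant number of columns per round (never overshooting the target), but it stops at the same final feature count.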
Example #30
def PreProcess4():

    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                 skipFirstLine=False,
                                 split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess3.csv",
                            skipFirstLine=False,
                            split="\t")
    shutil.copy2("PreProcessData/DataClassList3.csv",
                 "PreProcessData/DataClassList4.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data("PreProcessData/DataClassList4.csv",
                                     False)

    print "Data len: ", len(train[0])
    print "DataClassList len: ", len(DataClassList)
    #return

    # this seems about optimal, but has not been tuned on latest improvements.
    NumFeatures = 40
    # NOTE: going from 30 to 20 features on the KNN5 set has almost no effect; down to 15 is a significant loss.
    # For GBM at max_depth=6 and n_estimators=400, 30 features scores 3.01 and 20 scores 3.05.

    print "Scaling"
    targetPre = [x[0] for x in trainBase]
    trainPre = [x[1:] for x in trainBase]
    testPre = [x[0:] for x in test]
    #print trainPre[0]
    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)

    clf = RandomForestRegressor(n_estimators=25,
                                n_jobs=1,
                                compute_importances=True)
    reduceBy = 5

    clf.fit(trainScaled, target)

    print "Computing Importances"
    importances = clf.feature_importances_
    print importances
    importancesSorted = sorted(importances, reverse=True)
    print importancesSorted
    threshold = importancesSorted[len(importancesSorted) - reduceBy]
    print threshold
    trainScaled = clf.transform(trainScaled, threshold)

    # NOTE: everything below this early return is unreachable leftover from an
    # earlier RFE-based variant; 'selector' is never defined in this function.
    return

    trainNew = []
    testNew = []

    DataClassListNew = []
    for DataIndex, DataClass in enumerate(DataClassList):
        print DataClass[0], selector.ranking_[DataIndex]
        DataClassListNew.append([DataClass[0], selector.ranking_[DataIndex]])

    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_RFE_" + str(NumFeatures) +
        ".csv", DataClassListNew)

    DataClassListNew_temp = sorted(
        DataClassListNew, key=operator.itemgetter(1))  # , reverse=True
    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_RFE_sorted_" +
        str(NumFeatures) + ".csv", DataClassListNew_temp)

    #importancesTemp = sorted(importances, reverse=True)
    #print len(importancesTemp), "importances"

    if (len(selector.ranking_) > NumFeatures):
        #threshold = importancesTemp[NumFeatures]

        threshold = NumFeatures
        print "Importance threshold: ", threshold

        rowIndex = 0
        for row in train:
            newRow = []
            for impIndex, importance in enumerate(selector.ranking_):
                if (impIndex == 0):
                    newRow.append(target[rowIndex])
                if (importance < threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)
            rowIndex += 1

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(selector.ranking_):
                if (importance < threshold):
                    newRow.append(row[impIndex])
            testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess4_RFE_" +
                                str(NumFeatures) + ".csv",
                                trainNew,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess4_RFE_" +
                                str(NumFeatures) + ".csv",
                                testNew,
                                delimiter="\t")
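A minimal sketch of the importance-threshold cut with SelectFromModel, which in current scikit-learn replaces the removed forest.transform(X, threshold) call (compute_importances is no longer needed, as importances are always available); trainScaled, target and reduceBy are assumed to be as defined above:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

forest = RandomForestRegressor(n_estimators=25, n_jobs=1)
forest.fit(trainScaled, target)

# The reduceBy-th smallest importance, matching the cutoff computed above.
cutoff = sorted(forest.feature_importances_, reverse=True)[-reduceBy]
# SelectFromModel keeps importances >= threshold, so step just past the cutoff
# to mimic the strict "importance > threshold" comparison used above.
selector = SelectFromModel(forest, threshold=np.nextafter(cutoff, np.inf), prefit=True)
trainReduced = selector.transform(trainScaled)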