def check_transformer_pickle(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # fit
    if name in CROSS_DECOMPOSITION:
        random_state = np.random.RandomState(seed=12345)
        y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)
    pickled_transformer = pickle.dumps(transformer)
    unpickled_transformer = pickle.loads(pickled_transformer)
    pickled_X_pred = unpickled_transformer.transform(X)

    assert_array_almost_equal(pickled_X_pred, X_pred)
def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))
def loadData(path="../data/",k=5,log='add',pca_n=0,SEED=34):
	from pandas import DataFrame, read_csv
	from numpy import log as ln
	from sklearn.cross_validation import KFold
	from sklearn.preprocessing import LabelEncoder
	from sklearn.preprocessing import StandardScaler
	train = read_csv(path+"train.csv")
	test = read_csv(path+"test.csv")
	id = test.id
	target = train.target
	encoder = LabelEncoder()
	target_nnet = encoder.fit_transform(target).astype('int32')
	feat_names = [x for x in train.columns if x.startswith('feat')]
	train = train[feat_names].astype(float)
	test = test[feat_names]
	if log == 'add':
		for v in train.columns:
			train[v+'_log'] = ln(train[v]+1)
			test[v+'_log'] = ln(test[v]+1)
	elif log == 'replace':
		for v in train.columns:
			train[v] = ln(train[v]+1)
			test[v] = ln(test[v]+1)      
	if pca_n > 0:
		from sklearn.decomposition import PCA
		pca = PCA(pca_n)
		train = pca.fit_transform(train)
		test = pca.transform(test)
	scaler = StandardScaler()
	scaler.fit(train)
	train = DataFrame(scaler.transform(train),columns=['feat_'+str(x) for x in range(train.shape[1])])
	test = DataFrame(scaler.transform(test),columns=['feat_'+str(x) for x in range(train.shape[1])])
	cv = KFold(len(train), n_folds=k, shuffle=True, random_state=SEED)
	return train, test, target, target_nnet, id, cv, encoder
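# Hedged usage sketch (not part of the original function): the returned `cv`
# yields (train_index, valid_index) pairs over the scaled training frame.
# The path and fold count below are assumptions.
#
# train, test, target, target_nnet, id, cv, encoder = loadData(path="../data/", k=5)
# for train_idx, valid_idx in cv:
#     X_tr, X_va = train.iloc[train_idx], train.iloc[valid_idx]
#     y_tr, y_va = target.iloc[train_idx], target.iloc[valid_idx]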
Example #4
    def transformTestData(self, train_data, test_data):
        #Select the right features for both training and testing data
        X_train, y_train = self.__selectRelevantFeatures(train_data)
        X_test, y_test = self.__selectRelevantFeatures(test_data)

        #Transform categorical variables into integer labels
        martial_le = LabelEncoder()
        occupation_le = LabelEncoder()
        relationship_le = LabelEncoder()
        race_le = LabelEncoder()
        sex_le = LabelEncoder()
        transformers = [martial_le, occupation_le, relationship_le, race_le, sex_le]

        for i in range(len(transformers)):
            X_train[:,i] = transformers[i].fit_transform(X_train[:,i])
            X_test[:,i] = transformers[i].transform(X_test[:,i])

        #Dummy code categorical variables
        dummy_code = OneHotEncoder(categorical_features = range(5))
        X_train = dummy_code.fit_transform(X_train).toarray()
        X_test = dummy_code.transform(X_test).toarray()

        #Normalize all features
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        #Encode y
        class_le = LabelEncoder()
        y_train = class_le.fit_transform(y_train)
        y_test = class_le.transform(y_test)
        #print class_le.transform(["<=50K", ">50K"])

        return X_train, X_test, y_train, y_test
Example #5
    def clustering_approach(self):
        '''
        Cluster user data using various clustering algos
        IN: self.df_full and self.labels
        OUT: results to stdout
        '''
        print 'Fitting clustering model'
        X = self.df_full.values
        y = self.labels

        # scale data
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

        # KMeans
        km_clf = KMeans(n_clusters=2, n_jobs=6)
        km_clf.fit(X)

        # swap labels as super-users are in cluster 0 (messy!!)
        temp = y.apply(lambda x: 0 if x == 1 else 1)
        print '\nKMeans clustering: '
        self.analyse_preds(temp, km_clf.labels_)

        # Agglomerative clustering
        print '\nAgglomerative clustering approach: '
        ac_clf = AgglomerativeClustering()
        ac_labels = ac_clf.fit_predict(X)
        self.analyse_preds(y, ac_labels)

        return None
def check_clustering(name, Alg):
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples, n_features = X.shape
    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)

    # fit
    alg.fit(X)
    # with lists
    alg.fit(X.tolist())

    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)
    # fit another time with ``fit_predict`` and compare results
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)
def check_transformer_general(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    _check_transformer(name, Transformer, X, y)
    _check_transformer(name, Transformer, X.tolist(), y.tolist())
Example #8
def cross_valid(data, classifier, x_cols, y_col, **kwargs):
	# Do train-test split for cross-validation
	size = len(data)
	kf = train_test_split(size)
	y_pred = np.zeros(size)
	y_pred_prob = np.zeros(size)
	y = data[y_col].as_matrix().astype(np.float)
	totaltime_train = 0
	totaltime_test = 0
	for train_index, test_index in kf:
		# Fill in missing values
		df = data.copy()
		df = fill_missing_median(df, train_index)
		# Transform and normalize
		X = df[x_cols].as_matrix().astype(np.float)
		scaler = StandardScaler()
		X = scaler.fit_transform(X)
		# Build classifier and yield predictions
		y_pred[test_index], y_pred_prob[test_index], train_time, test_time \
		= model(X, y, train_index, test_index, classifier, **kwargs)
		totaltime_train += train_time
		totaltime_test += test_time
	avgtime_train = totaltime_train/len(kf)
	avgtime_test = totaltime_test/len(kf)
	return y, y_pred, y_pred_prob, avgtime_train, avgtime_test
Example #9
def buildTreeRegressor(predictorColumns, structurestable = 'structures.csv',  targetcolumn = 'c_a', md = None):
    """
    Build a random forest regressor to predict a structure feature from compositional data. Returns the model trained on all data, a table of true vs. predicted values from a held-out split, and a cross-validated mean_absolute_error score (in that order).
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    s = StandardScaler()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df[targetcolumn].values

    rfr = RandomForestRegressor(max_depth = md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfr.fit(X_train,y_train)
    y_predict = rfr.predict(X_test)
    
    t = pd.DataFrame({'True':y_test, 'Predicted':y_predict})
    
    rfr.fit(X, y)

    return rfr, t, round(acc,2)
Example #10
def buildTreeClassifier(predictorColumns, structurestable = 'structures.csv',  targetcolumn = 'pointGroup', md = None):
    """
    Build a random forest classifier to predict a structure feature from compositional data. Returns the model trained on all data, a confusion matrix computed on a held-out split, an average cross-validated accuracy score, and the fitted label encoder.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    s = StandardScaler()
    le = LabelEncoder()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = le.fit_transform(df[targetcolumn].values)

    rfc = RandomForestClassifier(max_depth = md)
    acc = mean(cross_val_score(rfc, X, y))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfc.fit(X_train,y_train)
    y_predict = rfc.predict(X_test)
    cm = confusion_matrix(y_test, y_predict)
    
    cm = pd.DataFrame(cm, columns=le.classes_, index=le.classes_)

    rfc.fit(X, y)

    return rfc, cm, round(acc,2), le
Example #11
def buildCoordinationTreeRegressor(predictorColumns, element, coordinationDir = 'coordination/', md = None):
    """
    Build a coordination predictor for a given element from compositional data of structures containing that element. Returns a model trained on all data, a table of true vs. predicted values from a held-out split, and a cross-validated mean_absolute_error score.
    """
    try:
        df = pd.read_csv(coordinationDir + element + '.csv')
    except Exception:
        print 'No data for ' + element
        return None, None, None
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    if(len(df) < 4):
        print 'Not enough data for ' + element
        return None, None, None
    s = StandardScaler()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df['avgCoordination'].values

    rfr = RandomForestRegressor(max_depth = md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfr.fit(X_train,y_train)
    y_predict = rfr.predict(X_test)
    
    t = pd.DataFrame({'True':y_test, 'Predicted':y_predict})
    
    rfr.fit(X, y)

    return rfr, t, round(acc,2)
Example #12
def main():
    
    t0 = time.time() # start time

    # output files path
    TRAINX_OUTPUT = "../../New_Features/train_x_processed.csv"
    TEST_X_OUTPUT = "../../New_Features/test__x_processed.csv"
    # input files path
    TRAIN_FILE_X1 = "../../ML_final_project/sample_train_x.csv"
    TRAIN_FILE_X2 = "../../ML_final_project/log_train.csv"
    TEST__FILE_X1 = "../../ML_final_project/sample_test_x.csv"
    TEST__FILE_X2 = "../../ML_final_project/log_test.csv"
    # load files
    TRAIN_DATA_X1 = np.loadtxt(TRAIN_FILE_X1, delimiter=',', skiprows=1, usecols=(range(1, 18)))
    TEST__DATA_X1 = np.loadtxt(TEST__FILE_X1, delimiter=',', skiprows=1, usecols=(range(1, 18)))
    TRAIN_DATA_X2 = logFileTimeCount(np.loadtxt(TRAIN_FILE_X2, delimiter=',', skiprows=1, dtype=object))
    TEST__DATA_X2 = logFileTimeCount(np.loadtxt(TEST__FILE_X2, delimiter=',', skiprows=1, dtype=object))
    # combine files
    TRAIN_DATA_X0 = np.column_stack((TRAIN_DATA_X1, TRAIN_DATA_X2))
    TEST__DATA_X0 = np.column_stack((TEST__DATA_X1, TEST__DATA_X2))
    # data preprocessing
    scaler = StandardScaler()
    TRAIN_DATA_X = scaler.fit_transform(TRAIN_DATA_X0)
    TEST__DATA_X = scaler.transform(TEST__DATA_X0)
    # output processed files
    outputXFile(TRAINX_OUTPUT, TRAIN_DATA_X)
    outputXFile(TEST_X_OUTPUT, TEST__DATA_X)

    t1 = time.time() # end time
    print "...This task costs " + str(t1 - t0) + " second."
def knn(x_train, y_train, x_valid):
    x_train=np.log(x_train+1)
    x_valid=np.log(x_valid+1)

    where_are_nan = np.isnan(x_train)
    where_are_inf = np.isinf(x_train)
    x_train[where_are_nan] = 0
    x_train[where_are_inf] = 0
    where_are_nan = np.isnan(x_valid)
    where_are_inf = np.isinf(x_valid)
    x_valid[where_are_nan] = 0
    x_valid[where_are_inf] = 0

    scale=StandardScaler()
    scale.fit(x_train)
    x_train=scale.transform(x_train)
    x_valid=scale.transform(x_valid)

    #pca = PCA(n_components=10)
    #pca.fit(x_train)
    #x_train = pca.transform(x_train)
    #x_valid = pca.transform(x_valid)

    kneighbors=KNeighborsClassifier(n_neighbors=200,n_jobs=-1)
    knn_train, knn_test = stacking(kneighbors, x_train, y_train, x_valid, "knn")
    return knn_train, knn_test, "knn"
Example #14
def test_transformers_data_not_an_array():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1

    for name, Transformer in transformers:
        # XXX: some transformers are transforming the input
        # data. This is a bug that we'll fix later. Right now we copy
        # the data each time
        this_X = NotAnArray(X.copy())
        this_y = NotAnArray(np.asarray(y))
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        # And these want multivariate output
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            continue
        yield check_transformer, name, Transformer, this_X, this_y
def load_train_data(path):
    print("Loading Train Data")
    df = pd.read_csv(path)
    
    
    # Remove line below to run locally - Be careful you need more than 8GB RAM 
    rows = np.random.choice(df.index.values, 40000)
    df = df.ix[rows]
    # df = df.sample(n=40000)
    # df = df.loc[df.index]
    
    labels = df.target

    df = df.drop('target',1)
    df = df.drop('ID',1)
    
    # Junk cols - Some feature engineering needed here
    df = df.fillna(-1)

    X = df.values.copy()
    
    np.random.shuffle(X)

    X = X.astype(np.float32)
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y, encoder, scaler
Example #16
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
Example #17
def linregress(X_train, X_test, y_train, y_test):
    coef = []
    for col in X_train.columns.tolist():
        # fit on a single (scaled) column; pass a 2-D frame so StandardScaler accepts it
        X = StandardScaler().fit_transform(X_train[[col]])
        lr = LinearRegression()
        lr.fit(X, y_train)
        coef.append([col, lr.coef_])
    coef = sorted(coef, key=lambda x: x[1])[::-1]
    nos = [x[1] for x in coef]
    labs = [x[0] for x in coef]
    for lab in labs:
        if lab == 'doubles':
            labs[labs.index(lab)] = '2B'
        elif lab == 'triples':
            labs[labs.index(lab)] = '3B'
        elif lab == 'Intercept':
            idx = labs.index('Intercept')
            labs.pop(idx)
            nos.pop(idx)
    labs = [lab.upper() for lab in labs]
    x = range(len(nos))
    plt.plot(x,nos, lw=2, c='b')
    plt.xticks(x, labs)
    plt.title('Linear Regression Coefficients (Win Percentage)')
    plt.savefig('images/coefficients.png')
    plt.show()
    print labs
Example #18
def monary_load(start=0,stop=-1, find_args={}, species_to_retrieve=[]):
	if species_to_retrieve == []:
		species_to_retrieve = species
	else:
		species_to_retrieve = [s for s in species_to_retrieve if s in species]
	query = {}
	for s in species_to_retrieve:
		query[s] = {"$gt": 0}
	find_args["$or"] = [{k:query[k]} for k in query.keys()]
	with Monary("127.0.0.1") as monary:
		out = monary.query(
			"creeval",
			collection,
			find_args,
			num_metadata+cat_metadata+species_to_retrieve,
			["float32"] * (len(num_metadata)+len(cat_metadata)+len(species_to_retrieve)),
			limit=(stop-start),
			offset=start
		)
	for i,col in enumerate(out[0:len(num_metadata+cat_metadata)]):
		out[i] = np.ma.filled(col,np.ma.mean(col))
		#if any(np.isnan(col)):
		#	print col
	out = np.ma.row_stack(out).T
	X = out[:,0:len(num_metadata+cat_metadata)]
	y = out[:,len(num_metadata+cat_metadata):]
	y = (y > 0).astype(int)

	scaler = StandardScaler().fit(X)
	X = scaler.transform(X)
	pickle.dump(scaler,open(collection+"_scaler.pkl","wb"))
	y = np.asarray(y)

	return DenseDesignMatrix(X=X,y=y)
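# Hedged sketch of the implied counterpart step (not in the original): at
# prediction time the pickled scaler can be reloaded so new data receives the
# same standardization. `X_new` is a placeholder name.
#
# scaler = pickle.load(open(collection + "_scaler.pkl", "rb"))
# X_new = scaler.transform(X_new)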
class PCATransform(BaseEstimator, TransformerMixin):
    """
    PCA with an argument that allows the user to skip the transform
    altogether.
    """
    def __init__(self, n_components=.1, skip=False, whiten=False, standard_scalar=True):
        print 'PCA!'
        self.n_components = n_components
        self.skip = skip
        self.whiten = whiten
        self.standard_scalar = standard_scalar

    def fit(self, X, y=None):
        if not self.skip:
            if self.standard_scalar:
                self.std_scalar = StandardScaler().fit(X)
                X = self.std_scalar.transform(X)
            self.pca = PCA(n_components=self.n_components, whiten=self.whiten).fit(X)
        return self

    def transform(self, X, y=None):
        if not self.skip:
            if self.standard_scalar:
                X = self.std_scalar.transform(X)
            return self.pca.transform(X)
        return X
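# A minimal usage sketch (not from the original source): because `skip` is a
# constructor parameter, PCATransform can sit in a Pipeline and the PCA step
# can be toggled via set_params, e.g. inside a grid search. The Pipeline and
# LogisticRegression below are assumptions for illustration only.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pca_pipe = Pipeline([
    ('pca', PCATransform(n_components=0.9, skip=False)),
    ('clf', LogisticRegression()),
])
# pca_pipe.set_params(pca__skip=True)  # turns the PCA step into a pass-through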
Example #20
def main(trainFile, testFile, outputFile, mode, classifier):
    """
    input:
        1. trainFile: the training data features file
        2. testFile: the test data file
        3. outputFile: the file where the output of the test data has to be written
        4. mode: passed on to the classifier trainer; when set to 1, predictions
           are also generated for the test data and written to outputFile
        5. classifier: the classifier to be used
    """
    # scale the input data
    scaler = StandardScaler()
    trainingData = getData(trainFile)
    trainX = trainingData[0]
    trainY = trainingData[1]
    trainX = scaler.fit_transform(trainX)
    testX = []
    testY = []
    # train the classifier
    clf = trainClassifier(trainX, trainY, classifier, mode)
    # if test mode, get test data and predict the output classes
    if mode == 1:
        testData = getData(testFile)
        testX = testData[0]
        testY = testData[1]
        testX = scaler.transform(testX)
        actY = test(testX, clf)
        testY = testY.reshape(len(testY), 1)
        # write the predicted class probabilities
        output = np.concatenate((testY, actY), axis = 1)
        np.savetxt(outputFile, output, fmt='%s', delimiter=',')
Example #21
def process(discrete, cont):
  # Create discrete and continuous data matrices
  discrete_X = np.array(discrete)
  cont_X = np.array(cont)

  # Impute discrete values
  imp = Imputer(strategy='most_frequent')
  discrete_X = imp.fit_transform(discrete_X)

  # Impute continuous values
  imp_c = Imputer(strategy='mean')
  cont_X = imp_c.fit_transform(cont_X)

  # Discrete basis representation
  enc = OneHotEncoder()
  enc.fit(discrete_X)
  discrete_X = enc.transform(discrete_X).toarray()

  # Continuous scaling
  scaler = StandardScaler()
  scaler.fit(cont_X)
  cont_X = scaler.transform(cont_X)

  # Merge to one array
  X = np.concatenate((discrete_X, cont_X), axis=1)
  return X
Example #22
class Classifier(BaseEstimator):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.clf = None        
 
    def fit(self, X, y):        
        X = self.scaler.fit_transform(X.astype(np.float32))              
        y = self.label_encoder.fit_transform(y).astype(np.int32)
        dtrain = xgb.DMatrix( X, label=y.astype(np.float32))
        
        param = {'objective':'multi:softprob', 'eval_metric':'mlogloss'}
        param['nthread'] = 4
        param['num_class'] = 9
        param['colsample_bytree'] = 0.55
        param['subsample'] = 0.85
        param['gamma'] = 0.95
        param['min_child_weight'] = 3.0
        param['eta'] = 0.05
        param['max_depth'] = 12
        num_round = 400 # to be faster ??  
        #num_round = 820
        
        self.clf = xgb.train(param, dtrain, num_round)  
 
    def predict(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)       
        label_index_array = np.argmax(self.clf.predict(dtest), axis=1)
        return self.label_encoder.inverse_transform(label_index_array)
 
    def predict_proba(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
Example #23
def normalize( training_data, test_data ):
	scaler = StandardScaler()
	values = scaler.fit_transform( training_data )
	training_data = pd.DataFrame( values, columns=training_data.columns, index=training_data.index )
	values = scaler.transform( test_data )
	test_data = pd.DataFrame( values, columns=test_data.columns, index=test_data.index )
	return training_data, test_data 
Example #24
def load_data_csv_advanced(datafile):
    """
    Loads data from given CSV file. The first line in the given CSV file is expected to be the names of the columns.
    :param datafile: path of the file
    :return: a NumPy array containing a data point in each row
    """

    # File format for CSV file. For example, setting _X_COLUMN to 'x' means that x coordinates of geographical location
    # will be at the column named 'x' in the CSV file.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'

    data = pd.read_csv(datafile)

    # Normalize
    scaler = StandardScaler()
    scaler.fit(data[[_COLUMN_X, _COLUMN_Y]])
    data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]])

    #  Get feature vector names by removing "x" and "y"
    feature_vector_names = data.columns.difference([_COLUMN_X, _COLUMN_Y])
    data_coords = data[[_COLUMN_X, _COLUMN_Y]].values

    result = {"coordinates": data_coords}

    for feature in feature_vector_names:
        data_words = [[e.strip() for e in venue_data.split(",")] for venue_data in data[feature].values.flatten().tolist()]

        result[feature] = data_words

    return sparsify_data(result, None, None), scaler  # None for both params since SVD is not used
def lassoRegression(X,y):

    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Lasso Regression")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)

    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    lassoRegression = Lasso(alpha=1e-7)
    lassoRegression.fit(scaled_Xp,y)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0,2,0.01)
    dummyX = dummyX.reshape((dummyX.shape[0],1))
    dummyXp = polynomialFeatures.fit_transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = lassoRegression.predict(scaled_dummyXp)

    outputFILE = 'plot-lassoRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h = 6.0, w = 10.0)
    ax.axis([0,2,0,15])
    ax.scatter(X,y,color="black",s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(filename = outputFILE, bbox_inches='tight', pad_inches=0.2, dpi = 600)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
def prepare_features(data, enc=None, scaler=None):
    '''
    One-hot encode all boolean/string (categorical) features,
    and shift/scale integer/float features
    '''
    # X needs to contain only non-negative integers
    bfs = data['bfeatures'] + 1
    sfs = data['sfeatures'] + 1
    
    # Shift/scale integer and float features to have mean=0, std=1
    ifs = data['ifeatures']
    ffs = data['ffeatures']
    x2 = np.hstack((ifs,ffs))
    if scaler is None:
        scaler = StandardScaler()
        x2 = scaler.fit_transform(x2)
        print "Training features have mean: %s" % scaler.mean_
        print "and standard deviation: %s" % scaler.std_
    else:
        x2 = scaler.transform(x2, copy=False)
        
    # one-hot encode categorical features
    X = np.hstack((bfs,sfs,x2))
    categorical = np.arange(bfs.shape[1]+sfs.shape[1])
    if enc is None:
        enc = OneHotEncoder(n_values='auto', categorical_features=categorical)
        X = enc.fit_transform(X)
        print "One-hot encoded features have dimension %d" % X.shape[1]
    else:
        X = enc.transform(X)
    return X, enc, scaler
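# Hedged usage sketch (not in the original): fit the encoder and scaler on the
# training split once, then pass them back in so the test split receives the
# identical transformation. `train_data`/`test_data` are assumed to be dicts
# with the 'bfeatures'/'sfeatures'/'ifeatures'/'ffeatures' keys used above.
#
# X_train, enc, scaler = prepare_features(train_data)
# X_test, _, _ = prepare_features(test_data, enc=enc, scaler=scaler)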
Example #27
def load_data_csv(datafile):
    """
    Loads data from given CSV file. The first line in the given CSV file is expected to be the names of the columns.
    :param datafile: path of the file
    :return: a NumPy array containing a data point in each row
    """

    # File format for CSV file. For example, setting _X_COLUMN to 'x' means that x coordinates of geographical location
    # will be at the column named 'x' in the CSV file.
    # This will be useful later when we start adding more features.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'
    _COLUMN_W = 'color'

    data = pd.read_csv(datafile)

    # Normalize
    scaler = StandardScaler()
    scaler.fit(data[[_COLUMN_X, _COLUMN_Y]])
    data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]])

    data_coords = data[[_COLUMN_X, _COLUMN_Y]].values
    data_words = [[e] for e in data[[_COLUMN_W]].values.flatten().tolist()]

    data = {"coordinates": data_coords, "words": data_words}

    return sparsify_data(data, None, None), scaler  # None for both params since SVD is not used
def run_model( model, model_name, X, Y, X_val):

    new_values = [ [x] for x in range(len(X))]
    X = numpy.append(X, new_values, 1)
    from sklearn.preprocessing import StandardScaler # I have a suspicion that the classifier might work better without the scaler
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    max_time_val = X[-1][-1] *2 - X[-2][-1]

    Y = make_black_maps_class(Y)
    # Load validation data
    model.fit(X, Y)

    new_values = [ [max_time_val] for x in range(len(X_val))]
    X_val = numpy.append(X_val, new_values, 1)

    # Now predict validation output
    Y_pred = model.predict(X_val)

    # Crop impossible values
    Y_pred[Y_pred < 0] = 0
    Y_pred[Y_pred > 600] = 600

    savetxt('final_pred_y{0}.csv'.format(model_name), Y_pred, delimiter=',')

    black_map_count = 0
    for y in Y_pred:
        if y == 600:
            black_map_count += 1

    print black_map_count, model_name
    sys.stdout.flush()
Example #29
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
def train_and_test(train_books, test_books, train, scale=True):
    X_train, y_train, cands_train, features = get_pair_data(train_books, True)
    X_test, y_test, cands_test, features = get_pair_data(test_books)

    scaler = None
    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    print sum(y_train)*0.1/len(y_train)
    print 'Start training'
    print X_train.shape
    clf = train(X_train, y_train)
    print 'Done training'
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    '''
    # print performance for training books
    print "--------------Traning data-------------"
    train_perf = evaluate_books(clf, train_books, scaler, evaluate_pair)

   # print performance for testing books
    print "\n"
    print "--------------Testing data-------------"
    test_perf = evaluate_books(clf, test_books, scaler, evaluate_pair)
    '''
    print 'Train Non-unique Precision:', precision(y_train_pred, y_train), 'Non-unique Recall:', recall(y_train_pred, y_train)
    print 'Test Non-unique Precision:', precision(y_test_pred, y_test), 'Recall:', recall(y_test_pred, y_test)
    return clf, scaler, X_train, y_train, X_test, y_test
Example #31
X.columns[feats.get_support()]





############ SEQUENTIAL FEATURE SELECTION #################

#Includes Forward Selection vs Backward selection
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_neighbors=4, n_jobs=-1))])


# Forward Selection

selector = SequentialFeatureSelector(knn_pipe, scoring='accuracy', forward=True,
                                     floating=False, k_features=3,
                                     verbose=2, n_jobs=-1, cv=5)

selector.fit(X=X, y=y)


selector.subsets_

selector.k_feature_idx_
selector.k_feature_names_
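# The run above is forward selection; the backward variant mentioned in the
# header comment only needs the `forward` flag flipped (same knn_pipe, X, y):
backward_selector = SequentialFeatureSelector(knn_pipe, scoring='accuracy',
                                              forward=False, floating=False,
                                              k_features=3, verbose=2,
                                              n_jobs=-1, cv=5)
backward_selector.fit(X=X, y=y)

backward_selector.k_feature_idx_
backward_selector.k_feature_names_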
Example #32
def run_sim(n, path, simtitle, REVOLUTION, PROBLEM, seed):

    POP_SIZE = 100
    MAX_GENERATION = 50
    MAX_EPISODE = 100
    MAX_EVAL = 1000
    STOPPING_RULE = 'max_eval'
    MUTATION_RATE = 0.1
    MUTATION_U = 0.
    MUTATION_ST = 0.2
    REF = [1., 1.]
    MINIMIZE = True
    VERBOSE = True
    X_SCALER = StandardScaler()

    # Set global numpy random_state
    np.random.seed(seed)

    theo = PROBLEM.solutions()

    # Instantiate a population
    pop = MOPRISM(size=POP_SIZE,
                  problem=PROBLEM,
                  max_generation=MAX_GENERATION,
                  max_episode=MAX_EPISODE,
                  reference=REF,
                  minimize=MINIMIZE,
                  stopping_rule=STOPPING_RULE,
                  max_eval=MAX_EVAL,
                  mutation_rate=MUTATION_RATE,
                  revolution=REVOLUTION,
                  embedded_ea=MOEAD,
                  verbose=VERBOSE,
                  no_improvement_step_tol=3)

    pop.selection_fun = pop.compute_front
    pop.mutation_fun = gaussian_mutator
    pop.crossover_fun = random_crossover

    # Parametrization
    params_ea = {
        'u': MUTATION_U,
        'st': MUTATION_ST,
        'trial_method': 'lhs',
        'trial_criterion': 'cm'
    }

    kernel = CubicKernel
    tail = LinearTail

    params_surrogate = \
        {'kernel': kernel,
         'tail': tail,
         'maxp': MAX_EVAL + POP_SIZE,
         'eta': 1e-8,
         }

    # ===============================Initialization============================
    pop.config_surrogate(typ='rbf',
                         params=params_surrogate,
                         n_process=1,
                         X_scaler=X_SCALER,
                         warm_start=True)

    pop.config_gap_opt(at='least_crowded',
                       radius=0.1,
                       size=POP_SIZE,
                       max_generation=MAX_GENERATION,
                       selection_fun=None,
                       mutation_fun=None,
                       mutation_rate=None,
                       crossover_fun=random_crossover,
                       trial_method='lhs',
                       trial_criterion='cm',
                       u=0.,
                       st=0.2)

    pop.config_sampling(methods='default',
                        sizes='default',
                        rate='default',
                        candidates='default')

    pop.run(params_ea=params_ea, params_surrogate=params_surrogate, theo=theo)

    # ============================= Save Results ================================ #
    # path to save
    directory = path + simtitle + '/' + str(n) + '/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    pop.render_features(pop.true_front).tofile(directory + 'xs.dat')
    pop.render_targets(pop.true_front).tofile(directory + 'fs.dat')
    np.array(pop.hypervol_diff).tofile(directory + 'hv_diff.dat')
    np.array(pop.hypervol_cov_effect).tofile(directory + 'hv_cov.dat')
    np.array(pop.hypervol_index).tofile(directory + 'hv_ind.dat')

    # ================================Visualization============================== #
    # plot_res(pop=pop, ref=theo, directory=directory)

    return pop
Example #33
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    dt_heart = pd.read_csv("./data/heart.csv")
    print(dt_heart.head(5))

    dt_features = dt_heart.drop(["target"], axis=1)
    dt_target = dt_heart["target"]
    dt_features = StandardScaler().fit_transform(dt_features)

    X_train, X_test, y_train, y_test = train_test_split(dt_features,
                                                        dt_target,
                                                        test_size=0.3,
                                                        random_state=42)

    print(X_train.shape, y_train.shape)

    pca = PCA(n_components=3)
    pca.fit(X_train)

    ipca = IncrementalPCA(n_components=3, batch_size=10)
    ipca.fit(X_train)

    #plt.plot(range(len(pca.explained_variance_)), pca.explained_variance_ratio_)
def one_day_window_model():
    files = glob.glob('../DATA/A*/A*/*.json')
    sentimentAnalyzer = SentimentIntensityAnalyzer()

    with open('tweet_sentiment.csv', 'w+') as sfl:
        for file in files:
            with open(file) as fl:
                lines = fl.readlines()
                tweets = json.loads(lines[0])
                for tweet in tweets:
                    date = time.strftime('%Y/%m/%d',
                                         time.localtime(int(tweet['time'])))
                    scores = sentimentAnalyzer.polarity_scores(tweet['text'])
                    sfl.write(date + ',' + str(scores['pos']) + ',' +
                              str(scores['neg']) + ',' + str(scores['neu']) +
                              ',' + str(scores['compound']))
                    sfl.write('\n')

    prices = pd.read_csv('../DATA/CHARTS/APPLE1440.csv').values
    all_tweets = pd.read_csv('tweet_sentiment.csv').values

    with open('features.csv', 'w+') as fl:
        for price in prices:
            current_date = datetime.strptime(price[0], '%Y.%m.%d').date()
            previous_date = current_date - timedelta(days=1)
            tweets = all_tweets[all_tweets[:, 0] == previous_date.strftime(
                '%Y/%m/%d')]

            if len(tweets) != 0:
                if float(price[5]) > float(price[2]):
                    label = "1"
                else:
                    label = "0"

                for tweet in tweets:
                    fl.write(price[0] + ',' + str(tweet[1]) + ',' +
                             str(tweet[2]) + ',' + str(tweet[3]) + ',' +
                             str(tweet[4]) + ',' + label)
                    fl.write('\n')

    dataset = pd.read_csv('features.csv')
    X = dataset.iloc[:, [1, 2, 3, 4]].values
    y = dataset.iloc[:, 5].values

    scaler = StandardScaler()
    X[:, :] = scaler.fit_transform(X[:, :])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    rf = RandomForestClassifier(n_estimators=500,
                                criterion='entropy',
                                max_depth=3)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print("Random Forest Accuracy: " +
          str((cm[0, 0] + cm[1, 1]) /
              (cm[0, 0] + cm[1, 1] + cm[1, 0] + cm[0, 1])))

    svc = SVC(kernel='poly', random_state=0)
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print("SVM Accuracy: " + str((cm[0, 0] + cm[1, 1]) /
                                 (cm[0, 0] + cm[1, 1] + cm[1, 0] + cm[0, 1])))

    mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100),
                        random_state=10)
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print("MLP Accuracy: " + str((cm[0, 0] + cm[1, 1]) /
                                 (cm[0, 0] + cm[1, 1] + cm[1, 0] + cm[0, 1])))
from sklearn.preprocessing import MinMaxScaler

# mms is assumed to be a MinMaxScaler; it is not defined in this snippet.
mms = MinMaxScaler()

X_train_norm = mms.fit_transform(X_train)

print(X_train_norm)

X_test_norm = mms.transform(X_test)

print(X_test)

# Standardization is essential for many linear models

print('standardized:', (ex - ex.mean()) / ex.std())

from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()

X_train_std = stdsc.fit_transform(X_train)

X_test_std = stdsc.transform(X_test)

# Selecting meaningful features

# Regularization

print('normalized:', (ex - ex.min()) / (ex.max() - ex.min()))

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l1', C=1.0)
lr.fit(X_train_std, y_train)
import DatabaseConnection as dc

# Connect to database and get the data
home_data = dc.download_housing_data(dc.connect())

# Filter out outliers in the dataset based on the outlier graph from preprocessing
# indicating that the majority of the data is in the lower end of the price range.
home_data = home_data[home_data['price'] < 1250000]

# Visualize the different groupings of houses in the dataset using kmeans cluster analysis.
cata_home_data = home_data.copy() # copy of the dataset to maintain integrity
price_data = cata_home_data['price']
cata_home_data.drop(['price'],axis=1,inplace=True)

# Standardize the data.
scaler = StandardScaler()
scaler.fit(cata_home_data)
scaled_data = scaler.transform(cata_home_data)

# Determine the optimal number of clusters
def show_elbow_plot():
    """
    Displays the elbow graph used to choose the optimal number of clusters for the final cluster graph.
    """

    test_cluster_max = 15
    kmeans_tests = [KMeans(n_clusters=i) for i in range(1, test_cluster_max)]
    score = [kmeans_tests[i].fit(scaled_data).score(scaled_data) for i in range(len(kmeans_tests))]

    # Plot the curve
    elbow_plot = plt.plot(range(1, test_cluster_max),score)
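# Hedged follow-up (not in the original snippet): once the elbow plot suggests
# a cluster count, fit KMeans with that value on the standardized data and
# keep the labels. The choice of 5 clusters is an assumption for illustration.
final_kmeans = KMeans(n_clusters=5)
cluster_labels = final_kmeans.fit_predict(scaled_data)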
Example #37
#%%
# lets do PCA
sns.set_style("darkgrid")


colors = ['#e6194b',
          '#0082c8',
          '#d2f53c',
          '#3cb44b',
          '#f032e6',
          '#911eb4',
          '#46f0f0',
          '#f58231', 
          '#008080',
          '#ffe119']
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)

# visualize all attributes in the data set
num_components = 2
pca = PCA(n_components = num_components)
pca.fit(X_train)
print(pca.explained_variance_ratio_)
print('var', sum(pca.explained_variance_ratio_)) 

total_explained_variance = sum(pca.explained_variance_ratio_)

train_pca = pca.transform(X_train) # make this more pretty l8ter
records, attributes = np.shape(train_pca)
train_pca_ones = np.ones((records, attributes + 1))
train_pca_ones[:,1:] = train_pca
Example #38
def validateRF():
    """
    Run k-fold cross-validation for the random forest regression model
    """

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraRFValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 119
    y = 120

    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        #filter only .csv files
        tgNames = []
        for file in glob.glob("*.csv"):
            tgNames.append(file)

        tg_name = sorted(tgNames)[tg]
        print(tg_name)

        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            print("this tide gauge is already taken care of")
            return "file already analyzed!"

        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        #apply 10 fold cross validation
        kf = KFold(n_splits=10, random_state=29)

        metric_corr = []
        metric_rmse = []
        #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #train regression model
            rf= RandomForestRegressor(n_estimators = 50, random_state = 101, \
                                      min_samples_leaf = 1)
            rf.fit(X_train, y_train)

            #predictions
            predictions = rf.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)

            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                print()
                metric_rmse.append(
                    np.sqrt(metrics.mean_squared_error(y_test, predictions)))

        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')

        #original size and pca size of matrix added
        new_df = pd.DataFrame(
            [tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis=0)

        #save df as cs - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)
Example #39
def train_and_predict_dragons(t,
                              y_unscaled,
                              x,
                              targeted_regularization=True,
                              output_dir='',
                              knob_loss=dragonnet_loss_binarycross,
                              ratio=1.,
                              dragon='',
                              val_split=0.2,
                              batch_size=64):
    verbose = 0
    y_scaler = StandardScaler().fit(y_unscaled)
    y = y_scaler.transform(y_unscaled)
    train_outputs = []
    test_outputs = []

    if dragon == 'tarnet':
        dragonnet = make_tarnet(x.shape[1], 0.01)

    elif dragon == 'dragonnet':
        print("I am here making dragonnet")
        dragonnet = make_dragonnet(x.shape[1], 0.01)

    metrics = [
        regression_loss, binary_classification_loss, treatment_accuracy,
        track_epsilon
    ]

    if targeted_regularization:
        loss = make_tarreg_loss(ratio=ratio, dragonnet_loss=knob_loss)
    else:
        loss = knob_loss

    # for reproducing the IHDP experiment

    i = 0
    tf.random.set_seed(i)
    np.random.seed(i)
    # print()
    train_index, test_index = train_test_split(np.arange(x.shape[0]),
                                               random_state=1)
    test_index = train_index

    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    t_train, t_test = t[train_index], t[test_index]

    yt_train = np.concatenate([y_train, t_train], 1)

    import time
    start_time = time.time()

    dragonnet.compile(optimizer=Adam(lr=1e-3), loss=loss, metrics=metrics)

    adam_callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='val_loss', patience=2, min_delta=0.),
        ReduceLROnPlateau(monitor='loss',
                          factor=0.5,
                          patience=5,
                          verbose=verbose,
                          mode='auto',
                          min_delta=1e-8,
                          cooldown=0,
                          min_lr=0)
    ]

    dragonnet.fit(x_train,
                  yt_train,
                  callbacks=adam_callbacks,
                  validation_split=val_split,
                  epochs=100,
                  batch_size=batch_size,
                  verbose=verbose)

    sgd_callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='val_loss', patience=40, min_delta=0.),
        ReduceLROnPlateau(monitor='loss',
                          factor=0.5,
                          patience=5,
                          verbose=verbose,
                          mode='auto',
                          min_delta=0.,
                          cooldown=0,
                          min_lr=0)
    ]

    sgd_lr = 1e-5
    momentum = 0.9
    dragonnet.compile(optimizer=SGD(lr=sgd_lr,
                                    momentum=momentum,
                                    nesterov=True),
                      loss=loss,
                      metrics=metrics)
    dragonnet.fit(x_train,
                  yt_train,
                  callbacks=sgd_callbacks,
                  validation_split=val_split,
                  epochs=300,
                  batch_size=batch_size,
                  verbose=verbose)

    elapsed_time = time.time() - start_time
    print("***************************** elapsed_time is: ", elapsed_time)

    yt_hat_test = dragonnet.predict(x_test)
    yt_hat_train = dragonnet.predict(x_train)

    test_outputs += [
        _split_output(yt_hat_test, t_test, y_test, y_scaler, x_test,
                      test_index)
    ]
    train_outputs += [
        _split_output(yt_hat_train, t_train, y_train, y_scaler, x_train,
                      train_index)
    ]
    K.clear_session()

    return test_outputs, train_outputs
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10,
                                    criterion='entropy',
                                    random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
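# Presumably the intended next step (the original snippet ends at the import):
cm = confusion_matrix(y_test, y_pred)
print(cm)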
#### Initialize Dataset ####

data_df = pd.read_csv(settings['dataset_file'])

data = {
    'outcome_name': data_df.columns[0],
    'variable_names': data_df.columns[1:].tolist(),
    'X': data_df.iloc[:, 1:],
    'y': data_df.iloc[:, 0],
    'scaler': None,
    }

if settings['normalize_data']:
    from sklearn.preprocessing import StandardScaler
    data['scaler'] = StandardScaler(copy = True, with_mean = True, with_std = True)
    data['X_train'] = pd.DataFrame(data['scaler'].fit_transform(data['X'], data['y']), columns = data['X'].columns)
else:
    data['X_train'] = data['X']

#### Initialize Actionset ####

default_bounds = (0.1, 99.9, 'percentile')
custom_bounds = None
immutable_variables = []


if settings['data_name'] == 'credit':

    immutable_names = ['Female', 'Single', 'Married']
    immutable_names += list(filter(lambda x: 'Age' in x or 'Overdue' in x, data['variable_names']))
Example #42
def train_and_predict_ned(t,
                          y_unscaled,
                          x,
                          targeted_regularization=True,
                          output_dir='',
                          knob_loss=dragonnet_loss_binarycross,
                          ratio=1.,
                          dragon='',
                          val_split=0.2,
                          batch_size=64):
    verbose = 0
    y_scaler = StandardScaler().fit(y_unscaled)
    y = y_scaler.transform(y_unscaled)

    train_outputs = []
    test_outputs = []

    nednet = make_ned(x.shape[1], 0.01)

    metrics_ned = [ned_loss]
    metrics_cut = [regression_loss]

    # for reproducing the ihdp result
    i = 0

    tf.random.set_random_seed(i)
    np.random.seed(i)

    # change the test_size to get in-sample and out-of-sample estimates

    test_size = 0.
    train_index, test_index = train_test_split(np.arange(x.shape[0]),
                                               test_size=test_size)
    if test_size == 0:
        test_index = train_index

    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    t_train, t_test = t[train_index], t[test_index]
    yt_train = np.concatenate([y_train, t_train], 1)

    nednet.compile(optimizer=Adam(lr=1e-3), loss=ned_loss, metrics=metrics_ned)

    adam_callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='val_loss', patience=2, min_delta=0.),
        ReduceLROnPlateau(monitor='loss',
                          factor=0.5,
                          patience=5,
                          verbose=verbose,
                          mode='auto',
                          min_delta=1e-8,
                          cooldown=0,
                          min_lr=0)
    ]

    nednet.fit(x_train,
               yt_train,
               callbacks=adam_callbacks,
               validation_split=val_split,
               epochs=100,
               batch_size=batch_size,
               verbose=verbose)

    sgd_callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='val_loss', patience=40, min_delta=0.),
        ReduceLROnPlateau(monitor='loss',
                          factor=0.5,
                          patience=5,
                          verbose=verbose,
                          mode='auto',
                          min_delta=0.,
                          cooldown=0,
                          min_lr=0)
    ]

    sgd_lr = 1e-5
    momentum = 0.9
    nednet.compile(optimizer=SGD(lr=sgd_lr, momentum=momentum, nesterov=True),
                   loss=ned_loss,
                   metrics=metrics_ned)
    nednet.summary()  # summary() already prints; wrapping it in print() just adds "None"
    nednet.fit(x_train,
               yt_train,
               callbacks=sgd_callbacks,
               validation_split=val_split,
               epochs=300,
               batch_size=batch_size,
               verbose=verbose)

    t_hat_test = nednet.predict(x_test)[:, 1]
    t_hat_train = nednet.predict(x_train)[:, 1]

    # cutting the activation layer
    cut_net = post_cut(nednet, x.shape[1], 0.01)

    cut_net.compile(optimizer=Adam(lr=1e-3),
                    loss=dead_loss,
                    metrics=metrics_cut)

    adam_callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='val_loss', patience=2, min_delta=0.),
        ReduceLROnPlateau(monitor='loss',
                          factor=0.5,
                          patience=5,
                          verbose=verbose,
                          mode='auto',
                          min_delta=1e-8,
                          cooldown=0,
                          min_lr=0)
    ]

    cut_net.fit(x_train,
                yt_train,
                callbacks=adam_callbacks,
                validation_split=val_split,
                epochs=100,
                batch_size=batch_size,
                verbose=verbose)

    sgd_callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='val_loss', patience=40, min_delta=0.),
        ReduceLROnPlateau(monitor='loss',
                          factor=0.5,
                          patience=5,
                          verbose=verbose,
                          mode='auto',
                          min_delta=0.,
                          cooldown=0,
                          min_lr=0)
    ]

    sgd_lr = 1e-5
    momentum = 0.9
    cut_net.compile(optimizer=SGD(lr=sgd_lr, momentum=momentum, nesterov=True),
                    loss=dead_loss,
                    metrics=metrics_cut)

    cut_net.fit(x_train,
                yt_train,
                callbacks=sgd_callbacks,
                validation_split=val_split,
                epochs=300,
                batch_size=batch_size,
                verbose=verbose)

    y_hat_test = cut_net.predict(x_test)
    y_hat_train = cut_net.predict(x_train)

    yt_hat_test = np.concatenate([y_hat_test, t_hat_test.reshape(-1, 1)], 1)
    yt_hat_train = np.concatenate([y_hat_train, t_hat_train.reshape(-1, 1)], 1)

    test_outputs += [
        _split_output(yt_hat_test, t_test, y_test, y_scaler, x_test,
                      test_index)
    ]
    train_outputs += [
        _split_output(yt_hat_train, t_train, y_train, y_scaler, x_train,
                      train_index)
    ]
    K.clear_session()

    return test_outputs, train_outputs
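# The Adam-then-SGD training pattern above is repeated for nednet and cut_net.
# Below is a hedged sketch of a helper that factors it out; train_two_stage is a
# hypothetical name, and it assumes the Keras imports already used above
# (Adam, SGD, TerminateOnNaN, EarlyStopping, ReduceLROnPlateau) are in scope.
def train_two_stage(model, x_train, yt_train, loss, metrics,
                    val_split, batch_size, verbose=0):
    """Compile/fit with Adam, then fine-tune with SGD, mirroring the loops above."""
    adam_callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='val_loss', patience=2, min_delta=0.),
        ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5,
                          verbose=verbose, mode='auto', min_delta=1e-8),
    ]
    model.compile(optimizer=Adam(lr=1e-3), loss=loss, metrics=metrics)
    model.fit(x_train, yt_train, callbacks=adam_callbacks,
              validation_split=val_split, epochs=100,
              batch_size=batch_size, verbose=verbose)

    sgd_callbacks = [
        TerminateOnNaN(),
        EarlyStopping(monitor='val_loss', patience=40, min_delta=0.),
        ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5,
                          verbose=verbose, mode='auto', min_delta=0.),
    ]
    model.compile(optimizer=SGD(lr=1e-5, momentum=0.9, nesterov=True),
                  loss=loss, metrics=metrics)
    model.fit(x_train, yt_train, callbacks=sgd_callbacks,
              validation_split=val_split, epochs=300,
              batch_size=batch_size, verbose=verbose)
    return model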
# ### Regression
# ##### I think this problem may generalize best with a random forest model. I will start with a regressor, then use a classifier; a quick evaluation sketch follows the snippet below.

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

X = df[['temperature', 'humidity', 'IsHoliday', 'WeekDay', 'Season']]
y = df['P1']
today = [12, 47, 0, 1, 1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

sc_regr = StandardScaler()
X_train = sc_regr.fit_transform(X_train)
X_test = sc_regr.transform(X_test)

today = sc_regr.transform([today])

regr = RandomForestRegressor()
regr_svm = SVR()

regr.fit(X_train, y_train)
regr_svm.fit(X_train, y_train)

regr.predict(today)

regr_svm.predict(today)
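# A hedged evaluation sketch for the two regressors above; r2_score and
# mean_absolute_error are additions here and are not imported in the original snippet.
from sklearn.metrics import r2_score, mean_absolute_error

for name, model in [('random forest', regr), ('SVR', regr_svm)]:
    y_pred = model.predict(X_test)
    print(name, 'R2:', r2_score(y_test, y_pred),
          'MAE:', mean_absolute_error(y_test, y_pred))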
            print(f)
            x = getDummy(x, f)
    test = x.iloc[260753:, :]
    train = x.iloc[:260753, :]

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train).astype(np.int32)
y_train = np_utils.to_categorical(y_train)

print ("processsing finished")
train = np.array(train)
train = train.astype(np.float32)
test = np.array(test)
test = test.astype(np.float32)
if need_normalise:
    scaler = StandardScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)

# folds
xfolds = pd.read_csv(projPath + 'input/xfolds.csv')
# work with 5-fold split
fold_index = xfolds.fold5
fold_index = np.array(fold_index) - 1
n_folds = len(np.unique(fold_index))

nb_classes = 2
print(nb_classes, 'classes')

dims = train.shape[1]
print(dims, 'dims')
Example #45
)

vals = {}
for i, name in enumerate(names):
    vals[name] = X[:, i]
vals[dataset.default_target_attribute] = y
df = pd.DataFrame(vals)

X = df.drop(task_target, axis=1)
y = df.loc[:, task_target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#:# preprocessing

transform_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

#:# model

params = {'C': 0.8, 'solver': 'liblinear'}

classifier = LogisticRegression(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# 8b4aedd7d78f7193c71d75501c5c1bc6
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')
Example #46
    xx.append([])
    for j in [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]:
        xx[i].append(data[i][j])
x = np.array(xx)
y = np.array(yy)
# print(len(x),len(y))

# 2. Split the data into training and test sets
# Randomly sample 25% for testing and 75% for training
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=33)

# 3. Standardize the training and test data
ss_x = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)

ss_y = StandardScaler()
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

# 4. Train and predict with a regression tree
# Initialize the decision tree regressor
dtr = DecisionTreeRegressor()
# Train the model
dtr.fit(x_train, y_train)
# Predict and save the predictions
dtr_y_predict = dtr.predict(x_test)
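# 5. Evaluate the predictions (an added sketch, not part of the original snippet).
# Because y was standardized with ss_y, map predictions back to the original scale
# with inverse_transform before computing the metrics.
from sklearn.metrics import r2_score, mean_squared_error

y_test_orig = ss_y.inverse_transform(y_test)
y_pred_orig = ss_y.inverse_transform(dtr_y_predict.reshape(-1, 1))
print('R2:', r2_score(y_test_orig, y_pred_orig))
print('MSE:', mean_squared_error(y_test_orig, y_pred_orig))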
np.random.seed(seed=0)
random_state = 0

#######
# DATA:
#######

# a) For the application with 'real' data, we use the Wine dataset:
# In this case, after generating the data below, run the code only from line 54 through 227, to avoid the replications of the Monte Carlo simulation.

wine = datasets.load_wine()
X = wine.data
y = wine.target
data = DataFrame(X)
# Standardize the variables so they are all on the same scale:
scaler = StandardScaler()
data = DataFrame(scaler.fit_transform(data), index=data.index, columns=data.columns)
# Visualize only the first four variables, to avoid cluttering the plot:
plot = sns.pairplot(data.iloc[:,0:4])
cols = data.columns

# or:

# b) With simulated data:

# number of replications:
reps = 1000
acertou = []

for rep in tqdm(range(reps)):
Example #48
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import *

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

#data = pd.read_csv("../../data/data_all_float.csv", header=0, index_col=None, sep=';')

# drop 'CTE, linear' for more instances to study
drop_feature = ['Density', 'CTE, linear']
for ifeature in drop_feature:
    data = data.drop(labels=ifeature, axis=1)

stdscale = StandardScaler()

elements = [
    'Iron, Fe', 'Carbon, C', 'Sulfur, S', 'Silicon, Si', 'Phosphorous, P',
    'Manganese, Mn', 'Chromium, Cr', 'Nickel, Ni', 'Molybdenum, Mo',
    'Copper, Cu'
]

target = 'Thermal Conductivity'
# drop instances with NaN
drop_instance = []
for idx in data.index:
    if math.isnan(data.loc[idx, target]):
        drop_instance.append(idx)
data_loc = data.drop(drop_instance)
X = data_loc[elements].values
def pls_variable_selection(x, y, num_pls_components):
    """

    Adopted from https://nirpyresearch.com/variable-selection-method-pls-python/
    :param x:
    :param y:
    :param max_components:
    :param scorer:
    :return:
    """
    # initialize new model parameter holder
    scores = dict()
    scores['r2'] = ModelFit()
    scores['mae'] = ModelFit()
    cut_conditions = []
    num_variables = []
    # make a score table to fill in
    # scores = np.zeros( x.shape[1] )
    # print('==========')
    pls = PLSRegression(num_pls_components)
    usable_columns = None
    best_score = 0
    x_scaled_np = StandardScaler().fit_transform(x)
    x_scaled = pd.DataFrame(x_scaled_np, columns=x.columns)
    # print(x_scaled)
    while x_scaled.shape[1] >= num_pls_components:
        print('shape: ', x_scaled.shape, num_pls_components, best_score)
        number_to_cut = int(x_scaled.shape[1] / 100)
        if number_to_cut == 0:
            number_to_cut = 1
        # print('number to cut: ', number_to_cut, num_pls_components)
        pls.fit(x_scaled, y)
        # y_predict = pls.predict(x_scaled)
        # score = r2_score(y, y_predict)
        cv_splitter = 3  # passed to cross_validate, this defaults to a 3-fold KFold
        group_splitter = None
        if x_scaled.shape[1] <= 200:
            # cv_splitter = ShuffleSplit(n_splits=100, test_size=0.35)
            cv_splitter = GroupShuffleSplit(n_splits=100, test_size=0.35)
            group_splitter = data_full['Leaf number']
        elif x_scaled.shape[1] <= 400:
            # cv_splitter = ShuffleSplit(n_splits=30, test_size=0.35)
            cv_splitter = GroupShuffleSplit(n_splits=30, test_size=0.35)
            group_splitter = data_full['Leaf number']
        local_scores = cross_validate(
            pls,
            x_scaled,
            y,
            cv=cv_splitter,
            return_train_score=True,
            groups=group_splitter,
            scoring=['r2', 'neg_mean_absolute_error'])

        scores['r2'].train_score.append(local_scores['train_r2'].mean())
        scores['r2'].train_stdev.append(local_scores['train_r2'].std())
        scores['r2'].test_score.append(local_scores['test_r2'].mean())
        scores['r2'].test_stdev.append(local_scores['test_r2'].std())

        scores['mae'].train_score.append(
            local_scores['train_neg_mean_absolute_error'].mean())
        scores['mae'].train_stdev.append(
            local_scores['train_neg_mean_absolute_error'].std())
        scores['mae'].test_score.append(
            local_scores['test_neg_mean_absolute_error'].mean())
        scores['mae'].test_stdev.append(
            local_scores['test_neg_mean_absolute_error'].std())

        num_variables.append(x_scaled.shape[1])
        if scores['r2'].test_score[-1] > best_score:
            best_score = scores['r2'].test_score[-1]
            usable_columns = x_scaled.columns

        # print(pls.coef_[:, 0])
        # print(pls.coef_.shape)
        sorted_coeff = np.argsort(np.abs(pls.coef_[:, 0]))
        # print('1')
        # print(sorted_coeff)
        # print( pls.coef_[:, 0][sorted_coeff] )
        # print('2')
        # print(sorted_coeff[-5:])
        # print(sorted_coeff[-1])
        # print(x_scaled)
        # print(pls.coef_[:, 0][sorted_coeff[-1]], pls.coef_[:, 0][sorted_coeff[0]])
        # print(sorted_coeff[-1], x_scaled.columns[sorted_coeff[0]])
        # print(scores['r2'].train_score[-1], scores['r2'].test_score[-1],
        #       scores['mae'].train_score[-1], scores['mae'].test_score[-1])
        # column_to_drop = x_scaled.columns[sorted_coeff[0]]
        columns_to_drop = x_scaled.columns[sorted_coeff[:number_to_cut]]
        # print(columns_to_drop.values)
        if x_scaled.shape[1] < 50:
            # print('dropping: ', columns_to_drop)
            # print(columns_to_drop.values)
            cut_conditions.append(columns_to_drop.values)

        x_scaled.drop(columns=columns_to_drop, inplace=True)

    # print(usable_columns)
    # print('===========')
    # print(x_scaled.columns)
    # print(cut_conditions)
    # data = dict()
    # data['test means'] = test_scores_average
    # data['test std'] = test_scores_std
    # data['train means'] = train_scores_average
    # data['train std'] = train_scores_std
    # data['columns'] = usable_columns
    # data['num variables'] = num_variables
    scores['columns'] = usable_columns
    scores['num variables'] = num_variables
    # print('========')
    # print(data.keys())
    # filename = "param_selector_{0}.pickle".format(num_pls_components)
    # with open(filename, 'wb') as f:
    #     pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
    return scores
Example #50
# Get dataset and features
#==============================#

aalist = list('ACDEFGHIKLMNPQRSTVWY')


def getAAC(seq):
    aac = np.array([seq.count(x) for x in aalist]) / len(seq)
    return aac
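# Quick sanity check of getAAC (an added example): every position contributes
# 1/len(seq) to the frequency of its residue.
example_aac = getAAC('ACCA')
assert abs(example_aac[aalist.index('A')] - 0.5) < 1e-9
assert abs(example_aac[aalist.index('C')] - 0.5) < 1e-9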


data = pd.read_excel('sequence_ogt_topt.xlsx', index_col=0)
aac = np.array([getAAC(seq) for seq in data['sequence']])
ogt = data['ogt'].values.reshape((data.shape[0], 1))
X = np.append(aac, ogt, axis=1)
sc = StandardScaler()
X = sc.fit_transform(X)
y = data['topt'].values

# Strategies and hyperparameters
#======================================#

# Hyperparameter range
cl_vals = [25.0, 30.0, None]
ch_vals = [72.2, 60.0]
ks = [5, 10, 15]
deltas = [0.1, 0.5, 1.0]
overs = [0.5, 0.75]
unders = [0.5, 0.75]
sizes = [300, 600]
sample_methods = ['balance', 'extreme', 'average']
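# How these ranges are combined is not shown in this excerpt. One hypothetical way
# to enumerate a hyperparameter grid like this is itertools.product (illustrative
# only; the actual training loop is not part of this snippet):
import itertools

for k, delta, size in itertools.product(ks, deltas, sizes):
    params = {'k': k, 'delta': delta, 'size': size}
    # ...train and score a model with `params` here in the full script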
def featureScal(X):
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X)
    return X_train
Example #52
xgb_preds = np.expm1(model_xgb.predict(X_test))
lasso_preds = np.expm1(model_lasso.predict(X_test))

predictions = pd.DataFrame({"xgb":xgb_preds, "lasso":lasso_preds})
predictions.plot(x = "xgb", y = "lasso", kind = "scatter")

preds = 0.7*lasso_preds + 0.3*xgb_preds
solution = pd.DataFrame({"id":test.Id, "SalePrice":preds})
#solution.to_csv("lasso_xgb.csv", index = False)

from keras.layers import Dense
from keras.models import Sequential
from keras.regularizers import l1
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X_train = StandardScaler().fit_transform(X_train)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, random_state = 3)

print(X_tr.shape)
X_tr

model = Sequential()
#model.add(Dense(256, activation="relu", input_dim = X_train.shape[1]))
model.add(Dense(1, input_dim = X_train.shape[1], kernel_regularizer=l1(0.001)))  # W_regularizer was renamed kernel_regularizer in Keras 2
model.compile(loss = "mse", optimizer = "adam")

model.summary()
hist = model.fit(X_tr, y_tr, validation_data = (X_val, y_val))
pd.Series(model.predict(X_val)[:,0]).hist()
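# The History object returned by fit can also be inspected directly (an added check):
print('final training loss:', hist.history['loss'][-1])
print('final validation loss:', hist.history['val_loss'][-1])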
Example #53
from flask import Flask, render_template, request
import pickle
from sklearn.preprocessing import StandardScaler

app = Flask(__name__)

classifier = pickle.load(open('score.pkl', 'rb'))


@app.route('/', methods=['GET'])
def home():
    return render_template('index.html')


standard_to = StandardScaler()


@app.route("/predict", methods=['POST'])
def predict():
    if request.method == 'POST':
        grescore = int(request.form['grescore'])
        toeflscore = int(request.form['toeflscore'])
        rating = request.form['rating']
        SOP = float(request.form['sop'])
        LOR = float(request.form['lor'])
        CGPA = float(request.form['cgpa'])
        Research = request.form['Research']
        if Research == 'yes':
            Research = 1
        else:
            Research = 0
    # import data
    train = get_feature_data()
    df_orig = get_original_data()

    # Whether to vectorize original data or not
    vectorise_text = True
    if vectorise_text:
        vectorized_description = vectorize_text_feature(train, 'Description', 15)
        vectorized_title = vectorize_text_feature(train, 'Country', 5)
        train2 = train.drop(['InvoiceDate'], axis=1)
        train2 = pd.concat([train2, vectorized_description, vectorized_title], axis=1)
    else:
        train2 =  train.drop(['InvoiceDate'],axis=1)

    # standardize data
    scaler = StandardScaler()
    scaler = scaler.fit(train2)
    train_scaled = scaler.transform(train2)

    # use principal component analysis to reduce dimensions
    pca = PCA(n_components=20)
    pca = pca.fit(train_scaled)
    train_reduced = pca.transform(train_scaled)

    # calculate matrix of cosine similarity
    distances = cosine_similarity(train_reduced).T

    # find top 5 closest ads
    top_5_orig = []
    top_distances = []
    for i in range(len(train_reduced)):
def bayesian_regression_modeling(df,
                                 label_col,
                                 target_col,
                                 prior_distribution_list,
                                 draw_sample=1000,
                                 chains=2,
                                 scaling_opt=3):
    """

    :param df:
    :param label_col:
    :param target_col:
    :param model_option:
    :param draw_sample:
    :param chains:
    :param alpha_1:
    :param alpha_2:
    :param lambda_1:
    :param lambda_2:
    :return: MCMC mean trace array, MCMC visualization img source
    """
    # test: using scaling
    n_individuals = len(df)
    print("scale@@@@@",df)
    if scaling_opt == 1:
        df[df.columns] = StandardScaler().fit_transform(df[df.columns])
    elif scaling_opt == 2:
        df[df.columns] = MinMaxScaler().fit_transform(df[df.columns])
    elif scaling_opt == 3:
        df[df.columns] = df[df.columns]
    feature_list = df.columns
    feature_list = [i for i in feature_list if i != label_col]
    print(feature_list)
    print("Model datasett:BHJLK")
    print(df)

    # Using PyMC3
    # formula = str(target_col)+' ~ '+' + '.join(['%s' % variable for variable in label_col])

    # degree of freedom
    # nu = len(df[label_col].count(axis=1)) - len(df[label_col].count(axis=0))

    # TODO: add student-T distribution as priors for each feature
    # get degree of freedom
    if len(df[label_col].count(axis=1)) >= len(df[label_col].count(axis=0)):
        nu = len(df[label_col].count(axis=1)) - len(df[label_col].count(axis=0))
    else:
        nu = 0

    print("Degree of Freedom:")
    print(nu)

    # with pm.Model() as normal_model:
    #     start = time.time()
    #
    #     # intercept = pm.StudentT('Intercept', nu=nu, mu=0, sigma=1)
    #     # sigma = pm.HalfCauchy("sigma", beta=10, testval=1.)
    #     # pass in a prior mean & coefficient list
    #
    #
    #     family = pm.glm.families.Normal()
    #     pm.GLM.from_formula(formula, data=df, family=family)
    #     trace = pm.sample(draws=draw_sample, chains=chains, tune=500, random_seed=23)
    #     end = time.time()
    #     print("Time elasped: {} seconds.".format(end-start))
    #     print("DONE normal_trace")
    with pm.Model() as model:
        fea_list = [variable for variable in label_col]
        print(fea_list)
        pm_list = []
        i = 0
        for prior_dist in prior_distribution_list:
            #         for j in range(len(fea_list)):
            
            for type,prior in prior_dist.items():
                if type != "Target" and prior == "Normal":

                    #             print(fea_list[j])
                    pm_list.append(pm.Normal(str(fea_list[i])))
                    i += 1
                elif type != "Target" and prior == "Student T":   
                
                    pm_list.append(pm.StudentT(str(fea_list[i]), nu=nu))
                    i += 1
                elif type != "Target" and prior == "Skew Normal": 
                    pm_list.append(pm.SkewNormal(str(fea_list[i])))
                    i += 1
            # for i, item in enumerate(prior_distribution_list):Skew Normal
            #     setattr(sys.modules[__name__], 'beta{0}'.format(i), item)
            # hyper_sigma = pm.HalfNormal('hyper_sigma', sd=3)
        sigma = pm.HalfCauchy('sigma', beta=10, testval=1.)
        intercept = pm.Normal('Intercept', 0, sigma=20)

        # setting the distribution mean for the predictor
        mu = intercept
        print("MUUU")
        print(mu)
        for i in range(len(pm_list)):
            print("PM LIST i")
            print(pm_list[i])
            print("DF[FEA_LIST]")
            print(df[fea_list[i]])
            mu += pm_list[i] * df[fea_list[i]].to_numpy()
            #mu += pm_list[i] * np.ones(df[fea_list[i]].to_list())
            print(df[fea_list[i]].to_numpy())
        for prior_dist in prior_distribution_list:
            for type, prior in prior_dist.items():
                if type == "Target" and prior == "Normal":
                    print("Target prior: Normal")
                    likelihood = pm.Normal(str(target_col), mu=mu, sigma=sigma, observed=df[target_col])
                elif type == "Target" and prior == "Student T":
                    print("Target prior: Student T")
                    likelihood = pm.StudentT(str(target_col), nu=nu, mu=mu, sd=sigma, shape=n_individuals)
                elif type == "Target" and prior == "Skew Normal":
                    print("Target prior: Skew Normal")
                    mu = pm.Uniform('lambda_bl', 0., draw_sample)
                    likelihood = pm.SkewNormal(str(target_col), mu=mu, sigma=sigma, tau=None, alpha=1, sd=3)
                elif type == "Target" and (prior is None or prior != prior):  # missing/NaN prior: fall back to Normal
                    print("Target prior missing; defaulting to Normal")
                    likelihood = pm.Normal(str(target_col), mu=mu, sigma=sigma, observed=df[target_col])
        #means = pm.StudentT('means', nu = nu, mu = hyper_mean, sd = hyper_sigma, shape = n_individuals)
        #SkewNormal(mu=0.0, sigma=None, tau=None, alpha=1, sd=None, *args, **kwargs)
        trace = pm.sample(draws=draw_sample, chains=chains, random_seed=23,progressbar=True)
        img_source = save_mat_fig(trace, gtype='traceplot')
        posterior_dist = save_mat_fig(trace, gtype='posterior')

        # return np.array([np.mean(trace[variable]) for variable in trace.varnames]), img_source, posterior_dist
        return trace, img_source, posterior_dist
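# A hedged usage sketch inferred from the function body above; the DataFrame,
# column names, and priors below are illustrative assumptions, not original code.
import numpy as np
import pandas as pd

demo_df = pd.DataFrame({'x1': np.random.randn(50), 'x2': np.random.randn(50)})
demo_df['y'] = 2 * demo_df['x1'] - demo_df['x2'] + 0.1 * np.random.randn(50)
priors_example = [{'x1': 'Normal'}, {'x2': 'Student T'}, {'Target': 'Normal'}]
# Commented out because MCMC sampling is slow:
# trace, trace_img, posterior_img = bayesian_regression_modeling(
#     demo_df, label_col=['x1', 'x2'], target_col='y',
#     prior_distribution_list=priors_example, draw_sample=500, chains=2, scaling_opt=1)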
def preprocess_data(X, scaler=None):
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler
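# Typical usage of preprocess_data (an added example): fit the scaler once on the
# training split, then reuse it for the test split. The arrays are illustrative.
import numpy as np

X_tr_demo = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
X_te_demo = np.array([[2.0, 25.0]])
X_tr_scaled, fitted_scaler = preprocess_data(X_tr_demo)             # fits a new scaler
X_te_scaled, _ = preprocess_data(X_te_demo, scaler=fitted_scaler)   # reuses it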
Example #57
# Format the features and labels for use with scikit learn
feature_list = []
label_list = []

for item in training_set:
    if np.isnan(item[0]).sum() < 1:
        feature_list.append(item[0])
        label_list.append(item[1])

print('Features in Training Set: {}'.format(len(training_set)))
print('Invalid Features in Training set: {}'.format(
    len(training_set) - len(feature_list)))

X = np.array(feature_list)
# Fit a per-column scaler
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
X_train = X_scaler.transform(X)
y_train = np.array(label_list)

# Convert label strings to numerical encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

# Create classifier
clf = svm.SVC(kernel='linear')

# Set up 5-fold cross-validation
kf = cross_validation.KFold(len(X_train),
                            n_folds=5,
                            shuffle=True,
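# Note: the KFold call above is cut off, and sklearn.cross_validation was removed
# in scikit-learn 0.20. A hedged sketch of the equivalent with the modern
# model_selection API (random_state=1 is an arbitrary choice added here):
from sklearn.model_selection import KFold, cross_val_score

kf_modern = KFold(n_splits=5, shuffle=True, random_state=1)
cv_scores = cross_val_score(clf, X_train, y_train, cv=kf_modern)
print('5-fold accuracy: {:.3f} +/- {:.3f}'.format(cv_scores.mean(), cv_scores.std()))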
import matplotlib.pyplot as plt
import matplotlib.cm

plt.style.use('seaborn-deep')
cmap = matplotlib.cm.get_cmap('plasma')

# Reading in data
ds = pd.read_csv("Social_Network_Ads.csv")
X = ds.iloc[:, 2:4].values
y = ds.iloc[:, 4].values

# Splitting and scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y)

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)  # transform only: reuse the scaling fitted on the training set

# Classifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

clf = SVC(kernel='rbf', C=20)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = (cm[0][0] + cm[1][1]) / sum(sum(cm))
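# Cross-check of the manual accuracy computation above (an added line):
from sklearn.metrics import accuracy_score
assert abs(accuracy - accuracy_score(y_test, y_pred)) < 1e-12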

# Cross validation
Example #59
    def __init__(self, columns=None, **kwargs):
        self.columns        = columns
        self.model          = StandardScaler(**kwargs)
        self.transform_cols = None
Example #60
import pandas as pd
import plotly.figure_factory as ff

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.cluster import AgglomerativeClustering

import hvplot.pandas

# %%
df = pd.read_csv('new_iris_data.csv')

# %%
# Standardize the features first
iris_scale = StandardScaler().fit_transform(df)

# apply PCA to reduce to 2 principal components
pca_model = PCA(n_components=2, random_state=0)
iris_pca = pca_model.fit_transform(iris_scale)
print(pca_model.explained_variance_ratio_)
# %%
iris_pca_df = pd.DataFrame(data=iris_pca, columns=['PC_1', 'PC_2'])
iris_pca_df.head(-5)

# %%
# creating a dendrogram using plotly.figure_factory
fig = ff.create_dendrogram(iris_pca_df, color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()
# the higher a horizontal link appears in the dendrogram, the less similar the clusters it joins
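# %%
# AgglomerativeClustering and hvplot.pandas are imported above but unused in this
# excerpt; a hedged sketch of how they might be applied (three clusters is an
# assumption suggested by the dendrogram, not stated in the original code):
agg_model = AgglomerativeClustering(n_clusters=3)
iris_pca_df['class'] = agg_model.fit_predict(iris_pca_df[['PC_1', 'PC_2']])
iris_pca_df.hvplot.scatter(x='PC_1', y='PC_2', by='class')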