Example #1
def process(discrete, cont):
  # Create discrete and continuous data matrices
  discrete_X = np.array(discrete)
  cont_X = np.array(cont)

  # Impute discrete values
  imp = Imputer(strategy='most_frequent')
  discrete_X = imp.fit_transform(discrete_X)

  # Impute continuous values
  imp_c = Imputer(strategy='mean')
  cont_X = imp_c.fit_transform(cont_X)

  # Discrete basis representation
  enc = OneHotEncoder()
  enc.fit(discrete_X)
  discrete_X = enc.transform(discrete_X).toarray()

  # Continuous scaling
  scaler = StandardScaler()
  scaler.fit(cont_X)
  cont_X = scaler.transform(cont_X)

  # Merge to one array
  X = np.concatenate((discrete_X, cont_X), axis=1)
  return X
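Note: sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22 in favour of sklearn.impute.SimpleImputer, which always imputes column-wise and has no axis parameter. The following is a rough, non-authoritative sketch of how Example #1 might look on a current scikit-learn release; the helper name process_modern is hypothetical, and 2-D numeric-coded inputs with NaN marking missing entries are assumed.

# Hypothetical modern counterpart of process(); assumes discrete and cont are
# 2-D, numeric-coded arrays that use NaN to mark missing entries.
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def process_modern(discrete, cont):
    discrete_X = np.array(discrete, dtype=float)
    cont_X = np.array(cont, dtype=float)

    # Impute discrete values with the most frequent value per column
    discrete_X = SimpleImputer(strategy='most_frequent').fit_transform(discrete_X)

    # Impute continuous values with the per-column mean
    cont_X = SimpleImputer(strategy='mean').fit_transform(cont_X)

    # Discrete basis representation (dense one-hot matrix)
    discrete_X = OneHotEncoder().fit_transform(discrete_X).toarray()

    # Continuous scaling to zero mean and unit variance
    cont_X = StandardScaler().fit_transform(cont_X)

    # Merge to one array
    return np.concatenate((discrete_X, cont_X), axis=1)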
Example #2
def preprocess(data, feat_type):
    # replace missing value by most common if categorical and by mean if numerical
    try:
        if data.getformat()=='csr':
            return data
    except:
        print feat_type
        # separate numerical and categorical columns
        idx_num = [i for i in xrange(len(feat_type)) if feat_type[i] == 'Numerical']
        data_num = data[:,idx_num]
        idx_cat = [i for i in xrange(len(feat_type)) if feat_type[i] == 'Categorical']
        data_cat = data[:,idx_cat]
        # fill missing values
        imp_num = Imputer(axis = 0)
        data_num = imp_num.fit_transform(data_num)
        imp_cat = Imputer(axis = 0, strategy='most_frequent')
        data_cat = imp_cat.fit_transform(data_cat)
        # subtract the mean and divide by the standard deviation
        data_num = scale(data_num)
        # one-hot encode using pandas
        # have to do it column by column because of pandas
        data_cat_pd = pd.DataFrame(data_cat)
        for i in xrange(data_cat.shape[1]):
            data_cat_pd = pd.concat((data_cat_pd, pd.get_dummies(data_cat[:,i])),join = 'outer', axis = 1)
        # delete the columns that have been one hot encoded; need to rename first,
        # otherwise some columns may be suppressed unwillingly
        data_cat_pd.columns = [i for i in xrange(data_cat_pd.shape[1])]
        data_cat_pd = data_cat_pd.drop(data_cat_pd.iloc[:,[i for i in xrange(data_cat.shape[1])]],axis =1)
        data_cat = np.asarray(data_cat_pd)

        # regroup categorical and numerical variables
        return np.hstack((data_num,data_cat))
Example #3
    def predict(self, raw_array, results, aux_data_a_d=None, diff=False,
                holdout_col=0, lag=1, positive_control=False, **kwargs):
        """ Given the input results model, predicts the year of data immediately succeeding the last year of the input array. Axis 0 indexes observations (schools) and axis 1 indexes years. For holdout_col>0, the last holdout_col years of data will be withheld from the prediction, which is ideal for finding the error of the algorithm. """

        if positive_control:
            if holdout_col > 0:
                if diff:
                    if holdout_col == 1:
                        control_array = np.diff(raw_array[:, -2:],
                                    1, axis=1)
                    else:
                        control_array = \
                            np.diff(raw_array[:, -holdout_col-1:-holdout_col+1],
                                    1, axis=1)
                else:
                    control_array = raw_array[:, -holdout_col]
            else:
                control_array = np.random.randn(raw_array.shape[0], 1)

        if holdout_col > 0:
            raw_array = raw_array[:, :-holdout_col]
        prediction_raw_array = raw_array
        if diff:
            array = np.diff(raw_array, 1, axis=1)
            X = array[:, -lag:]
            if positive_control:
                X = np.concatenate((X, control_array.reshape(-1, 1)), axis=1)
            if aux_data_a_d:
                for feature_s in aux_data_a_d.iterkeys():
                    if holdout_col > 0:
                        raw_array = aux_data_a_d[feature_s][:, :-holdout_col]
                    else:
                        raw_array = aux_data_a_d[feature_s]
                    array = np.diff(raw_array, 1, axis=1)
                    X = np.concatenate((X, array[:, -lag:]), axis=1)
            estimatorX = Imputer(axis=0)
            X = estimatorX.fit_transform(X)
            predicted_change_a = results.predict(X)
            estimator_orig = Imputer(axis=0)
            orig_a = estimator_orig.fit_transform(prediction_raw_array[:, -1].reshape(-1,1))
            prediction_a = orig_a + predicted_change_a.reshape(-1, 1)
        else:
            array = raw_array
            X = array[:, -lag:]
            if positive_control:
                X = np.concatenate((X, control_array.reshape(-1, 1)), axis=1)
            if aux_data_a_d:
                for feature_s in aux_data_a_d.iterkeys():
                    if holdout_col > 0:
                        raw_array = aux_data_a_d[feature_s][:, :-holdout_col]
                    else:
                        raw_array = aux_data_a_d[feature_s]
                    array = raw_array
                    X = np.concatenate((X, array[:, -lag:]), axis=1)
            estimatorX = Imputer(axis=0)
            X = estimatorX.fit_transform(X)
            prediction_a = results.predict(X)

        return prediction_a.reshape((-1, 1))
Example #4
def fill_missing_values(_df, dis_features, cont_features):
    # for discrete features we will use 'most_frequent' strategy
    imp_discrete = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    _df[dis_features] = imp_discrete.fit_transform(_df[dis_features].values)

    # for continuous features we will use 'mean' strategy
    imp_continuous = Imputer(missing_values='NaN', strategy='mean', axis=0)
    _df[cont_features] = imp_continuous.fit_transform(_df[cont_features].values)
    return _df
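A minimal usage sketch for fill_missing_values, using a made-up dataframe and hypothetical column lists (the real feature names come from the caller's dataset):

import numpy as np
import pandas as pd

# toy data: 'color_code' stands in for a discrete feature, 'height' for a continuous one
toy = pd.DataFrame({'color_code': [1.0, 2.0, np.nan, 2.0],
                    'height': [1.7, np.nan, 1.6, 1.8]})
filled = fill_missing_values(toy, dis_features=['color_code'], cont_features=['height'])
print(filled)  # NaN in 'color_code' becomes the mode (2.0), NaN in 'height' the mean (1.7)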
Example #5
def main():

    weather, train, spray, test = load_data()
    target = train.WnvPresent.values
    idcol = test.Id.values

    weather = wnvutils.clean_weather(weather)

    train = wnvutils.clean_train_test(train)
    test = wnvutils.clean_train_test(test)

    train, test = wnvutils.clean_train_test2(train, test)

    train = train.merge(weather, on="Date")
    test = test.merge(weather, on="Date")

    train.drop("Date", axis=1, inplace=True)
    test.drop("Date", axis=1, inplace=True)

    desc_df(train)

    train = train.ix[:, pd.notnull(train).any(axis=0)]
    test = test.ix[:, pd.notnull(test).any(axis=0)]

    def min_dist_to_spray_(x):
        return wnvutils.min_dist_to_spray(x.Latitude, x.Longitude, spray)

    train["DistToSpray"] = train.apply(min_dist_to_spray_, axis=1)
    test["DistToSpray"] = test.apply(min_dist_to_spray_, axis=1)

    desc_df(train)

    imputer = Imputer()
    traina = imputer.fit_transform(train)
    testa = imputer.fit_transform(test)

    training = np.random.choice([True, False], size=train.shape[0], p=[0.8, 0.2])

    rfc = ensemble.RandomForestClassifier() # oob_score=True)
    rfc.fit(traina[training], target[training])
    # print("oob score:", rfc.oob_score_)

    #
    with open("output/feature_imp.txt", "w") as fout:
        for name, imp in sorted(zip(train.columns, rfc.feature_importances_),
                                key=lambda x: x[1], reverse=True):
            print(name, ":", imp)
            print(name, ":", imp, file=fout)

    predictions = rfc.predict(traina[~training])
    print("Accuracy:", (predictions == target[~training]).mean())

    predictions = rfc.predict_proba(traina[~training])
    np.savetxt("/tmp/predictions.txt", predictions[:, 1])

    print(predictions[:,1])
    print("ROC AUC Score:", roc_auc_score(target[~training], predictions[:,1]))
Example #6
def test_model(data, stat_as_index, make_vector, model, do_pca=False, target='score'):
    # compile the per-year stats and scores
    print('Compiling stats...')
    fv, sc = [], []
    for year in ['2014', '2015', '2016']:
        f,s = build_fvs(
            data, year, stat_as_index, make_vector, target)
        fv.append(f)
        sc.append(s)

    # Compile into single vectors: Predict 2016 from 2014 and 2015
    fv_train, fv_test = np.vstack(fv[0:2]), fv[2]
    sc_train, sc_test = np.concatenate(sc[0:2]), sc[2]

    # Impute NaNs
    train_nan = np.isnan(fv_train)
    test_nan = np.isnan(fv_test)
    
    for i in range(fv_train.shape[1]):
        if np.isnan(fv_train[0,i]):
            fv_train[0,i] = 0
    for i in range(fv_test.shape[1]):
        if np.isnan(fv_test[0,i]):
            fv_test[0,i] = 0
            
    print('Imputing...')
    if train_nan.any():
        i1 = Imputer()
        fv_train = i1.fit_transform(fv_train)
        #print(i1.statistics_)
    if test_nan.any():
        i2 = Imputer()
        fv_test = i2.fit_transform(fv_test)
        #print(i2.statistics_)

    if do_pca:
        print('Performing PCA...')
        pca = PCA(whiten=True)
        fv_train = pca.fit_transform(fv_train)
        fv_test = pca.transform(fv_test)

    print('Building test/train sets...')
    # Exclude players with missing scores
    train_nan, test_nan = np.isnan(sc_train), np.isnan(sc_test)
    fv_train, sc_train = fv_train[~train_nan], sc_train[~train_nan]
    fv_test, sc_test = fv_test[~test_nan], sc_test[~test_nan]

    print('Building model...')
    # Build model
    mod = model
    mod.fit(fv_train, sc_train)
    
    print('Predicting output...')
    # kluge to allow for classifier and regressor evaluation
    try: pred = mod.predict_proba(fv_test)
    except: pred = mod.predict(fv_test)
    return pred, sc_test, mod
Example #7
def fillData(trainFeatures, testFeatures, missing_values=np.NaN, strategy='mean', axis=0, verbose=0, copy=True, all = True):
    imp = Imputer(missing_values, strategy, axis, verbose, copy) 
    if all:
        trainCount = len(trainFeatures)
        full = np.vstack((trainFeatures, testFeatures))
        full = imp.fit_transform(full)
        trainFeatures, testFeatures = np.array(full[:trainCount]), np.array(full[trainCount:])
        return trainFeatures, testFeatures
    else:
        return imp.fit_transform(trainFeatures), imp.fit_transform(testFeatures)
Example #8
def fill_missing_imputation(electionsData, most_frequent):

    most_frequent = electionsData.columns.intersection(most_frequent)

    im = Imputer(strategy="most_frequent")
    electionsData[most_frequent] = im.fit_transform(electionsData[most_frequent])

    #Fill all of the rest (numeric) using the median
    im = Imputer(strategy="median")
    electionsData[:] = im.fit_transform(electionsData[:])
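A minimal usage sketch; fill_missing_imputation mutates the dataframe in place, so nothing needs to be captured from the call (the column names here are made up):

import numpy as np
import pandas as pd

toy = pd.DataFrame({'Vote': [1.0, np.nan, 1.0, 2.0],
                    'Income': [50.0, np.nan, 70.0, 90.0]})
fill_missing_imputation(toy, most_frequent=['Vote'])
print(toy)  # 'Vote' filled with its mode, 'Income' with its median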
Example #9
def imputing_most_frequent(dataset):
    '''

    :param dataset: pandas DataFrame dataset. 
    :return: The same dataset where the missing values are replaced with the column's most common value
    '''

    imp = Imputer(missing_values='NaN', strategy='most_frequent', copy=False)
    imp.fit_transform(dataset)
    return dataset
Example #10
    def test_imputation_shape(self):
        """Verify the shapes of the imputed matrix for different strategies."""
        X = np.random.randn(10, 2)
        X[::2] = np.nan

        for strategy in ['mean', 'median', 'most_frequent']:
            imputer = Imputer(strategy=strategy)
            X_imputed = imputer.fit_transform(X)
            assert_equal(X_imputed.shape, (10, 2))
            X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
            assert_equal(X_imputed.shape, (10, 2))
Example #11
 def preprocess(self):
   # impute missing values
   true_ids = set([urlid for urlid, label in self.target.iteritems() if label])
   true_data = [v for k, v in self.data.iteritems() if k in true_ids]
   false_data = [v for k, v in self.data.iteritems() if k not in true_ids]
   self.target = [1 for x in xrange(len(true_data))] + [0 for x in xrange(len(false_data))]
   imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
   true_data = imp.fit_transform(true_data)
   false_data = imp.fit_transform(false_data)
   self.data = np.concatenate((true_data, false_data), axis=0)
   self.test_data = imp.fit_transform(self.test_data.values())
Example #12
 def median_impute(self):
     """
     impute
     """
     tr = HFile(self.trfile)
     te = HFile(self.tefile)
     self.attributes = tr.attributes
     self.class_index = tr.class_index
     imp = Imputer(missing_values=-1, strategy='median')
     self.tr = imp.fit_transform(tr.data)
     self.ta = tr.classes
     self.te = imp.fit_transform(te.data)
Example #13
def solve_missing_values(data):
    """
    Solve missing values
    Parameters
    ----------
    data: array in which to impute missing values
    """
    from sklearn.preprocessing import Imputer

    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    data = imp.fit_transform(data)
    return data
Example #14
def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print "scikit centers"
        print km_sci.cluster_centers_
Example #15
def get_features(frame):
    '''
    Transforms and scales the input data and returns a numpy array that
    is suitable for use with scikit-learn.

    Note that in unsupervised learning there are no labels.
    '''

    # Replace missing values with 0.0
    # or we can use scikit-learn to calculate missing values below
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    arr = np.array(frame, dtype=np.float)

    # Impute missing values from the mean of their entire column
    from sklearn.preprocessing import Imputer
    imputer = Imputer(strategy='mean')
    arr = imputer.fit_transform(arr)
    
    # Normalize the entire data set to mean=0.0 and variance=1.0
    from sklearn.preprocessing import scale
    arr = scale(arr)

    return arr
Example #16
def gettestdata(fil) :
	data = np.genfromtxt(fil,delimiter=',')
	imp = Imputer(missing_values='NaN', strategy='median', axis=0)
	X = imp.fit_transform(data[:,2:])
	X = scale(X).copy()
	#spr.eliminate_zeros()
	return np.array(X)
Example #17
def benignKmeans():
  # Connect to a pre-existing cluster
  # connect to localhost:54321


  #  Log.info("Importing benign.csv data...\n")
  benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
  #benign_h2o.summary()

  benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
  # Impute missing values with column mean
  imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
  benign_sci = imp.fit_transform(benign_sci)

  # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))

  from h2o.estimators.kmeans import H2OKMeansEstimator

  for i in range(1,7):
    benign_h2o_km = H2OKMeansEstimator(k=i)
    benign_h2o_km.train(x = range(benign_h2o.ncol), training_frame=benign_h2o)
    print "H2O centers"
    print benign_h2o_km.centers()

    benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
    benign_sci_km.fit(benign_sci)
    print "scikit centers"
    print benign_sci_km.cluster_centers_
Example #18
def run_whole_video(exp_folder, lims_ID):
    #initializes video pointer for video of interest based on lims ID
    file_string = get_file_string(exp_folder, lims_ID)
    video_pointer = cv2.VideoCapture(file_string)

    # import wheel data
    wheel = joblib.load('dxds2.pkl')
    first_non_nan = next(x for x in wheel if not isnan(x))
    first_index = np.where(wheel == first_non_nan)[0]
    k = first_index[0]
    imp = Imputer(missing_values='NaN', strategy='mean')
    wheel = imp.fit_transform(wheel)
    wheel = preprocessing.MinMaxScaler((-1, 1)).fit(wheel).transform(wheel)

    # self.video_pointer.set(1, 41000)
    ret, frame = video_pointer.read()

    # crops and converts frame into desired format
    frame = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)

    prvs = frame
    nex = frame

    # initialize vectors to keep track of data
    count = 0
    mod = 0
    opticals = []
    angles = []
    frames = []

    # length of movie
    limit = int(video_pointer.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT))


    # create hdf file
    hf = h5py.File('data_' + str(lims_ID) + '.h5', 'w')
    g = hf.create_group('feature space')
    vector = np.zeros((limit, 4321))
    table = g.create_dataset('features', data = vector, shape =(limit, 4321))


    while count <= limit:

        prvs = nex
        frames = process_input(prvs)

        ret, frame = video_pointer.read()
        nex = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)

        optical = optical_flow(prvs, nex)
        opticals = optical['mag']
        angles= optical['ang']
        vector_data = np.concatenate((np.reshape(wheel[k], (1)), frames, opticals, angles))

        table[count, :] = vector_data

        count += 1

        if count%1000 == 0:
            print (count)
Example #19
def learn():
	global classifier, INPUT
	print 1
	data = np.genfromtxt(INPUT, delimiter=' ', dtype='f8')
	np.random.shuffle(data)
	n = len(data)
	y = data[:,1]
	x = data[:][:,range(2,54)]
	# test_x = []
	# test_y = []
	train_x = []
	train_y = []
	print 2
	imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
	x = imp.fit_transform(x)
	print 3
	for i in range(0, n):
		if y[i] == 0:
			continue
		train_x.append(x[i])
		train_y.append(y[i])
		# if i%100==0:
		# 	test_x.append(x[i])
		# 	test_y.append(y[i])
		# else:
		# 	train_x.append(x[i])
		# 	train_y.append(y[i])
	print 4
	classifier.fit(train_x, train_y)
	print 5
Example #20
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df
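A minimal usage sketch for impute_and_scale on a made-up dataframe (the column names are illustrative only):

import numpy as np
import pandas as pd

toy = pd.DataFrame({'dose': [0.1, np.nan, 0.3],
                    'response': [1.0, 2.0, np.nan]})
scaled_std = impute_and_scale(toy)                   # default: standardize each column
scaled_01 = impute_and_scale(toy, scaling='minmax')  # rescale each column to [0, 1]
print(scaled_std)
print(scaled_01)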
Example #21
def preprocess(data):

    non_sparse_only = True
    use_all_category_only = False
    use_all_impute_mean_mode = False


    if non_sparse_only:
        nominal_samples = data.ix[:,['var4','dummy']] 
        onehot_samples = onehot.transform(nominal_samples,['var4','dummy'])
        onehot_samples = pd.DataFrame(onehot_samples.toarray())
        numbered_samples = data.ix[:,['var7','var8','var10','var11','var13','var15','var17']]
        numbered_samples[['var7','var8']] = numbered_samples[['var7','var8']].convert_objects(convert_numeric=True)
        #(var7 and 8 are ordinal, converting to floats which includes NaNs will allow mean imputing of missing values)
        other_samples = data.ix[:,'crimeVar1':'weatherVar236'] #all the continuous vars
        other_samples = other_samples.drop(['weatherVar115'], axis=1) #nothing in this feature
        samples = pd.concat([onehot_samples,numbered_samples,other_samples],axis=1) #combine w/ the cleaned up other vars
        imp_nan = Imputer(missing_values=np.nan, strategy='mean', axis=0)
        samples_imp = imp_nan.fit_transform(samples)
    
    if use_all_category_only:
        raise NotImplementedError("all-categorical preprocessing is not implemented yet")

    if use_all_impute_mean_mode:
        raise NotImplementedError("mean/mode imputation over all columns is not implemented yet")
    
    return samples_imp
Example #22
    def test_3_stage(self):
        from sklearn.preprocessing import Imputer

        infile_name = path_of_data('missing_vals.csv')

        p = Pipeline()

        csv_read_node = p.add(CSVRead(infile_name))
        csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
        impute_node = p.add(wrap_and_make_instance(Imputer))

        csv_read_node['output'] > impute_node['X_train']
        impute_node['X_new'] > csv_write_node['input']

        self.run_pipeline(p)

        ctrl_imputer = Imputer()
        ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                                  names=True)
        num_type = ctrl_X_sa[0][0].dtype
        ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa)
        ctrl_X_new_nd = ctrl_imputer.fit_transform(ctrl_X_nd)
        control = ctrl_X_new_nd

        result = self._tmp_files.csv_read('out.csv', True)

        self.assertTrue(np.allclose(result, control))
Example #23
    def fill_and_remove(self, s_strategy="zeros", l_features = False, 
        b_remove = True):
        '''
        fill all NaN values in the numerical data (using the chosen strategy)
        and then remove data points where all features are equal to zero
        l_features: a list of features to be processed. If not provided, all
        features will be used
        b_remove: boolean indicating whether to remove keys where all data is 0
        s_strategy: string with the strategy used to fill NaNs. Can be "mean",
        "median" or "zeros"
        '''
        df = self.getData()
        #pre-process data
        if not l_features:
            l_features = self.payments_features + self.stock_features 
            l_features+= self.email_features
        df.loc[:, l_features] = df.loc[:, l_features].astype(float)
        #filling Nan with the strategy selected
        if s_strategy == "zeros":
            df.loc[:, l_features] = df.loc[:, l_features].fillna(0)
        else:
            na_X = df.loc[:, l_features].values
            imp = Imputer(missing_values='NaN', strategy=s_strategy, axis=0)
            df.loc[:, l_features] = imp.fit_transform(na_X)

        #exclude datapoint where every number is equal to 0
        if b_remove:
            df = df.ix[((df.loc[:, l_features]!=0).sum(axis=1)!=0),:]
        #saving the new dataframe       
        self.setData(df)
        #correct scaled df
        if type(self.df_scaled)!=list:
            df2 = self.df_scaled
            df2 = df2.ix[((df.loc[:, l_features]!=0).sum(axis=1)!=0).index,:]
            self.df_scaled = df2             
Example #24
 def Train_And_Test(self):
     HOG_data=np.loadtxt('dataset.csv',delimiter=",")
     tmpdata=HOG_data[:,0:-2]
     target=HOG_data[:,-2]
     print(target)
     tmpdata[tmpdata==0]=np.nan
     imp=Imputer(missing_values='NaN',strategy='mean')
     data=imp.fit_transform(tmpdata)
     data_train,data_test,target_train,target_test=train_test_split(data,target,test_size=0.3)
     model=SVC(C=1.0,gamma=0.0,kernel='linear', class_weight='auto')
     model.fit(data_train,target_train)
     print(data_train)
     print(target_train)    
     opencv_data_train=np.float32(data_train)
     opencv_target_train=np.float32(target_train)     
     svm_params = dict( kernel_type = cv2.SVM_LINEAR,
                 svm_type = cv2.SVM_C_SVC,
                 C=2.67, gamma=5.383)
     svm = cv2.SVM()
     svm.train(opencv_data_train,opencv_target_train, params=svm_params)
     svm.save("hog_classifier.xml")  
     print(model)
     expected=target_test
     predicted=model.predict(data_test)
     target_names = ['Not Human', 'Human']
     
     print(metrics.classification_report(expected,predicted,target_names=target_names))
     print(metrics.confusion_matrix(expected,predicted))
     print(metrics.roc_curve(expected,predicted))
     pickle.dump(model, open( "svm.p", "wb" ) )
Example #25
def impute_missing_train(dataframe, missing_values='NaN', strategy='mean'):
    '''
    Given a dataframe, imputes missing values with a given strategy.
    Supported strategies: 'mean', 'median', 'most_frequent'.
    Returns the imputed dataframe and a dictionary mapping each column to its imputation value.
    '''
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values=missing_values, strategy=strategy, axis=0)
    imputed = imp.fit_transform(dataframe)
    df = pd.DataFrame(imputed)
    df.columns = list(dataframe.columns)
    
    imputers = {}
    if strategy == 'mean':
        for col in df.columns:
            mean = df[col].mean()
            imputers[col] = mean
    if strategy == 'median':
        for col in df.columns:
            median = df[col].median()
            imputers[col] = median
    if strategy == 'most_frequent':
        for col in df.columns:
            mode = df[col].mode()
            imputers[col] = mode
    return df, imputers
Example #26
def run_clfList(clfList, stringList="", normalize=False):
    """
    Run 100-fold 80/20 cross-validation on each classifier in clfList
    print the average AUC for each classifier
    :param clfList: list of classifiers to run
    :param stringList: names of the classifiers
    :param normalize: whether or not to normalize the data
    :return: the average AUC for each classifier in clfList
    """
    # data, labels = six_features(force=False)
    # data, labels = six_and_time_features(force=False)
    # data, labels = five_features(force=False)
    # data, labels = five_and_rts(force=False)
    data, labels = new_features()
    if normalize:
        data = normalize_data(data)

    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # Cross-validate all clfs 100 times
    means = kfoldcvList(data, labels, clfList, 100)
    if stringList == "":
        stringList = ["" for i in range(len(clfList))]

    # Print out the mean AUCs
    for i, mean in enumerate(means):
        print stringList[i]+": "+str(mean)

    for mean in means:
        sys.stdout.write(str(mean) + " & ")
    sys.stdout.write("\n")
    return means
Example #27
def plot_ROCList(clfList, data, labels, stringList=""):
    """
    Plot an ROC curve for each classifier in clfList, training on a single 80/20 split
    :param clfList:
    :param data:
    :param labels:
    :param stringList:
    :return:
    """
    if stringList == "":
        stringList = ["" for i in range(len(clfList))]
    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # Cross-validate on the data once using each model to get a ROC curve
    AUCs, fprs, tprs, threshs = cvList(data, labels, clfList)

    # Plote a ROC for each clf in clfList
    for i in range(len(clfList)):
        fpr = fprs[i]
        tpr = tprs[i]
        plt.plot(fpr, tpr)
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(stringList[i]+" ROC Curve, AUC = "+str(AUCs[i]))
        plt.savefig(stringList[i]+"_ROC.png")
        plt.close()
        print stringList[i] + ":" + str(AUCs[i])
Example #28
def run_importance(clf, data, labels, feature_labels=[""], string=""):
    """
    Fit a classifier using all the data and plot the feature importances
    :param clf: Classifier object that has feature_importances_ member
    :param feature_labels: names of the features
    :param string: classifier name
    :return: (void) plot Gini importance vs feature
    """
    num_features = data.shape[1]
    importances = [0]*num_features

    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # run the classifier 100 times and average the importance found after each fit
    for r in range(100):
        clf.fit(data, labels)
        importances = [importances[i]+clf.feature_importances_[i] for i in range(num_features)]
    importances = [importance/100 for importance in importances]

    # Filter out the features that have 0 importance (e.g. values are all 0)
    # non_zeros are the indices in feature_importances that are not 0
    non_zeros = [i for i in range(num_features) if not importances[i] == 0]
    importances = [importances[i] for i in non_zeros]
    feature_labels = [feature_labels[i] for i in non_zeros]

    # Plot the features
    bar_width = 0.7
    plt.bar(range(len(feature_labels)), importances, bar_width)
    plt.xticks([ind + +float(bar_width)/2 for ind in range(len(feature_labels))], feature_labels,rotation="vertical")
    plt.gcf().subplots_adjust(bottom=0.35)
    plt.xlabel("Feature")
    plt.ylabel("Gini Importance")
    plt.title("Gini Importance v. Features for "+string+" Classifier")
    plt.show()
Example #29
def avg_message_count_by_group(df_users, df_messages, df_user_features):
    
    columns = ["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10"]
 
    features = df_user_features[list(columns)].values

    # Impute missing values to retain all sample data
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X = imp.fit_transform(features)

    # Preprocess dataset and standardize features to have normally distributed data
    # MaxAbsScaler allows scaled features to lie between -1 and +1
    X = MaxAbsScaler().fit_transform(X)

    # Apply PCA decomposition and use first 3 components that explain 75% of variance
    reduced_data = decomposition.PCA(n_components=3).fit_transform(X)
    kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)
    
    # Predict which group each user belongs to
    cluster_labels = kmeans.fit_predict(reduced_data)    
    df_user_features['group.id'] = cluster_labels
    
    # Call utility function to join the two dataframes
    df_joined_users_messages = get_merged_dataframes(df_users, df_messages)
    df_joined_users_messages_features = get_merged_dataframes(df_user_features, df_joined_users_messages)
      
    # Only keep messages that were received since signing up
    df_joined_users_messages_features = df_joined_users_messages_features[df_joined_users_messages_features['message.date'] 
                                                                          >= df_joined_users_messages_features['signup.date']]
        
    # Get the average message count grouped by group.id
    avg_message_count = df_joined_users_messages_features.groupby('group.id')['message.count'].mean()
    
    # Return the average message count grouped by user groups and rounded to 2 decimals
    return np.round(avg_message_count.tolist(), decimals=2)
Example #30
def impute_missing_data(datapoints, strategy='mean'):
    """ Imputes values for the 8 features with missing data

    Arguments:
    datapoints -- X, a dataset with missing values represented by 999.0 and 9999.0
    strategy [optional] -- an imputation strategy,
        e.g., mean, median, or most_frequent

    Returns:
    X_imputed -- a dataset with missing values imputed according to the
        provided or default (mean) strategy.

    Uses the scikit-learn Imputer class.
    """
    # First we will replace our placeholder values with NaN to only have
    # to run one imputation.
    np.putmask(datapoints, datapoints == 999.0, np.NaN)
    np.putmask(datapoints, datapoints == 9999.0, np.NaN)

    # Now create an imputer over NaN values, and average over axis=0 (columns)
    # Then, fit the imputer to the dataset.
    imp = Imputer(strategy=strategy, axis=0)
    X_imputed = imp.fit_transform(datapoints)

    return X_imputed
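A minimal usage sketch with a made-up array that uses the 999.0 / 9999.0 placeholders described in the docstring:

import numpy as np

X = np.array([[1.0, 999.0],
              [2.0, 4.0],
              [9999.0, 6.0]])
X_mean = impute_missing_data(X)                       # placeholders become column means
X_median = impute_missing_data(X, strategy='median')  # or column medians
print(X_mean)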
Example #31
def read_split_aug(filepath, filename, rmv, finalNames):
    #read the csv
    try:
        dataset = genfromtxt(open(filepath + '/' + filename, 'rb'),
                             delimiter=',',
                             dtype='f8')[0:]

        # Clean the dataset
        # Sort the observations according to the timestamp
        dataset = dataset[dataset[:, 0].argsort()]
        dataset = dataset[12:, :]  #exclude some nan observations
        dataset = dataset[0:360, :]  #exclude some nan observations

        # Remove redundant resources
        dataset = np.delete(dataset, np.s_[rmv], axis=1)
        target = dataset[:, 1865]  #values of the target variable
        tt = target[:, np.newaxis]
        rm_dataset = np.delete(dataset, np.s_[2453:2455], axis=1)  #exclude VOD
        rm_dataset = np.delete(rm_dataset, np.s_[1863:1869],
                               axis=1)  #exclude NDVI
        rm_dataset = np.delete(rm_dataset, np.s_[1849:1853],
                               axis=1)  #exclude VOD
        dataset = np.concatenate((rm_dataset, tt),
                                 axis=1)  #put the target column in the end
        dataset = DataFrame(dataset)

        dataset = dataset.fillna(0)
        dataset.columns = finalNames.ravel()
        names = dataset.columns[3:dataset.shape[1]]

        # Create the new dataset
        X = dataset.iloc[:, 3:dataset.shape[1] - 1]
        y = dataset.iloc[:, dataset.shape[1] - 1]
        #  import the lags of NDVI (target)
        win = 13
        new_datasetAuto = np.empty((len(y), win))
        for i in range(1, win):
            new_datasetAuto[:, i - 1] = shift2(y, i)  #, cval=np.NaN)
        new_datasetAuto[:, win - 1] = y

        # Impute the missing values with the column mean
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        dataImputedAuto = imp.fit_transform(new_datasetAuto)
        X1 = dataImputedAuto[:, 0:dataImputedAuto.shape[1] - 1]
        X = np.concatenate((X, X1), axis=1)
        new_dataset = np.concatenate((X, DataFrame(y)), axis=1)
        new_dataset = DataFrame(new_dataset)

        # Impute the remaining missing values with zero
        new_dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
        new_dataset = new_dataset.fillna(0)

        predictor_names = names[0:len(names) - 1].tolist()
        target_name = names[len(names) - 1]
        for i in range(1, 13):
            predictor_names.append(target_name + str(i))
        predictor_names.append(target_name)
        predictor_names = np.array(predictor_names)
        new_dataset.columns = predictor_names.ravel()

        return new_dataset
    except IOError as e:  #if the file does not exist, return an empty list
        #print e
        return []
print(recall)
precision = precision_score(realclass, predict, average='weighted')
print(precision)

# In[5]:

from pandas import read_csv
from sklearn.preprocessing import Imputer
import numpy
dataset = read_csv('/home/ajit/Downloads/hepatitis_csv.csv', header=None)
# mark zero values as missing or NaN
dataset[[1, 2, 3, 4, 5]] = dataset[[1, 2, 3, 4, 5]].replace(0, numpy.NaN)
# fill missing values with mean column values
values = dataset.values
imputer = Imputer()
transformed_values = imputer.fit_transform(values)
# count the number of NaN values in each column
print(numpy.isnan(transformed_values).sum())

# In[47]:

#logistic_regression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import optimize as op
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# In[275]:

df = pd.read_csv("./data/final_project_dataset.csv")

# In[276]:

ndf = df.drop(["Unnamed: 0", "email_address", "poi"], axis=1)
#exclude_features=["director_fees","loan_advances","restricted_stock_deferred"]
exclude_features = []
ndf = ndf.drop(exclude_features, axis=1)
dfmtx = ndf.values
dfmtx.astype(float)
label = df["poi"].values
# Fill in NaN
imp = Imputer(axis=0, strategy="median")
ndfmtx = imp.fit_transform(dfmtx)

# ## Use random forest to select feature

# In[277]:

from sklearn.ensemble import RandomForestClassifier
train_X = ndfmtx
train_y = label
rf = RandomForestClassifier()
rf.fit(train_X, train_y)
rfi = rf.feature_importances_


def list_feature_imp(name, score):
    sorted_rfi_idx = np.argsort(score)
Example #34
                    'BMI', 'WEIGHT', 'WEIGHT', 'LV_MASS_INDEX']
#check for missing or NaN in the dataset:
pd.isnull(dat_main).sum() > 0

#dataset for analysis outcome = chf 60
dat_chf = dat_main[list_my_features].copy() #the copy() is important to create new dataframe
dat_chf.head()
describe = dat_chf.describe() #function for easy descriptive statistics

# A simple way to fill na (one by one)
median_glu = dat_chf['GLUCOSE'].median()
dat_chf['GLUCOSE'] = dat_chf['GLUCOSE'].fillna(median_glu) 

#imputer sklearn (better option to fill missing values) for all df
imputer = Imputer(strategy = 'median', axis = 1)
dat_chf = pd.DataFrame(imputer.fit_transform(dat_chf), columns = dat_chf.columns) # the imputation

pd.isnull(dat_chf).sum() > 0

#descriptive statistics (table 1)
columns = ["age", "gender", "ECHO1_ef", "pre_MI","pre_DM"]
categorical = ['gender', 'pre_MI', 'pre_DM']
groupby = ["CHF60"]
labels={'ECHO1_ef': 'Ejection fraction',
        'pre_MI': 'Previous MI',
        'CHF60' : 'CHF 60 days'}
mytable = TableOne(dat_chf, columns = columns,
                   categorical = categorical,
                   groupby = groupby,
                   labels = labels,
                   isnull = True, remarks = False, pval = True)
Example #35
y = home_data.SalePrice
train = home_data.drop([
    'SalePrice', 'EnclosedPorch', 'LowQualFinSF', 'MiscVal', 'OpenPorchSF',
    'PoolArea', 'ScreenPorch'
],
                       axis=1)
test = test_data.drop([
    'EnclosedPorch', 'LowQualFinSF', 'MiscVal', 'OpenPorchSF', 'PoolArea',
    'ScreenPorch'
],
                      axis=1)

one_hot_encoded_training_predictors = pd.get_dummies(train)
one_hot_encoded_test_predictors = pd.get_dummies(test)
train, test = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors, join='left', axis=1)

my_imputer = Imputer()
train = my_imputer.fit_transform(train)
test = my_imputer.transform(test)

model = XGBRegressor()
model.fit(train, y)
preds = model.predict(test)

# outputting

pd.DataFrame({
    'Id': test_data.Id,
    'SalePrice': preds
}).to_csv('submission.csv', index=False)
Example #36
continuous_vars = observations.loc[:,
                                   observations.columns.
                                   isin(list(CONTINUOUS_KEYS))]
categorical_vars = observations.loc[:,
                                    observations.columns.
                                    isin(list(CATEGORICAL_KEYS))]
categorical_vars_imputed = convert_to_categorical(observations,
                                                  CATEGORICAL_KEYS,
                                                  cat_only=True,
                                                  key_var='ID')

#combine continuous and categorical variables

#add commented-out Random Forest Classifier Code Below Here

#Fit a gaussian NB on continuous vars
cont_imputer = Imputer(strategy='mean', axis=1, copy=False)
imputed_continuous_vars = cont_imputer.fit_transform(continuous_vars)
gauss_nb = GaussianNB()
continuous_predictions = cross_val_predict(gauss_nb,
                                           imputed_continuous_vars,
                                           target,
                                           cv=10)
output_metrics("Continuous NB", target, continuous_predictions)

#Fit multinomial NB on categorical vars

mult_nb = MultinomialNB()
categorical_predictions = cross_val_predict(mult_nb,
                                            categorical_vars,
                                            target,
                                            cv=10)
output_metrics("Categorical NB", target, categorical_predictions)
Example #37
data = data.drop("FireplaceQu", 1)

all_columns = data.columns.values
non_categorical = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal"
]

categorical = [value for value in all_columns if value not in non_categorical]
#  One Hot Encoding and nan transformation
data = pd.get_dummies(data)

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
data = imp.fit_transform(data)

# Log transformation
data = np.log(data)
labels = np.log(labels)

# Change -inf to 0 again
data[data == -np.inf] = 0
# Split traing and test
data_offset = np.average(data, axis=0)
data -= data_offset

labels_offset = np.average(labels, axis=0)
labels -= labels_offset

train = data[:1460]
Example #38
import statsmodels.formula.api as sm
from sklearn.preprocessing import MinMaxScaler

veriler = pd.read_csv("veriler_nf.csv")
Nhdort = veriler[["NH4N"]]
ALF = veriler[["Averageleachateflow"]]
SS = veriler[["SS"]]
Sicaklik = veriler[["Temperature"]]
Cod = veriler[["COD"]]
MLSSAero = veriler[["MLSSaerobic"]]
Nnf = veriler[["NNF"]]
Codnf = veriler[["CODNF"]]
print(veriler)
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
SS = imputer.fit_transform(SS)
MLSSAero = imputer.fit_transform(MLSSAero)

SS = pd.DataFrame(data=SS, index=range(275), columns=['SS'])
MLSSAero = pd.DataFrame(data=MLSSAero, index=range(275), columns=['MLSSAero'])

scaler = MinMaxScaler()
Cod = scaler.fit_transform(Cod)
Nhdort = scaler.fit_transform(Nhdort)
SS = scaler.fit_transform(SS)
Sicaklik = scaler.fit_transform(Sicaklik)
MLSSAero = scaler.fit_transform(MLSSAero)
ALF = scaler.fit_transform(ALF)
Nnf = scaler.fit_transform(Nnf)
Codnf = scaler.fit_transform(Codnf)
Example #39
from pandas import DataFrame
hold=pd.DataFrame(hold)
y_pred=pd.DataFrame(y_pred)
hold=pd.concat((hold,y_pred),axis=1)
hold.to_csv('out.csv',index=False)
'''

X = train.values
y = y.values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.667)

from sklearn.preprocessing import Imputer
imp = Imputer()
X_train = imp.fit_transform(X_train, y_train)
X_test = imp.transform(X_test)

from sklearn.preprocessing import StandardScaler
s = StandardScaler()
X_train = s.fit_transform(X_train, y)
X_test = s.transform(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

regressor = KNeighborsClassifier(n_neighbors=10)
Example #40
## Here too we look at the MAE again. In the end XGBoost combines very many models and,
## with this pile of information, can build very precise models. XGBoost is therefore
## a lot more accurate. You do it as follows:

# 1) first load the data, deal with the missing data,
#       and split the data into train and test sets
data = pd.read_csv('train.csv')
data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])
train_X, test_X, train_y, test_y = train_test_split(X.as_matrix(),
                                                    y.as_matrix(),
                                                    test_size=0.25)

my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

# 2) Just as with the sklearn package we build the model, in this case
#       the naive model

from xgboost import XGBRegressor
my_model = XGBRegressor()
# with silent=True you make sure that not all of the cycle output is printed
my_model.fit(train_X, train_y, verbose=False)

# 3) As before, we let the model make predictions and evaluate
#    the model based on the MAE
# make predictions
predictions = my_model.predict(test_X)
from sklearn.metrics import mean_absolute_error
Example #41
#print(X_train.columns)
X_train[feats_cat] = X_train[feats_cat].astype(object)
X_train = pd.get_dummies(X_train, dummy_na= True)
#print(X_train.columns)

#print(X_test.columns)
X_test[feats_cat] = X_test[feats_cat].astype(object)
X_test = pd.get_dummies(X_test, dummy_na= True)
#print(X_test.columns)


# fillna
from sklearn.preprocessing import Imputer
fillnan= Imputer()
X_train= fillnan.fit_transform(X_train)
fillnan= Imputer()
X_test=  fillnan.fit_transform(X_test)



##############################     PARS SEARCH     ##############################
def gridSearch(X, y):
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer
    from sklearn.preprocessing import Imputer
    from sklearn.model_selection import ShuffleSplit
    from numpy.random import randint
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
Example #42
dataset = pd.read_csv('dataset/sal.csv',
                      names=[
                          'age', 'workclass', 'fnlwgt', 'education',
                          'education-num', 'marital-status', 'occupation',
                          'relationship', 'race', 'gender', 'capital-gain',
                          'capital-loss', 'hours-per-week', 'native-country',
                          'salary'
                      ],
                      na_values=' ?')

X = dataset.iloc[:, 0:14].values
y = dataset.iloc[:, -1].values

from sklearn.preprocessing import Imputer
imp = Imputer()
X[:, [0, 2, 4, 10, 11, 12]] = imp.fit_transform(X[:, [0, 2, 4, 10, 11, 12]])

test = pd.DataFrame(X[:, [1, 3, 5, 6, 7, 8, 9, 13]])

test[0].value_counts()
test[1].value_counts()
test[2].value_counts()
test[3].value_counts()
test[4].value_counts()
test[5].value_counts()
test[6].value_counts()
test[7].value_counts()

test[0] = test[0].fillna(' Private')
test[0].value_counts()
Example #43
def cv_get_mae_imputednans(X,y):
    model = RandomForestRegressor()
    my_imputer = Imputer()
    X_imputed = my_imputer.fit_transform(X)
    mae_avg = -1*cross_val_score(model,X_imputed,y,scoring='neg_mean_absolute_error').mean()
    return(mae_avg)
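A minimal usage sketch with synthetic data containing missing values (purely illustrative; in the original project X and y come from the surrounding pipeline):

import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(100, 5)
X_demo[rng.rand(100, 5) < 0.1] = np.nan  # knock out roughly 10% of the entries
y_demo = rng.rand(100)
print(cv_get_mae_imputednans(X_demo, y_demo))  # cross-validated MAE after mean imputation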
Example #44
# In[ ]:

print(data1.isnull().sum(), data2.isnull().sum())

# We can now use Imputer for Imputing Missing Data

# In[ ]:

from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder
le = LabelEncoder()
x_train['Embarked'] = x_train['Embarked'].fillna('$')
x_train['Embarked'] = le.fit_transform(x_train['Embarked'])
x_train['Cabin'] = le.fit_transform(x_train['Cabin'])
imr = Imputer(missing_values=8, strategy='median', axis=0, copy=False)
x_train[['Cabin']] = imr.fit_transform(x_train[['Cabin']])
imr.set_params(missing_values=np.nan, strategy='mean')
x_train[['Age']] = imr.fit_transform(x_train[['Age']])
imr.set_params(missing_values=3, strategy='most_frequent')
x_train[['Embarked']] = imr.fit_transform(x_train[['Embarked']])
ohe = OneHotEncoder(categorical_features=[1])
x_train['Sex'] = le.fit_transform(x_train['Sex'])

print(x_train.head())

# In[ ]:

fig, ax1 = plt.subplots(figsize=(10, 10))
sns.heatmap(data=x_train.corr(), annot=True, fmt='.1f', linewidths=.1)

# In[ ]:
Example #45
def clean_data_ML(df1):
    '''
    Cleans the dataframe and performs feature extraction and engineering

    
    Input:
        df1 (DataFrame)
       
    Output:
        cleaned_df (DataFrame): cleaned df DataFrame
    '''

    #drops columns with more than 20% of missing values
    print(
        "Dropping columns with more than 20% missing values and dropping unnecessary columns"
    )
    print(
        "dropping columns EINGEFUEGT_AM and D19_LETZTER_KAUF_BRANCHE because they contain too many different items"
    )

    df1.drop([
        'ALTER_KIND1', 'ALTER_KIND2', 'ALTER_KIND3', 'ALTER_KIND4',
        'EXTSEL992', 'KK_KUNDENTYP', 'RT_KEIN_ANREIZ', 'CJT_TYP_6',
        'D19_VERSI_ONLINE_QUOTE_12', 'CJT_TYP_2', 'EINGEZOGENAM_HH_JAHR',
        'D19_LOTTO', 'CJT_KATALOGNUTZER', 'VK_ZG11', 'UMFELD_ALT',
        'RT_SCHNAEPPCHEN', 'AGER_TYP', 'ALTER_HH',
        'D19_BANKEN_ONLINE_QUOTE_12', 'D19_GESAMT_ONLINE_QUOTE_12',
        'D19_KONSUMTYP', 'D19_VERSAND_ONLINE_QUOTE_12', 'GEBURTSJAHR',
        'KBA05_BAUMAX', 'TITEL_KZ', 'D19_BANKEN_DATUM',
        'D19_BANKEN_OFFLINE_DATUM', 'D19_BANKEN_ONLINE_DATUM',
        'D19_GESAMT_DATUM', 'D19_GESAMT_OFFLINE_DATUM',
        'D19_GESAMT_ONLINE_DATUM', 'D19_TELKO_DATUM',
        'D19_TELKO_OFFLINE_DATUM', 'D19_TELKO_ONLINE_DATUM',
        'D19_VERSAND_DATUM', 'D19_VERSAND_OFFLINE_DATUM',
        'D19_VERSAND_ONLINE_DATUM', 'D19_VERSI_DATUM',
        'D19_VERSI_OFFLINE_DATUM', 'D19_VERSI_ONLINE_DATUM', 'CAMEO_DEU_2015',
        'LP_FAMILIE_FEIN', 'LP_STATUS_FEIN', 'ANREDE_KZ', 'GREEN_AVANTGARDE',
        'SOHO_KZ', 'VERS_TYP', 'LP_LEBENSPHASE_GROB', 'LP_LEBENSPHASE_FEIN',
        'EINGEFUEGT_AM', 'D19_LETZTER_KAUF_BRANCHE', 'CAMEO_INTL_2015',
        'PRAEGENDE_JUGENDJAHRE', 'PLZ8_BAUMAX'
    ],
             axis=1,
             inplace=True)

    print("creating a copy of dataframe")
    df = df1.copy()

    try:
        df.drop(['PRODUCT_GROUP', 'CUSTOMER_GROUP', 'ONLINE_PURCHASE'],
                axis=1,
                inplace=True)
    except:
        pass

    #replace O with 0 and W with 1
    print("Re-encode OST_WEST_KZ attribute")
    df['OST_WEST_KZ'].replace(['O', 'W'], [0, 1], inplace=True)

    #feature engineering Neighbourhood Column with three parts Rural(0), Not Rural(1) and Rural But Good Neighbourhood(2)
    print("Feature engineering WOHNLAGE")
    df['TYPE_QUALITY_NEIGHBOURHOOD'] = df['WOHNLAGE']
    df['TYPE_QUALITY_NEIGHBOURHOOD'].replace(
        [-1, 0, 1, 2, 3, 4, 5, 7, 8], [np.nan, np.nan, 0, 0, 2, 2, 0, 1, 1],
        inplace=True)

    print("Dropping WOHNLAGE column")
    #delete 'WOHNLAGE'
    df.drop(['WOHNLAGE'], axis=1, inplace=True)

    #change object type of CAMEO_DEUG_2015 to numeric type
    print("Feature extracting CAMEO_DEUG_2015")
    df['CAMEO_DEUG_2015'] = df['CAMEO_DEUG_2015'].apply(
        lambda x: check_value(x))

    #remove columns whose names start with KBA05
    print("remove columns whose names start with KBA05")
    kba_cols = df.columns[df.columns.str.startswith('KBA05')]
    df.drop(list(kba_cols), axis='columns', inplace=True)

    #name of column of df that contains XX string
    for i in df.columns:
        df[i].astype('str').apply(lambda x: print(df[i].name)
                                  if x.startswith('XX') else 'pass')

    #imputing nan values
    print("Imputing Nan values")
    imp = Imputer(missing_values=np.nan, strategy='most_frequent')
    df[df.columns] = imp.fit_transform(df)

    print("Counting Nan values", np.isnan(df).sum().sum())

    return df
Example #46
def preprocessing(configparms, training_preprocessing_flag):

    if training_preprocessing_flag:
        df = pd.read_csv(configparms['training_file'])
    else:
        #empty list since test data does not include target variables
        configparms['target_name_list'] = list()
        df = pd.read_csv(configparms['test_file'])
        df['stabilityVec'] = 0

    reportCMTVEM = list()
    reportCMTVEM.append(
        "---------------------------------------------------------------")
    reportCMTVEM.append("input parameters from configuration file")
    reportCMTVEM.append(configparms)

    #removing chemically/physically meaningless features
    df.drop(list(configparms['remove_features'].values()),
            axis=1,
            inplace=True)

    #revise features that need corrections
    for item in list(configparms['revised_features'].values()):
        feature_name_temp = item.split(',')[0].split()[0]
        element_name_temp = item.split(',')[1].split()[0]
        correct_value_temp = item.split(',')[2].split()[0]
        df.loc[df.formulaA == element_name_temp,
               feature_name_temp] = float(correct_value_temp)
        df.loc[df.formulaB == element_name_temp,
               feature_name_temp] = float(correct_value_temp)

    # revise features that are non-numerical, creating categorical (dummy) features in their stead on the data frame
    df_added_categoricals = pd.get_dummies(
        df, columns=list(configparms['categorical_features'].values()))
    configparms['categorical_features_fullnames'] = list(
        set(df_added_categoricals.columns) - set(df.columns))
    #copy back
    df = copy.deepcopy(df_added_categoricals)

    #Handling missing values
    #since the zero values in certain features are physically meaningless (e.g. bulk modulus) they need to be changed to NaN
    if configparms['imputation_method'] != 'none':
        for item in list(configparms['impute_features'].values()):
            df[item] = df[item].replace(0, np.nan)

    df_features_data = df.drop(['formulaA', 'formulaB', 'stabilityVec'],
                               axis=1)

    if configparms['imputation_method'] == 'mn':
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        for item in list(configparms['impute_features'].values()):
            df_features_data[item] = imputer.fit_transform(
                df_features_data[item].values.reshape(-1, 1)).ravel()

    #Reflect the changes on the original data frame
    df[df_features_data.columns] = df_features_data[df_features_data.columns]

    if training_preprocessing_flag:
        #if training set is under preprocessing
        #This section is for preparing the target variable for training set only
        for item in list(configparms['element_removal'].values()):
            df = df[df.formulaA.str.contains(item) == False]
            df = df.reset_index(drop=True)

        df['target_vector'] = df['target_vector'].map(
            lambda x: x.lstrip('[').rstrip(']'))
        configparms['target_name_list'] = [
            'target1', 'target2', 'target3', 'target4', 'target5'
        ]
        df[configparms['target_name_list']] = df['target_vector'].str.split(
            ',', expand=True)
        df[configparms['target_name_list']] = df[
            configparms['target_name_list']].astype(np.float)

        #Writes the preprocessed training data into a csv file
        df.to_csv('training_preprocessed.csv')
        output_writer(reportCMTVEM, configparms)

    return (df, configparms)
Example #47
 def impute_mem(self, memory):
     imputer = Imputer()
     imputed_memory = imputer.fit_transform(memory)
     return imputed_memory
Example #48
def clean_data(df_set, strategy):
    imputer = Imputer(strategy=strategy)
    np_arr = imputer.fit_transform(df_set)
    return pd.DataFrame(np_arr, columns=df_set.columns)
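A minimal usage sketch for clean_data with a made-up dataframe:

import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0],
                    'b': [np.nan, 5.0, 6.0]})
print(clean_data(toy, strategy='median'))  # NaNs replaced by the column medians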
Example #49
count_null_embarked = len(train_df['Embarked'][train_df.Embarked.isnull()])
value_to_fill_embarked = train_df['Embarked'].dropna().mode().values
train_df['Embarked'][train_df['Embarked'].isnull()] = value_to_fill_embarked
lb2 = LabelEncoder()
train_df['Embarked'] = lb2.fit_transform(train_df['Embarked'])

# Set the target column to Survived
targets = train_df.Survived

#Dropping unwanted columns. Also, removing the target column.
train_df = train_df.drop(
    ['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId', 'Survived'], axis=1)

#Imputer is used to fill all the occurrences of NaN with the mean of that column.
im = Imputer()
predictors = im.fit_transform(train_df)

#Using Decision Tree Classifier
classifier = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)
classifier = classifier.fit(predictors, targets)

#Cleaning test data
#Test data is cleaned in the same way as the training data
lb3 = LabelEncoder()
test_df['Sex'] = lb3.fit_transform(test_df['Sex'])  #male:1, female:0

count_null_embarked = len(test_df.Embarked[test_df.Embarked.isnull()])
value_to_fill_embarked = test_df.Embarked.dropna().mode().values
test_df['Embarked'][test_df.Embarked.isnull()] = value_to_fill_embarked
lb4 = LabelEncoder()
test_df['Embarked'] = lb4.fit_transform(test_df['Embarked'])
Example #50
    votes = np.array([[np.argmax(t) for t in c.predict(test_data)]
                      for c in classes])
    winners = np.reshape(mode(votes)[0], -1)
    return winners


data = read_csv(open('train_pca.csv', 'r'), na_values='').as_matrix()

X = data[:, 1:-1]  # input features
Y = data[:, -1].astype('int')  # target labels
Y1 = to_categorical(Y)

classes = []

imp = Imputer()  #default arguments will suffice
X = imp.fit_transform(X)

# Dropout(rate, noise_shape=None, seed=None)

i = 0
for train_i, test_i in StratifiedShuffleSplit(n_splits=3,
                                              random_state=None).split(X, Y):
    np.random.shuffle(train_i)
    X_train = X[train_i]
    Y_train = Y1[train_i]
    brain = Sequential()
    brain.add(
        Dense(871,
              input_dim=871,
              activation='relu',
              kernel_regularizer=l2(1e-2)))
Example #51
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

directory = '/Users/garethjones/Documents/Data Analysis/Kaggle Intro/Data/'
file = 'train.csv'
data = pd.read_csv(directory + file)
''' CLEAN DATA '''
# A nice way to write a for loop and if statement
cols_with_missing = [col for col in data.columns if data[col].isnull().any()]

# Use the Imputer class to fill in NaN values with the mean for that column
my_imputer = Imputer()
data_imputed = my_imputer.fit_transform(data)
''' SETUP TEST AND TRAIN VARIABLES '''
# These are the variables we will use to predict something else
predictors = [
    'LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr',
    'TotRmsAbvGrd'
]
X = data[predictors]

# This is what we want to predict
y = data.SalePrice

# Split our dataset into training and testing data
train_X, val_X, train_y, val_y = train_test_split(X,
                                                  y,
                                                  train_size=0.7,
Example #52
0
# drop rows with any missing values
titanic.dropna().shape

# drop rows where Age is missing
titanic[titanic.Age.notnull()].shape

# Sometimes a better strategy is to **impute missing values**:

# fill missing values for Age with the mean age
titanic.Age.fillna(titanic.Age.mean(), inplace=True)

# equivalent transformer in scikit-learn, supports mean/median/most_frequent
from sklearn.preprocessing import Imputer

imp = Imputer(strategy='mean', axis=1)
titanic['Age'] = imp.fit_transform(titanic.Age.values.reshape(1, -1)).ravel()
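An equivalent and arguably more conventional call imputes the column along axis 0 as a single-column 2-D array; a minimal sketch of that variant (not part of the original notebook):

# column-wise variant of the same imputation
imp_col = Imputer(strategy='mean')
titanic['Age'] = imp_col.fit_transform(titanic[['Age']]).ravel()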

# include Age as a feature
feature_cols = ['Pclass', 'Parch', 'Age']
X = titanic[feature_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)
print metrics.accuracy_score(y_test, y_pred_class)

# ## Part 2: Confusion Matrix

# confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

# calculate the sensitivity
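The example is truncated at this point; a hedged sketch of how the sensitivity could be computed from the confusion matrix above (standard definition TP / (TP + FN); this code is not from the original notebook):

# hypothetical continuation: sensitivity = TP / (TP + FN)
TN, FP, FN, TP = metrics.confusion_matrix(y_test, y_pred_class).ravel()
print TP / float(TP + FN)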
Example #53
0
    'adhe'
]]

Selected_RiskFactor = RiskFactor[[
    'MotherBC', 'Preg1Age_cate', 'Signature_1', 'Signature_2', 'Signature_3',
    'Nonsense_mutation'
]]

#X=Selected_RiskFactor
#X=Selected_Genetics
X = pd.concat([Selected_RiskFactor.reset_index(drop=True), Selected_Genetics],
              axis=1)
X = Selected_Genetics
X = Selected_RiskFactor
imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
X["Benign_Age"] = imputer.fit_transform(X[["Benign_Age"]]).ravel()
imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
X["AgeofMerche"] = imputer.fit_transform(X[["AgeofMerche"]]).ravel()

mapper3 = DataFrameMapper([
    ('Class', sklearn.preprocessing.LabelEncoder()),
    ('MotherBC', sklearn.preprocessing.LabelEncoder()),
    ('Preg1Age_cate', sklearn.preprocessing.LabelEncoder()),
    ('scar', sklearn.preprocessing.LabelEncoder()),
    ('adhe', sklearn.preprocessing.LabelEncoder()),
],
                          default=None)

mapper3 = DataFrameMapper(
    [('MotherBC', sklearn.preprocessing.LabelEncoder()),
     ('Preg1Age_cate', sklearn.preprocessing.LabelEncoder())],
Example #54
0
    # print(model)
    # Loading a saved model
    model = gensim.models.Word2Vec.load('OpinionMiningModel')
    # Shuffling the dataset
    dataset = dataset.sample(frac=1).reset_index(drop=True)
    X = dataset.tweets.values
    y = dataset.sentiment.values

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)
    trainDataVecs = getAvgFeatureVecs(X_train, model, num_features)
    testDataVecs = getAvgFeatureVecs(X_test, model, num_features)

    # Using an imputer because simply removing the null values changes the dimensions of the vectors.
    imp = Imputer(missing_values=np.nan, strategy='mean')
    trainDataVecs = imp.fit_transform(trainDataVecs)
    testDataVecs = imp.transform(testDataVecs)  # reuse the statistics fitted on the training vectors
    trainDataVecs = trainDataVecs.reshape(len(X_train), -1)
    testDataVecs = testDataVecs.reshape(len(X_test), -1)
    # print(trainDataVecs)
    print(trainDataVecs.shape)
    print(testDataVecs.shape)
    svd = TruncatedSVD()
    trainDataVecs = svd.fit_transform(trainDataVecs)
    testDataVecs = svd.transform(testDataVecs)  # project with the SVD fitted on the training vectors
    print(trainDataVecs.shape)
    print(testDataVecs.shape)
    models = ['Logistic Regression', 'Random Forest', 'SVM']
    dictionary = modelselectionword2vec(trainDataVecs, testDataVecs, y_train,
                                        y_test, models)
Example #55
0
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing dataset
dataset = pd.read_csv('preprocessing.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# taking care of missing values
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# Encoding values for categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelEncoderX = LabelEncoder()
labelEncodery = LabelEncoder()
X[:, 0] = labelEncoderX.fit_transform(X[:, 0])
y = labelEncodery.fit_transform(y)
oneHotEncoder = OneHotEncoder(categorical_features=[0])
X = oneHotEncoder.fit_transform(X).toarray()

# Splitting dataset for training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Example #56
0
def remove_missing_values(dataframe):
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
    dataframe.iloc[:, 3:] = imr.fit_transform(dataframe.iloc[:, 3:])
    return dataframe
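A short usage sketch, assuming the frame's first three columns are identifiers and the rest are numeric (the filename is illustrative):

# hypothetical call, not part of the original snippet
df = pd.read_csv('sensor_readings.csv')
df = remove_missing_values(df)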
Example #57
0

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    dataset_train = LoadFile(p=r'F:\ODL\dataset\data_train.pickle')
    dataset_test = LoadFile(p=r'F:\ODL\dataset\data_test.pickle')
    dataset_train = (dataset_train - np.min(dataset_train, axis=0)) / (
        np.max(dataset_train, axis=0) - np.min(dataset_train, axis=0))
    dataset_test = (dataset_test - np.min(dataset_test, axis=0)) / (
        np.max(dataset_test, axis=0) - np.min(dataset_test, axis=0))
    imp = Imputer(missing_values='NaN',
                  strategy='mean',
                  axis=0,
                  verbose=0,
                  copy=True)
    dataset_train = imp.fit_transform(dataset_train)
    dataset_test = imp.fit_transform(dataset_test)
    rng.shuffle(dataset_train)
    rng.shuffle(dataset_test)
    print(dataset_train.shape, dataset_test.shape)
    # inspect the data
    # print(dataset_train.shape, dataset_test.shape)
    # num_train = Counter(dataset_train[:, -1])
    # num_test = Counter(dataset_test[:, -1])
    # print(num_train)
    # print(num_test)
    # preprocessing: denoise the data
    dataset = np.vstack((dataset_train, dataset_test))
    dataset = dataset[:, :225]
    rng.shuffle(dataset)
    denoising(dataset, training_time=1, is_finishing=True)
Example #58
0
def clean_df4(df, del_rows=True):
    '''
    INPUT: (pandas dataframe) df

    OUTPUT: (pandas dataframe) cleaned df

    This function returns the cleaned df:
    1. Converts unknown values to NaN
    2. Drops columns with more than 50% missing values
    3. Removes rows with more than 10% missing values
    4. Cleans and converts object columns to numeric, in some cases by one-hot encoding
    5. Drops the id column
    6. Fills NaNs with the column mode
    7. Drops highly correlated columns
    '''

    for column in list(df.columns.values):
        df[column].replace(-1, np.NaN, inplace=True)
    null0 = [
        'ALTERSKATEGORIE_GROB', 'ALTER_HH', 'ANREDE_KZ', 'CJT_GESAMTTYP',
        'GEBAEUDETYP', 'HH_EINKOMMEN_SCORE', 'KBA05_BAUMAX', 'KBA05_GBZ',
        'KKK', 'NATIONALITAET_KZ', 'PRAEGENDE_JUGENDJAHRE', 'REGIOTYP',
        'RETOURTYP_BK_S', 'TITEL_KZ', 'WOHNDAUER_2008', 'W_KEIT_KIND_HH'
    ]
    for column in null0:
        try:
            df[column].replace(0, np.NaN, inplace=True)
        except:
            continue
    null9 = [
        'KBA05_ALTER1', 'KBA05_ALTER2', 'KBA05_ALTER3', 'KBA05_ALTER4',
        'KBA05_ANHANG', 'KBA05_AUTOQUOT', 'KBA05_CCM1', 'KBA05_CCM2',
        'KBA05_CCM3', 'KBA05_CCM4', 'KBA05_DIESEL', 'KBA05_FRAU',
        'KBA05_HERST1', 'KBA05_HERST2', 'KBA05_HERST3', 'KBA05_HERST4',
        'KBA05_HERST5', 'KBA05_KRSAQUOT', 'KBA05_KRSHERST1', 'KBA05_KRSHERST2',
        'KBA05_KRSHERST3', 'KBA05_KRSKLEIN', 'KBA05_KRSOBER', 'KBA05_KRSVAN',
        'KBA05_KRSZUL', 'KBA05_KW1', 'KBA05_KW2', 'KBA05_KW3', 'KBA05_MAXAH',
        'KBA05_MAXBJ', 'KBA05_MAXHERST', 'KBA05_MAXSEG', 'KBA05_MAXVORB',
        'KBA05_MOD1', 'KBA05_MOD2', 'KBA05_MOD3', 'KBA05_MOD4', 'KBA05_MOD8',
        'KBA05_MOTOR', 'KBA05_MOTRAD', 'KBA05_SEG1', 'KBA05_SEG2',
        'KBA05_SEG3', 'KBA05_SEG4', 'KBA05_SEG5', 'KBA05_SEG6', 'KBA05_SEG7',
        'KBA05_SEG8', 'KBA05_SEG9', 'KBA05_SEG10', 'KBA05_VORB0',
        'KBA05_VORB1', 'KBA05_VORB2', 'KBA05_ZUL1', 'KBA05_ZUL2', 'KBA05_ZUL3',
        'KBA05_ZUL4', 'RELAT_AB', 'SEMIO_SOZ', 'SEMIO_FAM', 'SEMIO_REL',
        'SEMIO_MAT', 'SEMIO_VERT', 'SEMIO_LUST', 'SEMIO_ERL', 'SEMIO_KULT',
        'SEMIO_RAT', 'SEMIO_KRIT', 'SEMIO_DOM', 'SEMIO_KAEM', 'SEMIO_PFLICHT',
        'SEMIO_TRADV', 'ZABEOTYP', 'KBA05_HERSTTEMP'
    ]
    for column in null9:
        try:
            df[column].replace(9, np.NaN, inplace=True)
        except:
            continue

    dropcol = [
        'ALTER_KIND4', 'ALTER_KIND3', 'ALTER_KIND2', 'ALTER_KIND1', 'TITEL_KZ',
        'AGER_TYP', 'EXTSEL992', 'KK_KUNDENTYP', 'KBA05_BAUMAX'
    ]
    for col in dropcol:
        try:
            df.drop(col, axis=1, inplace=True)
        except:
            continue

    if del_rows:
        row_nulls = (df.isnull().sum(axis=1) / df.shape[1])
        df.drop(list(row_nulls[row_nulls > 0.1].index.values), inplace=True)

    df['CAMEO_DEUG_2015'].replace('X', np.NaN, inplace=True)
    df['CAMEO_INTL_2015'].replace('XX', np.NaN, inplace=True)
    df['CAMEO_DEUG_2015'] = df['CAMEO_DEUG_2015'].astype(float)
    df['CAMEO_INTL_2015'] = df['CAMEO_INTL_2015'].astype(float)

    df['OST_WEST_KZ'] = df.OST_WEST_KZ.map({'W': 0, 'O': 1})

    columna = 'EINGEFUEGT_AM'
    if columna in (list(df.columns.values)):
        df['year'] = pd.DatetimeIndex(df['EINGEFUEGT_AM']).year
        df.drop('EINGEFUEGT_AM', axis=1, inplace=True)

    df['CAMEO_DEU_2015'].replace('XX', np.NaN, inplace=True)
    df = pd.get_dummies(df,
                        columns=['D19_LETZTER_KAUF_BRANCHE', 'CAMEO_DEU_2015'])

    df = df.astype(float)

    df.drop('LNR', axis=1, inplace=True)

    imputer = Imputer(strategy='most_frequent')
    df_col = list(df.columns.values)
    df_imp = imputer.fit_transform(df)
    df = pd.DataFrame(df_imp, columns=df_col)

    drop = [
        'KBA13_HERST_SONST', 'PLZ8_GBZ', 'PLZ8_HHZ', 'CAMEO_INTL_2015',
        'ANZ_STATISTISCHE_HAUSHALTE', 'LP_LEBENSPHASE_GROB', 'LP_STATUS_GROB',
        'KBA13_KMH_250'
    ]
    df.drop(drop, axis=1, inplace=True)

    return df
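A hedged usage sketch, assuming the raw demographics data has already been loaded into a DataFrame (the variable and file names are illustrative, not from the original project):

# hypothetical call, not part of the original script
raw_df = pd.read_csv('demographics.csv')
clean_df = clean_df4(raw_df, del_rows=True)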
Example #59
0
    
    svm_dict = {}
    
    train_directory = '/home/akash/learn/504/final_project/EECS_504_Project/data_train_json'
    test_directory = '/home/akash/learn/504/final_project/EECS_504_Project/data_bad_json'


    # Pepper will take care of getting the data in a nice insidious format
    pepper = DataPrepper()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    
    for k,v in train_dir_dict.items():
        train_data = pepper.multi_step(v)
        clf = svm.OneClassSVM(nu=0.1, kernel='poly')
        train_data_imp = imp.fit_transform(train_data)
        clf.fit(train_data_imp)
        svm_dict[k] = clf
        y_train_pred = clf.predict(train_data_imp)
        # print ("Train array len", len(y_train_pred))    
        # print ("Train diff", (sum(y_train_pred)))


    

    
# test_directory = '/home/akash/learn/504/final_project/EECS_504_Project/data_bad_json'

# y_test = pepper.multi_step(test_directory)
# y_test_file = '/home/akash/learn/504/final_project/EECS_504_Project/data_train_json/4519661B00000578-4955690-image-a-23_1507304927988_000000000000_keypoints.json'
# y_test_file = '/home/akash/learn/504/final_project/EECS_504_Project/data_train_json/8_000000000000_keypoints.json'
def replace_testnan(dataframe):
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
    dataframe = imr.fit_transform(dataframe)
    return dataframe
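Note that imr.fit_transform returns a plain numpy array, so the helper above drops the DataFrame's column names and index; a hypothetical variant that preserves them (not part of the original code):

def replace_testnan_df(dataframe):
    # same mean imputation, but wrap the result back into a DataFrame
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
    filled = imr.fit_transform(dataframe)
    return pd.DataFrame(filled, columns=dataframe.columns, index=dataframe.index)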