Example #1
def bnp_svm(train, test):
	print('bnpsvm')
	## If a value is missing, set it to the average
	imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

	#print("cleaning data")
	train = train.sample(1000)
	## set up training data
	train1 = train.select_dtypes(include=['float64'])
	imp.fit(train1)
	train1 = imp.transform(train1)
	train1 = np.array(train1).astype(float)
	## set up real y
	target = np.array(train['target']).astype(int)


	## set up testing data
	test1 = test.select_dtypes(include=['float64'])
	test1 = imp.transform(test1)
	test1 = np.array(test1).astype(float)



	#print("training...")
	clf = svm.SVC(gamma=0.001, C=100, probability=True)
	#print("testing")
	clf.fit(train1, target)
	#print("predicting")
	yhat = clf.predict_proba(test1)
	return yhat


#print(bnp_svm(train, test))
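
The examples on this page use sklearn.preprocessing.Imputer, which was deprecated in scikit-learn 0.20 and removed in 0.22. A minimal sketch of the same mean-imputation step with its replacement, sklearn.impute.SimpleImputer, on a hypothetical toy array:

# Minimal sketch (not part of the original example): mean imputation with SimpleImputer,
# the replacement for Imputer in scikit-learn >= 0.22.
import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan]])  # hypothetical toy data
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X)
print(imp.transform(X))  # NaNs become the column means: 4.0 and 2.5
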
def test():
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        basename = re.sub(r'(\..*?)$','',os.path.basename(filename))
        print basename
        if basename != DS:
            continue
        # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb'))
        data = arff.loadarff(filename)[0]
        X = vec.fit_transform(np.array([{str(i):value for i,value in enumerate(list(row)[:-1])} for row in data])).toarray()
        imp.fit(X)
        X = imp.transform(X)
        labels = np.array([row[-1] for row in data])
        y = np.array([{v:k for k,v in enumerate(list(set(labels)))}[label] for label in labels])
        random = np.random.permutation(range(len(X)))
        print 'dataset ratio\t%s'%('\t'.join([alg+" "*(12-len(alg)) for alg in sorted(ALG.keys())]))
        for iteration in xrange(10):
            X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
            for train, test in kf:
                length, train_size = len(train), 0.1
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0-train_size, random_state=0)
                for R in xrange(2,10):
                    ones_matrix, cost_matrix = np.array([[1,1],[1,1]]), np.array([[1,1],[R,R]])            
                    # print "%s R=%d"%(basename,R),
                    cross_validation("%s R=%d"%(basename,R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
                exit()
Example #3
	def clf_fit_transform(self):
		#import dataset
		self.df= pd.read_csv(self.dataset,na_values=["?"])
		
		#clean dataset
		#use median,most_frequent,mean
		imr = Imputer(missing_values='NaN', strategy='mean', axis=0,copy=False)

		imr.fit(self.df)
		X_imputed_df = pd.DataFrame(imr.transform(self.df.values), columns = self.df.columns)


		X_imputed_df.drop(['id'],1,inplace=True)


		X= np.array(X_imputed_df.drop(['class'],1))
		y=np.array(X_imputed_df['class'])

		le= LabelEncoder()
		y=le.fit_transform(y)

	
		#cross validation
		self.X_train, self.X_test ,self.y_train,self.y_test = cross_validation.train_test_split(X,y,test_size=0.2,random_state=0)

		# define the object
		self.stdsc = StandardScaler()

		self.X_train_std= self.stdsc.fit_transform(self.X_train)

		# once it learns it can apply on other inputs
		self.X_test_std= self.stdsc.transform(self.X_test)
def load_datasets(feature_paths, label_paths):
    feature = np.ndarray(shape=(0,41))
    label = np.ndarray(shape=(0,1))
    for file in feature_paths:
        # use the pandas read_table function to read the contents of one feature file
        # the delimiter is a comma, missing values are '?', and the file contains no header row
        df = pd.read_table(file, delimiter=',', na_values='?', header=None)
        # use Imputer with strategy='mean' so missing data are filled with the column mean;
        # fit() trains the preprocessor and transform() produces the preprocessed result
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(df)
        df = imp.transform(df)
        # append the preprocessed data to feature, iterating over all feature files in turn
        feature = np.concatenate((feature, df))
     
    for file in label_paths:
        # same as above
        df = pd.read_table(file, header=None)
        # the label files have no missing values, so the newly read data is appended to label directly
        label = np.concatenate((label, df))
         
    label = np.ravel(label)
    # return the feature set and the label set
    return feature, label
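
A hypothetical call of load_datasets (the file names below are placeholders; each feature file is assumed to be a comma-separated, header-less table with '?' marking missing values, and each label file a single header-less column):

# Hypothetical usage sketch; the paths are placeholders, not files from the original project.
feature_files = ['A.feature', 'B.feature']
label_files = ['A.label', 'B.label']
X, y = load_datasets(feature_files, label_files)
print(X.shape, y.shape)  # X has 41 feature columns; y is a flat 1-D label vector
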
def load_datasets(feature_paths, label_paths):
    '''
    Read the feature files and the label files and return them.
    '''
    #define the feature array with 41 columns (matching the feature dimension) and an empty label array with 1 column
    feature = np.ndarray(shape=(0,41))
    label = np.ndarray(shape=(0,1))
    for file in feature_paths:
        #use pandas read_table to read one feature file: comma delimiter, '?' as the missing value marker, no header row
        #df = pd.read_table(file, delimiter=',', na_values='?', header=None)

        #pandas.read_csv(source, encoding='utf-8', parse_dates=[0] parses column 0 as dates, index_col=0 uses column 0 as the row index)
        data=pd.read_csv(file,encoding='utf-8',parse_dates=[0],index_col=0)
        #DataFrame.sort_index(axis=0 sorts by the index, ascending=True for ascending order, inplace=False controls whether the original data is overwritten)
        #sort data in ascending time order
        #data.sort_index(0,ascending=True,inplace=True)

        #use Imputer with strategy='mean' to fill missing data with the column mean
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        #fit() trains the preprocessor, transform() produces the preprocessed result
        imp.fit(data)
        data = imp.transform(data)
        #append the preprocessed data to feature, iterating over all feature files in turn
        feature = np.concatenate((feature, data))

    #read the label files
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))
    #flatten the labels into a one-dimensional vector
    label = np.ravel(label)
    return feature, label
Example #6
    def fit(self, train_x, train_y=None, is_norm=True):
        # Normalization
        if is_norm:
            train_x_min = train_x.min(0)
            train_x_ptp = train_x.ptp(axis=0)

            train_x = (train_x.astype(float) - train_x_min) / train_x_ptp

            if np.any(train_y):
                train_y = (train_y.astype(float) - train_x_min) / train_x_ptp

        imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
        imp.fit(train_x)
        if np.isnan(train_x).any():
            log("Found {} NaN values in train_x, so try to transform them to 'mean'".format(np.isnan(train_x).sum()), WARN)
            train_x = imp.transform(train_x)

        if np.any(train_y) and np.isnan(train_y).any():
            log("Found {} NaN values in train_y, so try to transform them to 'mean'".format(np.isnan(train_y).sum()), WARN)
            train_y = imp.transform(train_y)

        if np.any(train_y):
            self.model.fit(train_x, train_y)
        else:
            self.model.fit(train_x)
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Imputes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.
        """
        # Impute all columns if columns is None
        if self.columns is None:
            self.columns = data.columns

        # Fit a single most_frequent imputer over the selected columns
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])

        return self

    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        output = data.copy()
        output[self.columns] = self.imputer.transform(output[self.columns])

        return output
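
A short hypothetical usage sketch for ImputeCategorical (it assumes the legacy sklearn.preprocessing.Imputer import used by the class, and integer-encoded categories in which 0 marks a missing value):

# Hypothetical usage: 0 is treated as the missing marker and replaced by each
# column's most frequent non-zero value.
import pandas as pd

df = pd.DataFrame({'workclass': [1, 0, 2, 1], 'education': [3, 3, 0, 2]})
imputer = ImputeCategorical(columns=['workclass', 'education'])
print(imputer.fit(df).transform(df))
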
def data_preprocessing_descriptive(Extracted_Features,Coma_Features,Corrected_Features):
    lvltrace.lvltrace("LVLEntree dans data_preprocessing_descriptive dans preproc_descriptive")
    tools.separate_coma(Extracted_Features,Coma_Features)
    for root, dirs, files in os.walk(Coma_Features):
        for i in files:
            if not i.startswith('.'):
                input_i=Coma_Features+i
                output_i=Corrected_Features+i
                lines=tools.file_lines(input_i)
                ncol=tools.file_col(input_i)
                if lines >= 2:
                    file = open(output_i, "w")
                    writer=csv.writer(file, lineterminator='\t')
                    
                    data = np.genfromtxt(input_i,delimiter=',')
                    X = data[1:, 2:]
                    neuron_type = np.genfromtxt(input_i,delimiter=',',dtype=None)
                    y = neuron_type[:, 0] # (class)

                    neuron_name = np.genfromtxt(input_i,delimiter=',',dtype=None)
                    z = neuron_name[:, 1] # Neuron names
                    
                    features = np.genfromtxt(input_i,delimiter=',',dtype=None)
                    w = features[0, :] # features names
                    
                    #Replace missing values 'nan' by column mean
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    imp.fit(X)
                    Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
                    # Output replacement "Nan" values
                    Y=imp.transform(X)
                    #print i
                    #print Y.shape, y.shape,z.shape
                    #print Y.shape[1]
                    
                    ####################
                    for line in xrange(Y.shape[0]+1):
                        for colonne in xrange(Y.shape[1]+2):
                            if line == 0:
                                if colonne == 0:
                                    file.write("%s\t"%y[line])
                                else:
                                    if colonne == 1:
                                        file.write("%s\t"%z[line])
                                    else:
                                        file.write("%s\t"%w[colonne])
                            else:
                                if colonne == 0:
                                    file.write("%s\t"%y[line])
                                else:
                                    if colonne == 1:
                                        file.write("%s\t"%z[line])
                                    else:
                                        file.write("%f\t"%Y[line-1,colonne-2])
                        file.write("\n")
                    #########################
                else:
                    print "Only one morphology !!!"
                file.close()
    lvltrace.lvltrace("LVLSortie dans data_preprocessing_descriptive dans preproc_descriptive")
def eval_func(chromosome):
    t_par = chromosome.getInternalList()
    print("## Start with Individual : " + str(t_par))
        
    eta                 = t_par[0]
    max_depth           = t_par[1]
    subsample           = t_par[2]
    colsample_bytree    = t_par[3]
    n_estimators        = t_par[4]
    test_size           = t_par[5]
    imp_start           = t_par[6]
    num_of_feat_corr    = t_par[7]


    print("## Filling missing data")
    imp = Imputer(missing_values='NaN', strategy=imp_start, axis=0)
    imp.fit(train[features])
    train[features] = imp.transform(train[features])
    test[features] = imp.transform(test[features])

    curr_features = copy.deepcopy(features)

    print("## Creating Random features based on Correlation")
    output_cor = correlation_p[output_col_name].sort_values()

    most_neg_cor = list(output_cor.index[0:num_of_feat_corr].ravel())
    most_pos_cor = list(output_cor.index[(-2-num_of_feat_corr):-2].ravel())

    for f1, f2 in pairwise(most_neg_cor):
        train[f1 + "_" + f2] = train[f1] + train[f2]
        test[f1 + "_" + f2] = test[f1] + test[f2]
        curr_features += [f1 + "_" + f2]

    for f1, f2 in pairwise(most_pos_cor):
        train[f1 + "_" + f2] = train[f1] + train[f2]
        test[f1 + "_" + f2] = test[f1] + test[f2]
        curr_features += [f1 + "_" + f2]


    params = {"objective": "binary:logistic",
              "eta": eta,                                              
              "nthread":3,                                             
              "max_depth": max_depth,                                  
              "subsample": subsample,                                  
              "colsample_bytree": colsample_bytree,                    
              "eval_metric": "logloss",                                
              "n_estimators": n_estimators,                            
              "silent": 1                                              
              }                                                        
    num_boost_round = 10000
    test_size = test_size
    best_score = train_model(curr_features,params,num_boost_round,test_size)
    grid_search_pd.loc[len(grid_search_pd),grid_search_columns] = [eta,max_depth,subsample,colsample_bytree,n_estimators,test_size,imp_start,num_of_feat_corr,best_score]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    print("########################## Round Time Stamp ==== " + timestamp)

    grid_search_pd.to_csv(grid_search_file, index=False)

    return best_score
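
eval_func above iterates over pairwise(...) without showing that helper; a plausible definition that matches the way it is used (consecutive overlapping pairs) is the standard itertools recipe, given here as an assumption rather than the original code:

from itertools import tee

def pairwise(iterable):
    # pairwise([a, b, c]) -> (a, b), (b, c)
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)
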
Example #10
def trainSVM(x1,x2,kernel):
    # prepare data  
    x1 = map(list,x1)
    x2 = map(list,x2)
           
    X = x1+x2
    y1 = ones((shape(x1)[0],1))
    y2 = -1*ones((shape(x2)[0],1))
    Y = list(y1)+list(y2)
    Y = ravel(Y)
    #print 'Y'   
    if (kernel == 0):
        svm = LinearSVC()                               #Instantiating the SVM LINEAR classifier.
        params = {'C': [1, 10, 50, 100,200,300]}                    #Defining the params C which will be used by GridSearch. Param C does increase the weight of the 'fails'.
        grid = GridSearchCV(svm, params, cv=5)
    else:
        svm = SVC(probability=True)                                     #Instantiating the SVM RBF classifier.
        params = {'C': [50, 100,200,300]} #Defining the params C & Gamma which will be used by GridSearch. Param C does increase the weight of the 'fails'. Gamma does define the std of a gaussian.
        grid = GridSearchCV(svm, params, cv=5)
        
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X)
    Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)   
    trainData = imp.transform(X)

    grid.fit(trainData, Y)        #Run fit with all sets of parameters.
    model = grid.best_estimator_
    return model
Example #11
def preprocess_apply(data, missingvaluemethod, preprocessingmethods):
	#imputing missing values
	if missingvaluemethod!=Constants.MISSING_VALUE_METHOD_NONE:
		if missingvaluemethod==Constants.MISSING_VALUE_METHOD_MEAN:
			imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
		elif missingvaluemethod==Constants.MISSING_VALUE_METHOD_MEDIAN:
			imp = Imputer(missing_values='NaN', strategy='median', axis=0)
		elif missingvaluemethod==Constants.MISSING_VALUE_METHOD_MOST_FREQUENT:
			imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
		imp.fit(data)
		data=imp.transform(data)
	else:
		data=np.asarray(data)

	#scale data
	res=np.array([])
	for i in range(0,len(preprocessingmethods)):
		field=[[x[i]] for x in data]
		if preprocessingmethods[i]==Constants.SCALING_METHOD_NONE:
			pass
		elif preprocessingmethods[i]==Constants.SCALING_METHOD_STANDARDIZATION:
			scaler=preprocessing.StandardScaler().fit(field)
			field=scaler.transform(field)
		elif preprocessingmethods[i]==Constants.SCALING_METHOD_MINMAX:
			field=preprocessing.MinMaxScaler().fit_transform(field)
		elif preprocessingmethods[i]==Constants.SCALING_METHOD_CATEGORICAL:
			enc = preprocessing.OneHotEncoder()
			enc.fit(field)
			field=enc.transform(field).toarray()
			
		if i==0:
			res = field
		else:
			res = np.concatenate((res, field), axis=1)
	return res
Example #12
class FeaturePreProcesser():
    def __init__(self):
        pass

    def fit(self,X):
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(X)
        X = self.imputer.transform(X)

        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)

    def fit_transform(self, X):
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(X)
        X = self.imputer.transform(X)

        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)
        X = self.std_scaler.transform(X)

        return X
    def transform(self, X):
        X = self.imputer.transform(X)
        X = self.std_scaler.transform(X)
        return X
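
A minimal hypothetical usage sketch for FeaturePreProcesser (toy arrays; it assumes the legacy Imputer and StandardScaler imports the class relies on are available):

# Hypothetical usage: impute column means and standardize, fitting on the training
# matrix only and reusing the learned statistics on the test matrix.
import numpy as np

X_train = np.array([[1.0, np.nan], [2.0, 4.0], [3.0, 6.0]])
X_test = np.array([[np.nan, 5.0]])

prep = FeaturePreProcesser()
X_train_ready = prep.fit_transform(X_train)
X_test_ready = prep.transform(X_test)
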
Example #13
class ImputerWrapper:
  """ A simple wrapper around Imputer and supports using zero to fill in missing values.
      If entire column is nan it gets filled with 0 to avoid Imputer removing the column.
  """

  def __init__(self, missing_values='NaN', strategy='zero', axis=0, verbose=0, copy=False):
    self.strategy = strategy
    self.imputer = None
    if strategy != 'zero':
      self.imputer = Imputer(missing_values, strategy, axis, verbose, copy)

  def prepare(self, X):
    for j in range(X.shape[1]):
      all_nan = True
      for i in range(X.shape[0]):
        if not numpy.isnan(X[i][j]):
          all_nan = False
          break
      if all_nan:
        logging.info('column %d all nan, filling with 0' % j)
        for i in range(X.shape[0]):
          X[i][j] = 0.0

  def fit(self, X, y=None):
    if self.strategy == 'zero':
      return self
    self.prepare(X)
    self.imputer.fit(X, y)
    return self

  def fit_transform(self, X, y=None, **fit_params):
    if self.strategy == 'zero':
      for i in range(X.shape[0]):
        for j in range(X.shape[1]):
          if numpy.isnan(X[i][j]):
            X[i][j] = 0.0
      return X
    self.prepare(X)
    return self.imputer.fit_transform(X, y, **fit_params)

  def get_params(self, deep=True):
    if self.strategy == 'zero':
      return None
    return self.imputer.get_params(deep)

  def set_params(self, **params):
    if self.strategy == 'zero':
      return self
    self.imputer.set_params(**params)
    return self

  def transform(self, X):
    if self.strategy == 'zero':
      for i in range(X.shape[0]):
        for j in range(X.shape[1]):
          if numpy.isnan(X[i][j]):
            X[i][j] = 0.0
      return X
    return self.imputer.transform(X)
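
A hypothetical usage sketch for the 'zero' strategy described in the docstring above (toy array; NaNs are overwritten with 0.0 in place, without delegating to sklearn's Imputer):

import numpy

X = numpy.array([[numpy.nan, 1.0], [2.0, numpy.nan]])
imp = ImputerWrapper(strategy='zero')
print(imp.fit_transform(X))  # [[0. 1.] [2. 0.]]
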
def my_imputer(name,strat,value):
    if value == 0:
        data[name] = data[name].fillna(0)
    imp = Imputer(missing_values=value, strategy=strat, axis=0)
    x = data[name]
    x = x.reshape(-1,1)
    imp.fit(x)
    data[name] = imp.transform(x)
def ImputeAndGetFinalTrainTestData(train,test):
    X_train = train[:,:-1];
    y_train = train[:,-1];
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_train);
    X_train = imp.transform(X_train);
    X_test = imp.transform(test.as_matrix());
    return (X_train,y_train,X_test)
def imput_data(data):
	numSubsets = data.shape[-1]
	for i in range(numSubsets):
		imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
		imp.fit(data[:,:,i])
		data[:,:,i] = imp.transform(data[:,:,i])
		data[:,-1,i] = preprocessing.scale(data[:,-1,i])
	return data
def test_threshold_SGD():
    train = pandas.read_csv('data/train_v2.csv')
#    test = pandas.read_csv('data/test_v2.csv')
    train_loss = train.loss
       
#    train = train[['f527', 'f528', 'f274', 'f271', 'f2', 'f727', 'f337', 'f431', 'f757']]
#    test = test[['f527', 'f528', 'f274', 'f271', 'f2', 'f727', 'f337', 'f431', 'f757']]
    
#    train = train[['f527', 'f528', 'f274', 'f271']]
#    test = test[['f527', 'f528', 'f274', 'f271']]
    
    imp = Imputer()
    imp.fit(train)
    
    train = imp.transform(train)
#    test = imp.transform(test)
    
    train=pre.StandardScaler().fit_transform(train)
#    test=pre.StandardScaler().fit_transform(test)
    
    
    train_loss_array = train_loss.apply(lambda x: 1 if x>0 else 0).values
    
    clf = SGDClassifier(loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=6, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)
       
    
    clf.fit(train,train_loss_array)      
    train = clf.transform(train, threshold = "1.25*mean")
    print train.shape    
    
    kf = StratifiedKFold(train_loss.values, n_folds=10, indices=False)    

    threshold  = 0.999999999164       
    mean_mae = 0.
    for train_i, test_i in kf:
#        print len(train_i)
        X_train_split, X_test_split, y_train_split, y_test_split = train[train_i], train[test_i], train_loss_array[train_i], train_loss_array[test_i]
        y_test_split_initial = train_loss[test_i].values
        
        clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-4, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, n_jobs=6, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, rho=None, seed=None)
    
        clf.fit(X_train_split,y_train_split)      
        probas_ = clf.predict_proba(X_test_split)
        prediction_proba = probas_[:,1]
        
        predictionIndexes0 = np.where(prediction_proba <= threshold)[0]
        predictionIndexes1 = np.where(prediction_proba > threshold)[0]
        
        prediction = np.asarray([0.] * y_test_split_initial.shape[0])
        prediction[predictionIndexes1] = 10.
        prediction[predictionIndexes0] = 0.
        mae = mean_absolute_error(y_test_split_initial, prediction)
    
        mean_mae += mae
        
        print "Split MAE: " + str(mae)
    mean_mae = mean_mae / 10.
    print "Average MAE: " + str(mean_mae)
def typetransform ( data ):
    if dattype( data ) is unicode:
        le.fit( data )
        data = le.transform( data )
    else:
            imp = Imputer(missing_values='NaN', strategy='mean',axis = 1)
            imp.fit(data)
            data = imp.transform( data )
    return data
Example #19
def main():
   s = pd.Series([1,2,3,np.NaN,5,6,None])
   imp = Imputer(missing_values='NaN',
                strategy='mean', axis=0)
   imp.fit([1,2,3,4,5,6,7])

   x = pd.Series(imp.transform(s).tolist()[0])
   
   print(x)
Example #20
def MisingValuesFiller(X_train):
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X_full = np.array(X_train)
    imp.fit(X_full)
    #print X_full
    X_tests= [[np.nan, 2], [6, np.nan]]

    #print(imp.transform(X_tests))
    return imp
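
A hypothetical use of the returned imputer on unseen rows, mirroring the commented-out X_tests check above (it assumes numpy and the legacy Imputer import are available):

import numpy as np

X_train = [[1, 2], [np.nan, 3], [7, 6]]
imp = MisingValuesFiller(X_train)
print(imp.transform([[np.nan, 2], [6, np.nan]]))  # NaNs become the column means 4.0 and 11/3
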
Example #21
 def impute(self, sample):
     """
     Create a Sample imputation model according to the method specified in 
     the descriptor.
     :param sample: Sample instance to create the imputer for.
     :returns: Imputer instance.
     """
     imp = Imputer(missing_values='NaN', strategy=self.fix_method, axis=0)
     imp.fit(sample.attributes)
     return imp
Example #22
def imputator(features):
    """Fill in missing values with mean of the remaining samples

    Keyword arguments:
    features -- feature matrix

    """
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(features)
    return imp.transform(features)
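
A hypothetical call of imputator on a toy matrix (it assumes numpy and the legacy Imputer import used above):

import numpy as np

features = np.array([[1.0, np.nan], [3.0, 4.0]])
print(imputator(features))  # the NaN becomes its column mean: [[1. 4.] [3. 4.]]
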
Example #23
	def __init__(self, alldata, labels):
		data = deepcopy(alldata)
		print("00: (%d,%d)" %  (data.shape[0], data.shape[1]))
		imp = Imputer(missing_values='NaN', strategy='median', axis=0)
		imp.fit(data)
		self.data = deepcopy(imp.transform(data))
		print("0: (%d,%d)" %  (self.data.shape[0], self.data.shape[1]))
		le = LabelEncoder()
		le.fit(['f', 't'])
		self.labels = le.transform(labels)
def get_prob(clf, t1, t2, feat_table, feature_names):
    feat_values = apply_feat_fns(t1, t2, feat_table)
    feat_values = pd.Series(feat_values)
    feat_values = feat_values[feature_names]
    v = feat_values.values
    if mg._impute_flag == True:
        imp = Imputer(missing_values='NaN', strategy='median', axis=0)
        imp.fit(v)
        v = imp.transform(v)
    p = clf.predict_proba(v)
    return p[0]
def eliminate_features():

    use_sample = False
    
    if use_sample:
        train = pandas.read_csv('data/train_v2_sample_10k.csv')
#        test = pandas.read_csv('data/test_v2_sample_10k.csv')
        average_best_t = 0.148846575958
    else:
        train = pandas.read_csv('data/train_v2.csv')
#        test = pandas.read_csv('data/test_v2.csv')
         ### To use on full train set
        average_best_t = 0.164463473639
  
    
    train_loss = train.loss
    
    cols = set(train.columns)
    cols.remove('loss')
    cols = list(cols)
    train = train[cols] 
    
    column_names = train.columns.values.tolist()

    
#    train = train[['f527', 'f528', 'f274', 'f271', 'f2', 'f727', 'f337', 'f431', 'f757']]
    

    imp = Imputer()
    imp.fit(train)
    
    train = imp.transform(train)
#    test = imp.transform(test)
    
    train=pre.StandardScaler().fit_transform(train)
#    test=pre.StandardScaler().fit_transform(test)    
    
    train_loss_array_libsvm = train_loss.apply(lambda x: -1 if x>0 else 1).values
#    b = np.delete(train,0,1)
#    c = np.delete(train,1,1)
#    print b.shape[1]
#    print c.shape
    
    
    best_acc = 91.3437
    best_eliminated_features = []
    
    best_features = [18, 289, 290, 17, 402, 19, 560, 16, 287, 310, 403]
    selected_train = train[:,best_features]    

    os.chdir(liblinear_path)
    train_command = "./train -s 5 -c 0.01 -v 5 -e 0.001 /home/ema/Workspace/Projects/Kaggle/Loan_Default_Prediction/data/train_tmp.liblinear"
    datasets.dump_svmlight_file(selected_train, train_loss_array_libsvm, "/home/ema/Workspace/Projects/Kaggle/Loan_Default_Prediction/data/train_selected_f.liblinear", zero_based=False, comment=None, query_id=None)
    generation = 0    
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('-s', type=str, help="SST data file", required=True)
	parser.add_argument('-p', type=str, help="Precipitation data file")
	args = parser.parse_args()
	sstFile = args.s
	precFile = args.p 
	sstData = read_file(sstFile)
	sstData = np.transpose(parse_sst(sstData))[564:-74,:]
	imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
	imp.fit(sstData)
	pickle_data(imp.transform(sstData),sstFile)
Example #27
class RandomForestLearner(Orange.classification.SklFitter):
    def __init__(self, n_estimators=10, max_features="auto",
                 random_state=None, max_depth=3, max_leaf_nodes=5):
        self.params = vars()

    def fit(self, X, Y, W):
        self.imputer = Imputer()
        self.imputer.fit(X)
        X = replace_nan(X, self.imputer)
        rf_model = RandomForest(**self.params)
        rf_model.fit(X, Y.ravel())
        return RandomForestClassifier(rf_model, self.imputer)
Example #28
def impute(data_dict, keys):
    from sklearn.preprocessing import Imputer
    for key in keys:
        x = [data_dict[k][key] for k in data_dict.keys()]
        imp = Imputer(missing_values='NaN', strategy="mean", axis=1)
        imp.fit(x)
        x = imp.transform(x)[0]
        names = data_dict.keys()
        for j in range(0, len(data_dict.keys())):
            data_dict[names[j]][key] = x[j]

    return data_dict
Example #29
def start(train_X, train_Y):
	print("Starting imputation of  Training Set...\n")
	imputer = Imputer(missing_values="NaN", strategy='mean', axis=0);
	imputer.fit(train_X);

	train_X = imputer.transform(train_X);

	train_Y = [y if y <= 69.0 else 69.0 for y in train_Y]; #Capping the rain at 69mm/hr
	train_Y = np.array(train_Y);
	print("Imputation  Completed\n")

	parameters_to_try = generateParams();
	print("No of Paramters to test " + str(len(parameters_to_try)));

	print("Copying Parameters");
	results = [];

	#Contruct parameters as  list
	

	batch_size        = 2;
	for i in xrange(0, len(parameters_to_try), batch_size):

		models_to_try     = [ (copy.copy(train_X), copy.copy(train_Y), parameters_to_try[i] ) ];
		print("Releaseing a batch")
		if i+1 < len(parameters_to_try) :
			models_to_try.append( (copy.copy(train_X), copy.copy(train_Y), parameters_to_try[i+1] ) );

		#Create a Thread pool.
		pool              = Pool(2);
		results_t         = pool.map( train_model_wrapper, models_to_try );
		pool.close();
		pool.join();
		del models_to_try;
		results.append(results_t);



	best_params       = None;
	best_crps         = sys.float_info.max;
	for i in range(0, len(results)):
		if results[i][1] < best_crps:
			best_crps   = results[i][1];
			best_params = results[i][0];

	print("Best Params : " + str(best_params));
	print("Best CRPS :   " + str(best_crps));

	estimator               = RandomForestRegressor(**best_params)
	estimator.fit(train_X, train_Y);

	return imputer, estimator;
def buildArraysFromROOT(tree,allowedFeatures,cut,skipEvents,maxEvents,name):
    dataContainer = {}
    featureNames = []
    eventCounter = -1
    gROOT.Reset()
    
    # Get branch names
    for item in tree.GetListOfBranches():
        featureName = item.GetName()
        if featureName in allowedFeatures:
            featureNames.append(featureName)
            dataContainer[featureName] = []

    # Build the event list
    tcut = TCut(cut)
    tree.Draw(">>eventList",tcut)
    eventList = TEventList()
    eventList = gDirectory.Get("eventList")
    nSelectedEvents = eventList.GetN()

    # Event loop
    for i in range(0,nSelectedEvents):
        if (i < skipEvents):
            continue
        if (i % 100 == 0):
            sys.stdout.write("Reading %s: %d%%   \r" % (tree.GetName(),100*i/(maxEvents+skipEvents)) )
            sys.stdout.flush()
        if i >= (maxEvents+skipEvents):
            break
        selectedEvNum = eventList.GetEntry(i)
        tree.GetEntry(selectedEvNum)
        for feature in featureNames:
            dataContainer[feature].append(tree.__getattr__(feature))
    sys.stdout.write("\n")

    # Make the numpy arrays
    outputArray = np.array([])
    for feature in dataContainer.keys():
        column = dataContainer[feature]
        feature_vector = np.asarray(column)
        feature_vector = feature_vector.reshape(feature_vector.size,1)
        if outputArray.shape[0]==0:
            outputArray = feature_vector
        else:
            outputArray = np.concatenate((outputArray,feature_vector),axis=1)
    imp = Imputer(missing_values=-999, strategy='mean', axis=0)
    imp.fit(outputArray)
    outputArray = imp.transform(outputArray)
    print name
    print "Events: ",outputArray.shape[0]
    print "Features: ",outputArray.shape[1]
    return outputArray
def train_LSTM(path_to_store_weight_file=None, number_of_iteration=1):
    #HYPER-PARAMETERS
    input_size = 2436
    output_size = 2
    hidden_size = 100
    num_layers = 1
    batch_size = 151  #number of sequences I want to process in parallel
    num_epochs = 1  #train the data 1 time
    learning_rate = 0.1  #learning rate

    def flatten(list_):
        for el in list_:
            if hasattr(el, "__iter__") and not isinstance(el, basestring):
                for sub in flatten(el):
                    yield sub
            else:
                yield el

    output, test_position = generate_sample_test()
    # generate_sample_test()
    # output= pickle.load(open("test_data.p" , "rb" ))
    # test_position = pickle.load(open("gt_data.p","rb"))
    list = []
    for j in range(151):
        test_data = []
        for i in range(6):
            for obj in vars(output[j][i])["players"]:
                for obj in vars(output[j][i])["players"]:
                    test_data.append(obj.get_info())
            test_data.append(vars(output[j][i])["quarter"])
            test_data.append(vars(output[j][i])["game_clock"])
            test_data.append(vars(output[j][i])["ball"].get_info())
            test_data.append(vars(output[j][i])["shot_clock"])
        list_1_scaled = [x for x in flatten(test_data)]
        list.append(list_1_scaled)

    data = pd.DataFrame(list)
    data_1 = data.copy()
    data_1 = data_1.values
    from sklearn.preprocessing import Imputer
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    imputer = Imputer(strategy="mean")
    imputer.fit(data_1)
    data_1 = imputer.transform(data_1)
    data_1_scaled = scaler.fit_transform(data_1)
    test_data = torch.DoubleTensor(np.array(data_1_scaled))
    test_data = test_data.contiguous()
    test_position = torch.FloatTensor(np.array(test_position))
    test_position = test_position.contiguous()
    y = Variable(test_position)
    x = test_data.view(batch_size, input_size)
    y = y.view(batch_size, output_size)

    model = RNN(input_size, hidden_size, num_layers, len(output))
    print model

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss = torch.nn.MSELoss(size_average=False)

    #begin to train
    for epoch in range(number_of_iteration):
        # Pytorch accumulates gradients. We need to clear them out before each instance
        optimizer.zero_grad()
        model.hidden = model.init_hidden()
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        out = model(x.unsqueeze(1).float())

        # I HAVE THE PROBLEM HERE WITH THE TRAINING DATA

        err = loss(out, y)
        err.backward()
        optimizer.step()

    print('-------done LSTM')
    torch.save(model, path_to_store_weight_file)
#importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#importing dataset
dataset = pd.read_csv('Data.csv')
features = dataset.iloc[:, :-1].values
labels = dataset.iloc[:, 3].values

#taking care of missing data
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer = imputer.fit(features[:, 1:3])
features[:, 1:3] = imputer.transform(features[:, 1:3])

#encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder_features = LabelEncoder()
features[:, 0] = label_encoder_features.fit_transform(features[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
features = onehotencoder.fit_transform(features).toarray()
label_encoder_labels = LabelEncoder()
labels = label_encoder_labels.fit_transform(labels)

#splitting data set into the Training set and Test set
from sklearn.model_selection import train_test_split
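
The snippet stops after the import; a hedged completion of the split step it announces (test_size and random_state are assumptions, not values from the original):

# Hypothetical completion of the split announced by the comment above.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)
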
def imputting_values(data):
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(data)
    data = imp.transform(data)
    return data
                                          test_reader.get_number_of_examples())
del test_reader

print "==> elapsed time = %.3f" % (time.time() - prev_time)

print "train.shape ", train_X.shape, train_y.shape
print "val.shape", val_X.shape, val_y.shape
print "test.shape", test_X.shape, test_y.shape

print "==> imputing missing values"
imputer = Imputer(missing_values=np.nan,
                  strategy='mean',
                  axis=0,
                  verbose=0,
                  copy=True)
imputer.fit(train_X)
train_X = np.array(imputer.transform(train_X), dtype=np.float32)
val_X = np.array(imputer.transform(val_X), dtype=np.float32)
test_X = np.array(imputer.transform(test_X), dtype=np.float32)

print "==> normalizing data"
scaler = StandardScaler()
scaler.fit(train_X)
train_X = scaler.transform(train_X)
val_X = scaler.transform(val_X)
test_X = scaler.transform(test_X)

if not os.path.exists("cf_activations"):
    os.mkdir("cf_activations")

if not os.path.exists("cf_results"):
# Data Preprocessing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer.fit(X[:, 1:3])  # Upperbound is excluded, imputer object is fitted to X
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# Categorical --> Dummy Encoding(one hot coding)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
onehotencoder = OneHotEncoder(
    categorical_features=[0])  # Specify the array number
X = onehotencoder.fit_transform(X).toarray()

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
Example #36
"""
Spyder Editor

This is a temporary script file.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset = pd.read_csv("C:\\Users\\Admin\\Downloads\\Data_Preprocessing\\Data.csv")
X= dataset.iloc[:,:-1].values
y= dataset.iloc[:,len(dataset.columns)-1].values

from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:,1:3])
X[:,1:3]= imputer.transform(X[:,1:3])

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
oneHotEncoder = OneHotEncoder(categorical_features=[0])
labelEncoder_X= LabelEncoder()
X[:,0] = labelEncoder_X.fit_transform(X[:,0])
X=oneHotEncoder.fit_transform(X).toarray()
labelEncoder_y = LabelEncoder()
y = labelEncoder_y.fit_transform(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=0)

#Feature scaling
from sklearn.preprocessing import StandardScaler
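
The snippet stops after the import; a hedged completion of the feature-scaling step it announces, fitting the scaler on the training split only:

# Hypothetical completion of the feature-scaling step.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
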
dataset.ix[100, 'isFlaggedFraud'] = np.NaN
#check which row has NaN
dataset[dataset['isFlaggedFraud'].isnull()]
#impute the most frequent value in the NaN place

#take important columns split train test
dataset_pred = dataset[[
    'step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud'
]]
X = dataset_pred.loc[:, dataset_pred.columns != 'isFraud'].values
y = dataset_pred.iloc[:, 7].values

#impute the most frequent value
imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imputer = imputer.fit(X[:, 7:8])
X[:, 7:8] = imputer.transform(X[:, 7:8])

#see cor plot of numeric variables not working
#sns.set(style="ticks", color_codes=True)
#g = sns.pairplot(dataset_pred,hue="isFraud")

#use encoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
#labelencoder_X_2 = LabelEncoder()
#X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
#create dummy variable for type 5 type as there are catagory not working
onehotencoder = OneHotEncoder(categorical_features=[
    1
])  #apply in column type as there are more than 2 catagory
Example #38
# Using Linear Regression
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_1, y_1)

# Predicting the results
test_set = pd.read_csv('../dataset/test.csv')

test_set_1 = test_set.iloc[:, [0, 1, 2, 3, 4, 5, 6, 8, 10]]

X_test_1 = test_set_1.iloc[:, [1, 3, 4, 5, 6, 7, 8]].values

from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X_test_1[:, [2, 5]])
X_test_1[:, [2, 5]] = imputer.transform(X_test_1[:, [2, 5]])

X_test_1[:, 1] = labelencoder_1.transform(X_test_1[:, 1])
X_test_1[:, 6] = labelencoder_2.transform(X_test_1[:, 6])
X_test_1 = onehotencoder_1.transform(X_test_1).toarray()

X_test_1 = X_test_1[:, 1:]

X_test_1 = onehotencoder_2.transform(X_test_1).toarray()

X_test_1 = X_test_1[:, 1:]

X_test_1 = sc_X.transform(X_test_1)

y_pred = classifier.predict(X_test_1)
# In[12]:


# again: our original array
df.values


# In[13]:


# impute missing values via the column mean

from sklearn.preprocessing import Imputer

imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data



# ## Understanding the scikit-learn estimator API

# In[14]:




# In[15]:

Example #40
## seeing which explanatory feature rows got removed.  Looks like none.
response_series.index[~response_series.index.isin(explanatory_df.index)]

### now, let's separate the numeric explanatory data from the string data
string_features = explanatory_df.ix[:, explanatory_df.dtypes == 'object']
numeric_features = explanatory_df.ix[:, explanatory_df.dtypes != 'object']

# fill string NaNs with 'Nothing' (features that are all NaNs will show up as all 'Nothing' when we start binning or look for features with no variation)
string_features = string_features.fillna('Nothing')
# cleaning up string features
string_features = cleanup_data(string_features)
# binarizing string features
encoded_data = get_binary_values(string_features)
## imputing features
imputer_object = Imputer(missing_values='NaN', strategy='median', axis=0)
imputer_object.fit(numeric_features)
numeric_features = pandas.DataFrame(imputer_object.transform(numeric_features),
                                    columns=numeric_features.columns)

## pulling together numeric and encoded data.
explanatory_df = pandas.concat([numeric_features, encoded_data], axis=1)
explanatory_df.head()

#now, let's find features with no variance
no_variation = find_zero_var(explanatory_df)
explanatory_df.drop(no_variation['toDelete'], inplace=True)

# deleting perfect correlation
no_correlation = find_perfect_corr(explanatory_df)
explanatory_df.drop(no_correlation['toRemove'], 1, inplace=True)
Example #41
#plt.xlabel('numbers of features to keep')
#plt.ylabel('ratio of information remains')
#plt.annotate('Point(%d,%.2f)' % (10,variances[9]), xy=(10, variances[9]),
#            xytext=(+10, +0.7), fontsize=15,
#            arrowprops=dict(arrowstyle="->"))
#plt.show()
pca = PCA(n_components=15)
hy_compressed = pca.fit_transform(hy_dummies)
hy_compressed_df = pd.DataFrame(hy_compressed,
                                columns=list(
                                    ['hy' + str(x) for x in range(1, 16)]))
entbase = entbase.join(hy_compressed_df)

#ZCZB
imp_nan = Imputer(missing_values='NaN', strategy='median', axis=0)
imp_nan.fit(entbase.loc[:, ['ZCZB']])
entbase.loc[:, ['ZCZB']] = imp_nan.transform(entbase.loc[:, ['ZCZB']])
imp_0 = Imputer(missing_values=0, strategy='median', axis=0)
imp_0.fit(entbase.loc[:, ['ZCZB']])
entbase.loc[:, ['ZCZB']] = imp_0.transform(entbase.loc[:, ['ZCZB']])
scaler = StandardScaler()
scaler.fit(entbase['ZCZB'])
entbase['ZCZB'] = scaler.transform(entbase['ZCZB'])

#ETYPE
etype_compressed = pd.get_dummies(entbase['ETYPE'])
etype_compressed_df = pd.DataFrame(
    np.array(etype_compressed),
    columns=list(['etype' + str(x)
                  for x in sorted(entbase['ETYPE'].unique())]))
entbase = entbase.join(etype_compressed_df)
Example #42
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv('Data.csv')

X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values=np.nan)
imputer_out = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

print(X)

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

onehotencoder_X = OneHotEncoder(categorical_features=[0])
X = onehotencoder_X.fit_transform(X).toarray()
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
import pandas

base = pandas.read_csv('E:\\Udemy - Cursos\\MachineLearning\\Arquivos\\CreditData.csv')
base.loc[base.age < 0, 'age'] = 40.92
               
previsores = base.iloc[:, 1:4].values
classe = base.iloc[:, 4].values

from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(previsores[:, 1:4])
previsores[:, 1:4] = imputer.transform(previsores[:, 1:4])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.cross_validation import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(previsores, classe, test_size=0.25, random_state=0)

from sklearn.tree import DecisionTreeClassifier # importing the library
classificador = DecisionTreeClassifier(criterion = 'entropy', random_state = 0) # creating the classifier; random_state ensures the same portions of the dataset are always used
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

from sklearn.metrics import confusion_matrix, accuracy_score
precisao = accuracy_score(classe_teste, previsoes)
matriz = confusion_matrix(classe_teste, previsoes)
    boy=180
    def kosmak(self,b):
        return b+10
    
ali = people()
print(ali.boy)
print(ali.kosmak(90))


#sci-kit learn = sklearn
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN" , strategy = "mean", axis=0)

Yas = veriler.iloc[:,1:4].values
print(Yas)
imputer=imputer.fit(Yas[:,1:4])
Yas[:,1:4] = imputer.transform(Yas[:,1:4])
print("\n")
print(Yas)

ulke = veriler.iloc[:,0:1].values
print(ulke)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ulke[:,0] = le.fit_transform(ulke[:,0])
print(ulke)

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features="all")
ulke=ohe.fit_transform(ulke).toarray()
print(ulke)
# importing libraries
import pandas as pd

# importing dataset
# create matrix of independent variables(features)
data_X = pd.read_csv('secom.data.txt', sep=' ')
X = data_X.values

#create dependent variable vector
data_y = pd.read_csv('secom_labels.data.txt', sep=' ')
y = data_y.iloc[:, 0].values

# handling missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X)
X = imputer.transform(X)

# L1 based feature selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# SelectFromModel and Logistic Regression
logreg = LogisticRegression(random_state=0)
logreg.fit(X, y)
model = SelectFromModel(logreg)
X_logreg = model.fit(X, y)
#X_logreg.shape

# Get indices of selected features
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1]
X = X.iloc[:, 1:].values
y = dataset.iloc[:, 3].values

# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 0:2])
X[:, 0:2] = imputer.transform(X[:, 0:2])

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature Scaling (after the split, so the scaler is fit on the training set only)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)
"""PRATEEK"""
Example #47
from sklearn.preprocessing import Imputer
import numpy as np
from sklearn.decomposition import PCA

data = np.loadtxt("secom.data")
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(data)
data = imp.transform(data)

pca = PCA(n_components=6)

pca.fit(data)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)
print(np.shape(pca.components_))
Example #48
#importing packages

import statsmodels as stat
import seaborn as sbrn
import pandas as pds
import matplotlib.pyplot as mplt
import numpy as np

dtst = pds.read_csv("credit_immo.csv")
X = dtst.iloc[:, -9:-1].values
Y = dtst.iloc[:, -1].values

#data cleaning
from sklearn.preprocessing import Imputer
imptr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imptr.fit(X[:, 0:1])
X[:, 0:1] = imptr.transform(X[:, 0:1])
imptr.fit(X[:, 7:8])
X[:, 7:8] = imptr.transform(X[:, 7:8])

#categorical data

## Encoding the independent variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labEncre_X = LabelEncoder()
X[:, 2] = labEncre_X.fit_transform(X[:, 2])
X[:, 5] = labEncre_X.fit_transform(X[:, 5])
onehotEncr = OneHotEncoder(categorical_features=[2, 5])
X = onehotEncr.fit_transform(X).toarray()
Example #49
Churning_train = "train/churn_train.csv"
Churning_test = "test/churn_train.csv"
Churning_pred = "Validation/FinalPred.csv"


def load_churning_data(datapath):
    return pd.read_csv(datapath)


#load the dataset

Churning_train_dataset = load_churning_data(Churning_train)
Churning_train_dataset.head(100)
Churning_train_dataset.info()
#finding the data distribution in various states
Churning_train_dataset['st'].value_counts()
Churning_train_dataset.describe()
#find the variable distribution
Churning_train_dataset.hist(bins=50, figsize=(20, 15))
Churning_train_dataset.nummailmes = Churning_train_dataset.nummailmes.replace(
    0, np.NaN)
Churning_train_dataset.hist(bins=50, figsize=(20, 15))
#finding correlation matrix
corr_matrix = Churning_train_dataset.corr()
#creating imputer object
Churning_train_dataset.nummailmes.fillna(0)
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(Churning_train_dataset.iloc[:, 5].reshape(-1, 1))
Churning_train_dataset.iloc[:, 5] = imputer.transform(
    Churning_train_dataset.iloc[:, 5].reshape(-1, 1)).reshape(-1)
print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

features = []

imp = Imputer(missing_values=0, strategy="median", axis=0)
scl = StandardScaler(copy=True, with_mean=True, with_std=True)

print("augment features")
build_features(features, train)
build_features([], test)
print(features)

# fit the imputer and the scaler on the training data only, then reuse them on the test set
imp.fit(train[features])
scl.fit(train[features])

train[features] = imp.transform(train[features])
test[features] = imp.transform(test[features])

train[features] = scl.transform(train[features])
test[features] = scl.transform(test[features])

params = {"objective": "reg:linear",
          "eta": 0.3,
          "max_depth": 8,
          "subsample": 0.7,
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Imputer
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was:0.9931558441558442
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    GaussianNB()
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #52
def classifier(training_data,data_false,directory,priori_prob,shape_y,f_type):
    #function to compute the classifier for a label and then save it
    '''this function creates the initial 1st classifier.'''
    print '\n'
    
    #print 'Company is :',directory
    #for i in range(4):    
        #training_dataf= np.asarray(training_dataf)
    #print 'shape of false data',data_false.shape
    #print 'shape of training_dataf',training_dataf.shape
        
    training_dataf= np.asarray(data_false) 
    #training_dataf = training_dataf[:count1,:] #false training data so that both havesame size
    
    r1,c1 = training_data.shape
    r2,c2 = training_dataf.shape   
    label_true = []
    label_false = []
    #--creating labels for true and false data--#
    for m in range(r1):
        label_true.append(1)
    
    for n in range(r2):
        label_false.append(0)
        
    label_true = np.asarray(label_true)
    label_false = np.asarray(label_false) 
    #print 'b4imputer'
    #--removing nans by the medians--#
    imp = Imputer(strategy = 'median')
    imp.fit(training_data)
    training_data = imp.transform(training_data)
    
    imp.fit(training_dataf)
    training_dataf = imp.transform(training_dataf)
    #print 'after'
    #--final training data---#
    final_training = np.concatenate((training_data,training_dataf))
    temp3,temp4 = final_training.shape
    
    #----------creating labels for final_training------------#
    final_labels= np.concatenate((label_true,label_false))
    
    #print 'shape of ifnal ddata',final_labels.shape,final_training.shape
    #--generating testing and training data randomly--#
    #X_train, X_test, y_train, y_test = train_test_split(final_training, final_labels, train_size=0.80, random_state=42)        
    #X_train,y_train,f2 = split_new(final_training,final_labels,0.8) #split the training and testing data
    #--creating instance of random forest---#
    #print 'final training'
    X_train = final_training
    y_train = final_labels
    temp1,temp2 = X_train.shape
    #print temp1, temp2
    est = RandomForestClassifier(n_estimators =20,max_features='auto',max_depth=None,min_samples_split=2,min_samples_leaf=1,min_weight_fraction_leaf=0,max_leaf_nodes=None,n_jobs=1)
    #--fitting data and labels--#
    est.fit(X_train,y_train)      #make trees from trainning data and labels
    #x_train is the training data, and y_train are there labels.
    #print 'score',est.score(X_test,y_test)
    Location = classi+f_type
    #print 'Location',Location
    try :
        os.stat(Location)
    except :
        os.mkdir(Location)
              
               
    
    
    save_location = Location+'/'+directory+'_'+str(0)+'.pkl'
    #print 'shape',test_data.shape
    joblib.dump(est, save_location,compress=9)#only save the classifier not the data..
    #0 is sent to check the recusrion depth
    ret = re_train_prefilt(save_location,directory,X_train,y_train,shape_y,0,f_type)
Example #53
# Data normalization

# drop a column
x_train = x_train.drop('Cabin', 1)
x_test = x_test.drop('Cabin', 1)
x_train = x_train.drop('Ticket', 1)
x_test = x_test.drop('Ticket', 1)
x_train = x_train.drop('Name', 1)
x_test = x_test.drop('Name', 1)

print('Errors')
print(x_train.isnull().sum())
from sklearn.preprocessing.imputation import Imputer

imr = Imputer(missing_values='NaN', strategy='mean', axis=1)
imr = imr.fit(x_train)
imputed_data = imr.transform(x_train.values)
print('transformed')
print(imputed_data[:200])

imr = Imputer(missing_values='NaN', strategy='mean', axis=1)
imr = imr.fit(x_test)
imputed_data2 = imr.transform(x_test.values)
print('transformed')
print(imputed_data2[:200])

std = StandardScaler()
x_train_std = std.fit_transform(imputed_data)
x_test_std = std.fit_transform(imputed_data2)

print(x_train_std)
print(df.columns)
print(type(df))

# Replacing Y with 1 and N with 0
df = df.replace('y', 1)
df = df.replace('n', 0)
df = df.replace('republican', 1)
df = df.replace('democrat', 0)
#print('After replacement:', df)

df = df.replace('?', np.NaN)  # Replace missing value with NaN
print('NaN replaced data set: ', df)
#print(df.isnull().head())

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(df)  # df has shape (435, 17)
df_clean = imp.transform(df)
print('Clean Data Set:', df_clean)
print(df_clean.shape)
print(type(df_clean))

column_list = [
    'republican', 'handicapped-infants', 'water-project-cost-sharing',
    'adoptionof-the-budget-resolution', 'physician-fee-freeze',
    'el-salvador-aid', 'eligious-groups-inschools', 'anti-satellite-test-ban',
    'aid-to-nicaraguan-contras', 'mx-missile', 'immigration',
    'synfuelscorporation-cutback', 'education-spending',
    'superfund-right-to-sue', 'crime', 'dutyfree-exports',
    'export-administration-act-south-africa'
]
Example #55
if __name__ == '__main__':
    impt = Imputer()
    scal = MinMaxScaler()
    train = pd.read_csv("broadband_train.csv", sep=",", encoding='gbk')
    train['GENDER'] = train['GENDER'].replace('男', 0).replace('女', 1)
    train['AUTOPAY'] = train['AUTOPAY'].replace('否', 0).replace('是', 1)
    train = train.apply(lambda s: format_series(s, True))
    test = pd.read_csv("broadband_test.csv", sep=",", encoding='utf-8')
    test['GENDER'] = test['GENDER'].replace('男', 0).replace('女', 1)
    test['AUTOPAY'] = test['AUTOPAY'].replace('否', 0).replace('是', 1)
    test = test.apply(lambda s: format_series(s, False))
    train_X = train.iloc[:, 1:-1]
    train_Y = train.iloc[:, -1]
    test_X = test.iloc[:, 1:]

    impt.fit(train_X)
    train_X = impt.transform(train_X)
    test_X = impt.transform(test_X)

    scal.fit(train_X)
    train_X = scal.transform(train_X)
    test_X = scal.transform(test_X)

    model = svm.SVC()
    model.fit(train_X, train_Y)
    print(cross_val_score(model, train_X, train_Y))
    data = model.predict(test_X)
    print(data)
    #[ 0.83532934  0.84984985  0.81927711]
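# A hedged alternative sketch (not from the original): the same imputation,
# scaling and SVC chained in a scikit-learn Pipeline, starting again from the
# raw train_X, train_Y and test_X frames sliced from the CSVs above.
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('impute', Imputer()),        # fill missing values with column means
    ('scale', MinMaxScaler()),    # rescale each feature to [0, 1]
    ('clf', svm.SVC()),
])
pipe.fit(train_X, train_Y)
print(pipe.predict(test_X))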
#dataset.drop('dma', axis=1, inplace=True)

#print("column number")
#print(dataset.columns,len(dataset.columns),len(dataset.index))

dt = dataset.values
d = dt.astype(float)

#print("Checkinf for NaN and Inf")
#print( "np.nan=", np.where(np.isnan(d)))
#print( "is.inf=", np.where(np.isinf(d)))
#
print("********************************************")
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
d = imp.fit_transform(d)  # fit and transform in one step (the separate fit call was redundant)

##print("values after encoding", values)
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(d)
##print("scaled values",scaled)
# specify the number of lag hours
n_hours = 4
n_features = len(dataset.columns)
n_ahead = 1
st = n_hours*n_features
# frame as supervised learning
reframed = series_to_supervised(scaled, n_hours, n_ahead)
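# series_to_supervised is not defined in this snippet; below is a minimal sketch
# of the commonly used helper it appears to refer to (an assumption, not the
# author's exact implementation): shift the data to build lagged input columns
# and lead output columns, then drop the rows with NaNs introduced by shifting.
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    import pandas as pd
    df = pd.DataFrame(data)
    cols = []
    # lagged inputs: t-n_in, ..., t-1
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
    # current and future outputs: t, ..., t+n_out-1
    for i in range(0, n_out):
        cols.append(df.shift(-i))
    agg = pd.concat(cols, axis=1)
    if dropnan:
        agg.dropna(inplace=True)
    return agg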
#print("column number")
cols = list(dataset.columns.values)
cols.pop(cols.index('revenue_class')) 
dataset = dataset[cols+['revenue_class']]

dataset.drop(dataset.columns[-10], axis=1, inplace = True)
dataset.drop(dataset.columns[-24], axis=1, inplace = True)
 
# dependent and inpendent variables
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values


# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(X[:,:])
X[:,:] = imputer.transform(X[:,:])

from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X = sel.fit_transform(X)
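# Optional check (a sketch, not in the original): which column indices survived
# the variance threshold.
print('kept feature indices:', sel.get_support(indices=True))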

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
class DataSetBuilder:
    '''
    A class built to combine feature extraction for categorical, text and numeric data.
    Defaults to using multiprocessing.
    Use if __name__ == '__main__': before fit and transform calls on Windows.
    '''
    def __init__(self, params=None, col_dict=None):
        '''
        :param params: dictionary of parameter overrides, merged into the defaults below
        :param col_dict: dictionary with keys 'cat_cols', 'text_cols', 'imputer_cols', 'zero_imputer_cols';
            the values are the column names in a pandas data frame to preprocess
        '''
        from nltk.corpus import stopwords
        self.default_params = {'text_cols': {'max_features': 200, 'min_freq': 0.001, 'ngram_range': (1, 1),
                                             'min_len': 3, 'stop_words': set(stopwords.words('english'))},
                               'cat_cols': {'min_freq': 0.01},
                               'imputer_cols': {'strategy': 'median'}}
        if params is None:
            self.params = self.default_params
        else:
            self.update_params(params)
        self.par = True
        self.cat_encoder = None
        self.text_encoder = None
        self.col_dict = col_dict
        self.imputer = None
        self.feature_names = []

    def update_params(self, params):
        new_params = self.default_params
        for p in params.keys():
            temp_params = params[p]
            for pp in temp_params.keys():
                new_params[p][pp] = temp_params[pp]
        self.params = new_params

    def fit(self, data):
        '''

        :param data: pandas data frame containing all the columns listed in col_dict
        :return: None; the fitted encoders are stored on the instance
        '''
        col = 'text_cols'
        if col in self.col_dict.keys():
            print('fitting', col, ':', self.col_dict[col])
            self.text_encoder = TextFeature()
            self.text_encoder.par = self.par
            self.text_encoder.max_features = self.params[col]['max_features']
            self.text_encoder.min_freq = self.params[col]['min_freq']
            self.text_encoder.ngram_range = self.params[col]['ngram_range']
            self.text_encoder.min_len = self.params[col]['min_len']
            self.text_encoder.stop_words = self.params[col]['stop_words']
            self.text_encoder.fit(data[self.col_dict[col]])
            self.feature_names = self.feature_names + self.text_encoder.feature_names
        col = 'cat_cols'
        if col in self.col_dict.keys():
            print('fitting', col, ':', self.col_dict[col])
            self.cat_encoder = CatEncoder()
            self.cat_encoder.par = self.par
            self.cat_encoder.min_freq = self.params[col]['min_freq']
            self.cat_encoder.fit(data[self.col_dict[col]])
            self.feature_names = self.feature_names + self.cat_encoder.feature_names
        col = 'imputer_cols'
        if col in self.col_dict.keys():
            print('fitting', col, ':', self.col_dict[col])
            from sklearn.preprocessing import Imputer
            self.imputer = Imputer(strategy=self.params[col]['strategy'])
            self.imputer.fit(data[self.col_dict[col]])
            self.feature_names = self.feature_names + self.col_dict[col]
        col = 'zero_imputer_cols'
        if col in self.col_dict.keys():
            self.feature_names = self.feature_names + self.col_dict[col]

    def transform(self, data):
        '''

        :param data: a pandas data frame with all the columns listed in col_dict
        :return: scipy sparse matrix of features
        '''
        from scipy import sparse
        # only propagate the parallelism flag to encoders that were actually fitted
        if self.text_encoder is not None:
            self.text_encoder.par = self.par
        if self.cat_encoder is not None:
            self.cat_encoder.par = self.par
        output_list = []
        col = 'text_cols'
        if col in self.col_dict.keys():
            output_list.append(self.text_encoder.transform(data[self.col_dict[col]]))
            print('transforming', col, ':', self.col_dict[col])
        col = 'cat_cols'
        if col in self.col_dict.keys():
            print('transforming', col, ':', self.col_dict[col])
            output_list.append(self.cat_encoder.transform(data[self.col_dict[col]]))
        col = 'imputer_cols'
        if col in self.col_dict.keys():
            print('transforming', col, ':', self.col_dict[col])
            output_list.append(sparse.csr_matrix(self.imputer.transform(data[self.col_dict[col]])))
        col = 'zero_imputer_cols'
        if col in self.col_dict.keys():
            import pandas as pd
            print('transforming', col, ':', self.col_dict[col])
            output_list.append(sparse.csr_matrix(data[self.col_dict[col]].fillna(0)))
        output = sparse.hstack(output_list)
        return output
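# A hedged usage sketch for DataSetBuilder; the file name and column names below
# are placeholders, not taken from the original project.
if __name__ == '__main__':
    import pandas as pd
    df = pd.read_csv('items.csv')                       # hypothetical input file
    col_dict = {'text_cols': ['description'],
                'cat_cols': ['category'],
                'imputer_cols': ['price'],
                'zero_imputer_cols': ['discount']}
    builder = DataSetBuilder(col_dict=col_dict)
    builder.fit(df)
    features = builder.transform(df)                    # scipy sparse matrix
    print(features.shape, len(builder.feature_names))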
Exemple #59
0
def train(train_X, train_Y, feature_names):
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    enc = OneHotEncoder(categorical_features=np.array([65, 66]),
                        sparse=False,
                        n_values=80)

    imp.fit(train_X)
    train_X = imp.transform(train_X)
    """
    enc.fit(train_X);
    train_X = enc.transform(train_X);
    """

    print("No of features :  " + str(len(train_X[0])))

    train_Y = np.array(train_Y)

    dtrain = xgb.DMatrix(train_X, label=train_Y)

    parameters_to_try = generateParams()

    best_params = None
    overall_best_auc = 0
    overall_best_nrounds = 0

    for i in range(0, len(parameters_to_try)):
        param = parameters_to_try[i]
        num_round = 2000

        bst_cv = xgb.cv(param,
                        dtrain,
                        num_round,
                        nfold=20,
                        metrics={'auc'},
                        show_stdv=False,
                        seed=0)

        best_iteration = 0
        best_auc = 0
        for j in range(0, len(bst_cv)):
            eval_result = bst_cv[j].split("\t")
            val_auc = float(eval_result[1].split(":")[1])
            if val_auc > best_auc:
                best_auc = val_auc
                best_iteration = int(eval_result[0].replace("[", "").replace(
                    "]", ""))

        print("\n Best AUC : " + str(best_auc) + " for Params " + str(param) +
              " occurs at " + str(best_iteration))

        if best_auc > overall_best_auc:
            overall_best_auc = best_auc
            best_params = copy.copy(param)
            overall_best_nrounds = best_iteration

    print(
        "\n Training the model on the entire training set with the best params"
    )

    bst = xgb.train(best_params, dtrain, overall_best_nrounds)
    print("\n\n Overall Best AUC : " + str(overall_best_auc) + " for Params " +
          str(best_params) + " occurs at " + str(best_iteration))
    feature_imp = bst.get_fscore()

    print("Feature Importance ... ")

    for w in sorted(feature_imp, key=feature_imp.get, reverse=True):
        print(
            str(feature_names[int(w.replace("f", ""))]) + " : " +
            str(feature_imp[w]))

    return bst, imp, enc
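# Hedged usage sketch (not from the original): call train(), then reuse the
# fitted imputer on held-out data before predicting with the returned booster.
# train_X, train_Y, feature_names and test_X are assumed to exist.
bst, imp, enc = train(train_X, train_Y, feature_names)
dtest = xgb.DMatrix(imp.transform(test_X))
predictions = bst.predict(dtest)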
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Wine_Quality_Data.csv')
dataset.describe()
dataset.hist()
#import seaborn as sns

X = dataset.iloc[:, :-1].values

# Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, [0]])
X[:, [0]] = imputer.transform(X[:, [0]])

pd.set_option('precision', 3)
cor = dataset.corr(method='pearson')
# These two features were selected using the feature_importances_ attribute of a random forest classifier
X = dataset.iloc[:, [5, 6]].values

# Using the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')