def CrossValidationWithSampling():
    """Run the classifiers once per fold of a 5-fold split of the sampled training data.

    The training data returned by ``loadData`` is passed through
    ``RSObj.getRandomSample`` and ``runObj.randomShuffleData``, re-indexed,
    and then partitioned with ``KFold``.  ``runObj.runClassifier`` is called
    for every fold with that fold's train/validation partitions.  The test
    split returned by ``loadData`` is not used by this function.
    """
    runObj = run_util.run()

    ## Load data
    ((X_train, Y_train), (X_test)) = loadData(runObj)

    ## Sample train data
    RSObj = RS.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    (X_train, Y_train) = runObj.randomShuffleData(X_train, Y_train)
    # format() is applied to the string itself: calling it on the result of
    # print(...) only works when print is a Python 2 statement and raises
    # AttributeError when print is a function.
    print('\n Size of X_train : {0}'.format(X_train.shape))
    print('\n Size of Y_train : {0}'.format(Y_train.shape))
    # Renumber rows 0..n-1 so the fold indices below address row positions.
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Cross Validation
    # NOTE(review): KFold(n, n_folds=...) iterated directly is the legacy
    # sklearn.cross_validation signature -- confirm which KFold is imported.
    kf = KFold(X_train.shape[0], n_folds=5)
    for train_index, test_index in kf:
        # Positional selection; equivalent to the former .ix lookup because
        # the index was just reset, and .ix is deprecated/removed in pandas.
        X_train_CV, X_test_CV = X_train.iloc[train_index], X_train.iloc[test_index]
        Y_train_CV, Y_test_CV = Y_train.iloc[train_index], Y_train.iloc[test_index]
        print('\n Size of X_train_CV : {0}'.format(X_train_CV.shape))
        print('\n Size of Y_train_CV : {0}'.format(Y_train_CV.shape))
        print('\n Size of X_test_CV : {0}'.format(X_test_CV.shape))
        print('\n Size of Y_test_CV : {0}'.format(Y_test_CV.shape))

        ## Run classifiers
        runObj.runClassifier(X_train_CV, Y_train_CV, X_test_CV, Y_test_CV)
def RFEWithSampling():
    """Pick features with RFE on the sampled training data, then run the classifiers.

    Loads the data, samples and shuffles the training rows, asks
    ``AF.RFE_featureSelection`` which columns to keep, restricts the training
    and test frames to those columns, splits the training data through
    ``runObj.splitDataset`` and hands the resulting parts to
    ``runObj.runClassifier``.
    """
    runObj = run_util.run()

    ## Load data
    ((X_train, Y_train), (X_test)) = loadData(runObj)

    ## Sample train data
    RSObj = RS.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    (X_train, Y_train) = runObj.randomShuffleData(X_train, Y_train)
    # format() is applied to the string itself: calling it on the result of
    # print(...) only works when print is a Python 2 statement and raises
    # AttributeError when print is a function.
    print('\n Size of Sample X_train : {0}'.format(X_train.shape))
    print('\n Size of Sample Y_train : {0}'.format(Y_train.shape))
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## RFE Feature Selection
    print('\n RFE Feature Selection starts...')
    selected_columns = AF.RFE_featureSelection(X_train, Y_train)
    print('\n RFE Feature Selection ends...')
    X_train = X_train[selected_columns]
    # The test frame is narrowed from itself; it was previously overwritten
    # with a copy of the training frame.
    # NOTE(review): assumes X_test carries the same columns as X_train -- confirm.
    X_test = X_test[selected_columns]

    ## Split data into train and test parts
    ## (66% / 33% per the original note; the ratio is set inside runObj.splitDataset)
    ((X_train_S, Y_train_S), (X_test_S, Y_test_S)) = runObj.splitDataset(X_train, Y_train)
    print('\n Size of X_train_S : {0}'.format(X_train_S.shape))
    print('\n Size of Y_train_S : {0}'.format(Y_train_S.shape))
    print('\n Size of X_test_S : {0}'.format(X_test_S.shape))
    print('\n Size of Y_test_S : {0}'.format(Y_test_S.shape))

    ## Run classifiers
    runObj.runClassifier(X_train_S, Y_train_S, X_test_S, Y_test_S)
def costValuesBasedPrediction():
    """Run the classifiers using only the two cost-related feature columns.

    The training frame is reduced to ``MMRAcquisitionAuctionAveragePrice``
    and ``VehBCost``, then sampled, shuffled, re-indexed, split through
    ``runObj.splitDataset`` and passed to ``runObj.runClassifier``.  The test
    split returned by ``loadData`` is not used by this function.
    """
    runObj = run_util.run()

    ## Load data
    ((X_train, Y_train), (X_test)) = loadData(runObj)

    ## Select feature related to cost
    costFeatures = ['MMRAcquisitionAuctionAveragePrice', 'VehBCost']
    X_train = X_train[costFeatures]
    # format() is applied to the string itself: calling it on the result of
    # print(...) only works when print is a Python 2 statement and raises
    # AttributeError when print is a function.
    print('\n Size of X_train only cost features : {0}'.format(X_train.shape))

    ## Sample train data
    RSObj = RS.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    (X_train, Y_train) = runObj.randomShuffleData(X_train, Y_train)
    print('\n Size of X_train : {0}'.format(X_train.shape))
    print('\n Size of Y_train : {0}'.format(Y_train.shape))
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Split data into train and test parts
    ## (66% / 33% per the original note; the ratio is set inside runObj.splitDataset)
    ((X_train_S, Y_train_S), (X_test_S, Y_test_S)) = runObj.splitDataset(X_train, Y_train)
    print('\n Size of X_train_S : {0}'.format(X_train_S.shape))
    print('\n Size of Y_train_S : {0}'.format(Y_train_S.shape))
    print('\n Size of X_test_S : {0}'.format(X_test_S.shape))
    print('\n Size of Y_test_S : {0}'.format(Y_test_S.shape))

    ## Run classifiers
    runObj.runClassifier(X_train_S, Y_train_S, X_test_S, Y_test_S)
def RFE_featureSelection(X_train, Y_train):
    """Choose feature columns with cross-validated recursive feature elimination.

    The inputs are first passed through ``getRandomSample``; an ``RFECV``
    wrapping an L1-penalised logistic regression (5-fold CV, scored on
    recall, one feature removed per step) is then fitted on the sample.  The
    cross-validation score per feature count is plotted with matplotlib.

    Parameters:
        X_train: feature frame; must expose ``reset_index`` and ``columns``.
        Y_train: target values aligned with ``X_train``.

    Returns:
        The entries of the sampled frame's ``columns`` for which
        ``rfecv.support_`` is true.
    """
    ## Sampling
    RSObj = randomSampling.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Select classifier and parameters
    logistic = linear_model.LogisticRegression(
        C=10, class_weight=None, dual=False, fit_intercept=True,
        intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
        penalty='l1', random_state=None, solver='liblinear', tol=0.01,
        verbose=0, warm_start=False)

    ## Initialize RFE
    rfecv = RFECV(estimator=logistic, step=1, cv=5, scoring='recall')

    ## Fit data
    rfecv.fit(X_train, Y_train)

    ## Selected Features
    print("Optimal number of features : %d" % rfecv.n_features_)

    ## Plot cross-validation score against number of features kept
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    # Compute the selection once and reuse it for both the log line and the
    # return value.
    selected_columns = X_train.columns[list(rfecv.support_)]
    # format() is applied to the string itself: calling it on the result of
    # print(...) only works when print is a Python 2 statement and raises
    # AttributeError when print is a function.
    print('\n Selectd Columns : {0}'.format(selected_columns))
    return selected_columns
def RFE_featureSelection(X_train, Y_train):
    """Choose feature columns with cross-validated recursive feature elimination.

    The inputs are first passed through ``getRandomSample``; an ``RFECV``
    wrapping an L1-penalised logistic regression (5-fold CV, scored on
    recall, one feature removed per step) is then fitted on the sample.  The
    cross-validation score per feature count is plotted with matplotlib.

    Parameters:
        X_train: feature frame; must expose ``reset_index`` and ``columns``.
        Y_train: target values aligned with ``X_train``.

    Returns:
        The entries of the sampled frame's ``columns`` for which
        ``rfecv.support_`` is true.
    """
    ## Sampling
    RSObj = randomSampling.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Select classifier and parameters
    logistic = linear_model.LogisticRegression(C=10,
                                               class_weight=None,
                                               dual=False,
                                               fit_intercept=True,
                                               intercept_scaling=1,
                                               max_iter=100,
                                               multi_class='ovr',
                                               n_jobs=1,
                                               penalty='l1',
                                               random_state=None,
                                               solver='liblinear',
                                               tol=0.01,
                                               verbose=0,
                                               warm_start=False)

    ## Initialize RFE
    rfecv = RFECV(estimator=logistic, step=1, cv=5, scoring='recall')

    ## Fit data
    rfecv.fit(X_train, Y_train)

    ## Selected Features
    print("Optimal number of features : %d" % rfecv.n_features_)

    ## Plot cross-validation score against number of features kept
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    # Compute the selection once and reuse it for both the log line and the
    # return value.
    selected_columns = X_train.columns[list(rfecv.support_)]
    # format() is applied to the string itself: calling it on the result of
    # print(...) only works when print is a Python 2 statement and raises
    # AttributeError when print is a function.
    print('\n Selectd Columns : {0}'.format(selected_columns))
    return selected_columns
def Boosting_featureSelection(X_train, Y_train):
    """Rank features by the importances of an ExtraTreesClassifier forest.

    Despite the function name, the estimator fitted here is an
    ``ExtraTreesClassifier`` with 100 trees, not a boosted model.  The inputs
    are first passed through ``getRandomSample``.  The ranking is printed and
    drawn as a bar chart whose error bars are the standard deviation of the
    per-tree importances.

    Parameters:
        X_train: feature frame; must expose ``reset_index``, ``columns`` and ``shape``.
        Y_train: target values aligned with ``X_train``.

    Returns:
        dict mapping each column name to that column's importance.
    """
    ## Sampling
    RSObj = randomSampling.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=100)

    ## Fit Forest
    forest.fit(X_train, Y_train)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    # Column positions ordered from most to least important.
    indices = np.argsort(importances)[::-1]

    ## Print the feature ranking
    print("Feature ranking:")
    cols = list(X_train.columns)
    for rank, idx in enumerate(indices, 1):
        print("%d. feature %s (%f)" % (rank, cols[idx], importances[idx]))

    ## Plot the feature importances of the forest
    n_features = X_train.shape[1]
    ranked_cols = [cols[i] for i in indices]
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n_features),
            importances[indices],
            color="r",
            yerr=std[indices],
            align="center")
    plt.xticks(range(n_features), ranked_cols)
    plt.xlim([-1, n_features])
    plt.show()

    ## Generate dictionary of column importance
    # Both sequences are reordered by `indices`, so each name is paired with
    # its own score.  Zipping the ranked names with the unsorted importances
    # would attach scores to the wrong columns.
    dictionary = dict(zip(ranked_cols, importances[indices]))
    return dictionary
def Boosting_featureSelection(X_train, Y_train):
    """Rank features by the importances of an ExtraTreesClassifier forest.

    Despite the function name, the estimator fitted here is an
    ``ExtraTreesClassifier`` with 100 trees, not a boosted model.  The inputs
    are first passed through ``getRandomSample``.  The ranking is printed and
    drawn as a bar chart whose error bars are the standard deviation of the
    per-tree importances.

    Parameters:
        X_train: feature frame; must expose ``reset_index``, ``columns`` and ``shape``.
        Y_train: target values aligned with ``X_train``.

    Returns:
        dict mapping each column name to that column's importance.
    """
    ## Sampling
    RSObj = randomSampling.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=100)

    ## Fit Forest
    forest.fit(X_train, Y_train)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    # Column positions ordered from most to least important.
    indices = np.argsort(importances)[::-1]

    ## Print the feature ranking
    print("Feature ranking:")
    cols = list(X_train.columns)
    for rank, idx in enumerate(indices, 1):
        print("%d. feature %s (%f)" % (rank, cols[idx], importances[idx]))

    ## Plot the feature importances of the forest
    n_features = X_train.shape[1]
    ranked_cols = [cols[i] for i in indices]
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(n_features), ranked_cols)
    plt.xlim([-1, n_features])
    plt.show()

    ## Generate dictionary of column importance
    # Both sequences are reordered by `indices`, so each name is paired with
    # its own score.  Zipping the ranked names with the unsorted importances
    # would attach scores to the wrong columns.
    dictionary = dict(zip(ranked_cols, importances[indices]))
    return dictionary
def BoostingWithSampling():
    """Keep features whose forest importance exceeds 0.005, then run the classifiers.

    Loads the data, samples and shuffles the training rows, obtains a
    column -> importance mapping from ``AF.Boosting_featureSelection``, keeps
    the columns scoring above the threshold, restricts the training and test
    frames to them, splits the training data through ``runObj.splitDataset``
    and hands the resulting parts to ``runObj.runClassifier``.
    """
    runObj = run_util.run()

    ## Load data
    ((X_train, Y_train), (X_test)) = loadData(runObj)

    ## Sample train data
    RSObj = RS.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    (X_train, Y_train) = runObj.randomShuffleData(X_train, Y_train)
    # format() is applied to the string itself: calling it on the result of
    # print(...) only works when print is a Python 2 statement and raises
    # AttributeError when print is a function.
    print('\n Size of X_train : {0}'.format(X_train.shape))
    print('\n Size of Y_train : {0}'.format(Y_train.shape))
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Boosting Feature Selection
    print('\n Boosting Feature Selection starts...')
    selected_columns_dict = AF.Boosting_featureSelection(X_train, Y_train)
    print('\n Boosting Feature Selection ends...')
    # items() exists on both Python 2 and 3 dicts; iteritems() is Python 2 only.
    selected_columns = [col for col, imp in selected_columns_dict.items()
                        if imp > 0.005]
    print('\n Selected Cols : {0}'.format(len(selected_columns)))

    X_train = X_train[selected_columns]
    # The test frame is narrowed from itself; it was previously overwritten
    # with a copy of the training frame.
    # NOTE(review): assumes X_test carries the same columns as X_train -- confirm.
    X_test = X_test[selected_columns]

    ## Split data into train and test parts
    ## (66% / 33% per the original note; the ratio is set inside runObj.splitDataset)
    ((X_train_S, Y_train_S), (X_test_S, Y_test_S)) = runObj.splitDataset(X_train, Y_train)
    print('\n Size of X_train_S : {0}'.format(X_train_S.shape))
    print('\n Size of Y_train_S : {0}'.format(Y_train_S.shape))
    print('\n Size of X_test_S : {0}'.format(X_test_S.shape))
    print('\n Size of Y_test_S : {0}'.format(Y_test_S.shape))

    ## Run classifiers
    runObj.runClassifier(X_train_S, Y_train_S, X_test_S, Y_test_S)
def handpickWithSampling():
	"""Run the classifiers on the sampled training data with all loaded columns.

	Loads the data, samples and shuffles the training rows, re-indexes them,
	splits them through ``runObj.splitDataset`` and hands the resulting parts
	to ``runObj.runClassifier``.  No column selection happens in this
	function, and the test split returned by ``loadData`` is not used.
	"""
	# NOTE(review): this body is deliberately tab-indented, like the original,
	# so the lines that follow it in the file keep their current relationship
	# to this function.
	runObj = run_util.run()

	## Load data
	((X_train, Y_train), (X_test)) = loadData(runObj)

	## Sample train data
	RSObj = RS.randomSampling()
	(X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
	(X_train, Y_train) = runObj.randomShuffleData(X_train, Y_train)
	# format() is applied to the string itself: calling it on the result of
	# print(...) only works when print is a Python 2 statement and raises
	# AttributeError when print is a function.
	print('\n Size of X_train : {0}'.format(X_train.shape))
	print('\n Size of Y_train : {0}'.format(Y_train.shape))
	X_train.reset_index(drop=True, inplace=True)
	Y_train.reset_index(drop=True, inplace=True)

	## Split data into train and test parts
	## (66% / 33% per the original note; the ratio is set inside runObj.splitDataset)
	((X_train_S, Y_train_S), (X_test_S, Y_test_S)) = runObj.splitDataset(X_train, Y_train)
	print('\n Size of X_train_S : {0}'.format(X_train_S.shape))
	print('\n Size of Y_train_S : {0}'.format(Y_train_S.shape))
	print('\n Size of X_test_S : {0}'.format(X_test_S.shape))
	print('\n Size of Y_test_S : {0}'.format(Y_test_S.shape))

	## Run classifiers
	runObj.runClassifier(X_train_S, Y_train_S, X_test_S, Y_test_S)
        print('\n predictions : {0}').format(predictions.shape[0])
        for i in range(RefId.shape[0]):
            data.append([RefId[i],predictions[i]])
        print('\n Data : \n {0}').format(data)
        with open(fileName, 'wb') as fp:
            a = csv.writer(fp, delimiter=',')
            a.writerows(data)



if __name__=="__main__":
    # Script entry point: load the training file, sample it, then prepare
    # fold indices for a 10-fold cross validation on 10000 drawn rows.
    fileName='updatedTraining.csv'
    dataLoadObj=dataLoad()
    Traindataset=dataLoadObj.loadData(fileName,'train')
    ############ Random Sampling ###################
    RSObj=RS.randomSampling()
    Traindataset=RSObj.getRandomSample(Traindataset)
    # format() is applied to the string itself: calling it on the result of
    # print(...) only works when print is a Python 2 statement and raises
    # AttributeError when print is a function.
    print('\n Train : {0}'.format(Traindataset.shape))
    # The string literal below is a disabled code path (train/test run on the
    # separate test file); as a bare string expression it has no effect.
    '''
    fileName='updatetest.csv'
    testDataset=dataLoadObj.loadData(fileName,'test')
    testDataset=testDataset[0:-6]
    print('\n Test : {0}').format(testDataset.shape)
    dataLoadObj.runClassifiers(Traindataset,testDataset)

    '''
    print('\n Cross Validation : ')
    # Draw 10000 row positions uniformly from [0, n_rows); randint samples
    # with replacement, so rows may repeat.
    randindices = np.random.randint(0,Traindataset.shape[0],10000)
    # Two-axis indexing: presumably Traindataset is a 2-D numpy array here -- confirm.
    Traindataset=Traindataset[randindices,:]
    Kobj=Kfold.Kfold(No_of_Folds=10)
    kf=Kobj.getFoldIndices(Traindataset)