Esempio n. 1
0
    def test_all_methods(self):
        """Fit four classifiers on ``Lag2`` and tabulate each one's predictions.

        Rows of ``self.df`` with ``Year`` between 1990 and 2008 (inclusive)
        are used for fitting; rows with ``Year`` after 2008 are held out.
        A binomial GLM, LDA, QDA and a 1-nearest-neighbour classifier are
        fitted in turn, and the held-out predictions of each are passed to
        ``tp.output_table`` together with the true ``self.y_col`` values.
        """
        x_cols = ["Lag2"]
        formula = "Direction~Lag2"
        # ``DataFrame.ix`` was deprecated and later removed from pandas; a
        # boolean row mask selects exactly the same rows through ``.loc``.
        year = self.df["Year"]
        train_data = self.df.loc[(year >= 1990) & (year <= 2008), :]
        test_data = self.df.loc[year > 2008, :]

        # (d) logistic
        model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
        result = model.fit()
        probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
        # NOTE(review): this mapping treats the fitted probability as the
        # probability of "Down" -- confirm against how the formula encodes
        # the ``Direction`` response.
        pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
        tp.output_table(pred_values.values, test_data[self.y_col].values)

        train_X = train_data[x_cols].values
        train_y = train_data[self.y_col].values
        test_X = test_data[x_cols].values
        test_y = test_data[self.y_col].values

        # (e) LDA
        lda_res = LDA().fit(train_X, train_y)
        tp.output_table(lda_res.predict(test_X), test_y)

        # (f) QDA
        qda_res = QDA().fit(train_X, train_y)
        tp.output_table(qda_res.predict(test_X), test_y)

        # (g) KNN
        clf = neighbors.KNeighborsClassifier(1, weights="uniform")
        clf.fit(train_X, train_y)
        tp.output_table(clf.predict(test_X), test_y)

        # (h) logistic and LDA
        # (i) Is the purpose of the last question going through all methods
        #     with no direction?
Esempio n. 2
0
class SNPForecastingStrategy(Strategy):
	"""Strategy that turns QDA direction forecasts into trading signals.

	symbol - passed to ``create_lagged_series`` to build the lagged data.
	bars - object whose ``index`` becomes the index of the signals frame.
	"""

	def __init__(self,symbol,bars):
		self.symbol=symbol
		self.bars=bars
		self.create_periods()
		self.fit_model()

	def create_periods(self):
		"""Set the fixed training start, test start and end dates."""
		self.start_train=datetime.datetime(2001,1,10)
		self.start_test=datetime.datetime(2005,1,1)
		self.end_period=datetime.datetime(2005,12,31)

	def fit_model(self):
		"""Fit QDA on ``Lag1``/``Lag2`` rows dated before ``start_test``.

		Rows dated on or after ``start_test`` are kept in ``self.predictors``
		for later use by ``generate_signals``.
		"""
		snpret=create_lagged_series(self.symbol,self.start_train,self.end_period,lags=5)
		X=snpret[['Lag1','Lag2']]
		Y=snpret['Direction']
		X_train=X[X.index<self.start_test]
		Y_train=Y[Y.index<self.start_test]
		self.predictors=X[X.index>=self.start_test]
		self.model=QDA()
		self.model.fit(X_train,Y_train)

	def generate_signals(self):
		"""Return a DataFrame with ``signal`` and ``positions`` columns.

		``signal`` holds the model's predictions for ``self.predictors`` with
		the first five entries forced to 0.0; ``positions`` is the row-to-row
		difference of ``signal``.
		"""
		signals=pd.DataFrame(index=self.bars.index)
		signals['signal']=self.model.predict(self.predictors)
		# One positional write on the frame itself. The previous chained form
		# ``signals['signal'][0:5]=0.0`` may assign into a temporary copy and
		# then leaves ``signals`` unchanged.
		signals.iloc[0:5,signals.columns.get_loc('signal')]=0.0
		signals['positions']=signals['signal'].diff()
		return signals
Esempio n. 3
0
class RegularizedQDA:
  """
    QDA wrapper that blends each matrix in ``model.rotations`` after fitting.

    Three kinds of regularization can be combined:
    - ``avg_weight``: pull a class matrix toward the identity scaled by the
      mean of that matrix's own diagonal
    - ``pooled_weight``: pull a class matrix toward ``np.cov`` of all
      training samples
    - ``extra_variance``: add a constant multiple of the identity
  """
  def __init__(self, avg_weight = 0.1, pooled_weight = 0, extra_variance = 0):
    self.avg_weight, self.pooled_weight = avg_weight, pooled_weight
    self.extra_variance = extra_variance
    self.model = QDA()

  def fit(self, X, Y):
    """Fit the wrapped QDA, then overwrite ``model.rotations`` with blends."""
    self.model.fit(X, Y)
    identity = np.eye(X.shape[1])
    keep_weight = 1.0 - self.avg_weight - self.pooled_weight
    pooled_term = self.pooled_weight * np.cov(X.T)
    assert pooled_term.shape == identity.shape
    assert all(mat.shape == identity.shape for mat in self.model.rotations)

    blended = []
    for mat in self.model.rotations:
      mean_diag = np.mean(np.diag(mat))
      blended.append(
        keep_weight * mat
        + self.avg_weight * mean_diag * identity
        + pooled_term
        + self.extra_variance * identity)
    self.model.rotations = blended

  def predict(self, X):
    """Delegate prediction to the wrapped QDA model."""
    return self.model.predict(X)
Esempio n. 4
0
class SNPForecastingStrategy(Strategy):
    """Strategy that turns QDA direction forecasts into trading signals.

    Requires:
    symbol - A stock symbol on which to form a strategy on.
    bars - A DataFrame of bars for the above symbol."""

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Create training/test periods."""
        self.start_train = datetime.datetime(2001,1,10)
        self.start_test = datetime.datetime(2005,1,1)
        self.end_period = datetime.datetime(2005,12,31)

    def fit_model(self):
        """Fits a Quadratic Discriminant Analyser to the
        US stock market index (^GPSC in Yahoo)."""
        # Create a lagged series of the S&P500 US stock market index
        snpret = create_lagged_series(self.symbol, self.start_train,
                                      self.end_period, lags=5)

        # Use the prior two days of returns as
        # predictor values, with direction as the response
        X = snpret[["Lag1","Lag2"]]
        y = snpret["Direction"]

        # Rows dated before the test start form the training set
        X_train = X[X.index < self.start_test]
        y_train = y[y.index < self.start_test]

        # Rows dated on/after the test start are kept for forecasting
        self.predictors = X[X.index >= self.start_test]

        # Create and fit the Quadratic Discriminant Analysis model
        self.model = QDA()
        self.model.fit(X_train, y_train)

    def generate_signals(self):
        """Returns the DataFrame of symbols containing the signals
        to go long, short or hold (1, -1 or 0)."""
        signals = pd.DataFrame(index=self.bars.index)

        # Predict the subsequent period with the QDA model
        signals['signal'] = self.model.predict(self.predictors)

        # Zero the first five signal entries to eliminate NaN issues.
        # This is a single positional write on the frame itself: the former
        # chained form ``signals['signal'][0:5] = 0.0`` may assign into a
        # temporary copy and then leaves ``signals`` unchanged.
        signals.iloc[0:5, signals.columns.get_loc('signal')] = 0.0
        signals['positions'] = signals['signal'].diff()

        return signals
Esempio n. 5
0
def performSVMClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
	"""
	Fit a QDA classifier and return its score on the test split.

	Despite the function name, the estimator built here is ``QDA``.
	``parameters``, ``fout`` and ``savemodel`` are accepted but not used.
	"""
	model = QDA()
	model.fit(X_train, y_train)
	return model.score(X_test, y_test)
Esempio n. 6
0
def performQDAClass(X_train, y_train, X_test, y_test):
    """
    Fit a QDA classifier on the training split and return the value of
    its ``score`` method evaluated on the test split.
    """
    model = QDA()
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)
def qda(data,labels,n,v_type):
	"""Train QDA on a split of ``data`` and summarise its test performance.

	The split comes from ``split_data(data, labels, v_type)``; ``n`` is
	accepted but not used here.

	Returns the tuple ``(accuracy, report, y_pred, test_labels, test_data,
	clf, confusion_matrix, "QDA")``.
	"""
	train_data,train_labels,test_data,test_labels = split_data(data,labels,v_type)

	clf = QDA()
	clf.fit(train_data, train_labels)
	y_pred = clf.predict(test_data)

	# Fraction of test samples whose predicted label equals the true label.
	hits = sum(1 for guess, truth in zip(y_pred, test_labels) if guess == truth)
	pure_accuracy_rate = hits/float(len(test_labels))

	# classification_report expects (y_true, y_pred); passing them the other
	# way round swaps the precision and recall columns of the report.
	report = classification_report(test_labels, y_pred, target_names=rock_names)
	cm = confusion_matrix(test_labels, y_pred)
	return pure_accuracy_rate,report,y_pred,test_labels,test_data,clf,cm,"QDA"
Esempio n. 8
0
	def QDA(self,membership,group_labels=None,std=3,ellipses=True,dpi=300,fontsize=10,MD=False,
	        legend=False, numbered=False,of='pdf'):
		"""Classify ``self.data`` with QDA, then plot and store the result.

		``membership`` is cast to int and used as the class labels. The labels
		predicted for ``self.data`` are saved in ``self.fit``. When ``ellipses``
		is true ``self.getEllipses`` is called first; the remaining keyword
		arguments are forwarded unchanged to ``self.PlotXDA``.
		"""
		self.type = 'QDA'
		membership = membership.astype(int)
		self.fit = QDA().fit(self.data, membership).predict(self.data)
		if ellipses:
			self.getEllipses(std,membership)
		plot_options = dict(
			group_labels=group_labels,
			std=std,
			ellipses=ellipses,
			dpi=dpi,
			fontsize=fontsize,
			MD=MD,
			legend=legend,
			numbered=numbered,
			of=of,
		)
		self.PlotXDA(membership,**plot_options)
		self.Store()
Esempio n. 9
0
def qda_predict(train_data, test_data, train_cat, xx, yy):
    """Fit QDA on the training data and evaluate it on test points and a grid.

    Returns ``(predicted, contour)``: the labels predicted for ``test_data``
    and column 1 of ``predict_proba`` evaluated on the points obtained by
    pairing the flattened ``xx`` and ``yy``, reshaped to ``xx.shape``.
    """
    fitted = QDA().fit(train_data, train_cat)
    predicted = fitted.predict(test_data)

    grid_points = np.c_[xx.ravel(), yy.ravel()]
    second_column = fitted.predict_proba(grid_points)[:, 1]

    return predicted, second_column.reshape(xx.shape)
Esempio n. 10
0
def get_QDA(Xtrain, Xtest, Ytrain, Ytest):
    """Fit a QDA classifier, print its train/test scores and return it.

    The two values printed are ``qda.score`` on the training and on the test
    data, each multiplied by 100 and shown with two decimals.
    """
    qda = QDA()
    qda.fit(Xtrain, Ytrain)
    # Plain locals replace the former 4-slot ``np.empty`` scratch array, of
    # which only two entries were ever written.
    train_score = qda.score(Xtrain, Ytrain)
    test_score = qda.score(Xtest, Ytest)
    print('QDA, train: {0:.02f}% '.format(train_score*100))
    print('QDA, test: {0:.02f}% '.format(test_score*100))
    return qda
Esempio n. 11
0
def QuadraticDiscriminantAnalysis(x_train, y_train, x_cv, y_cv):
	"""
	Announce, fit and return a QDA classifier.

	``x_cv`` and ``y_cv`` are accepted but not used by this function.
	"""
	print("Quadratic Discriminant Analysis")
	fitted = QDA()
	fitted.fit(x_train, y_train)
	return fitted
def train_qda(X, y, priors=None, reg_param=0.0):
    """
    Build and fit a quadratic discriminant analysis model.

    ``priors`` and ``reg_param`` are forwarded to the ``QDA`` constructor.

    Returns:
    clf: whatever ``QDA(...).fit(X, y)`` returns
    """
    estimator = QDA(priors=priors, reg_param=reg_param)
    fitted = estimator.fit(X, y)
    print('Quadratic Discriminant Analysis completed!')
    return fitted
Esempio n. 13
0
File: qda.py Progetto: jbRegli/Higgs
def train_classifier(xTrain_s, yTrain_s, kwargs):
    """
    Train a QDA classifier on xTrain_s and yTrain_s and return it.

    When xTrain_s is exactly a ``list`` the work is delegated to
    ``train_classifier_8``; otherwise one ``QDA(**kwargs)`` is fitted.
    """
    if type(xTrain_s) == list:
        return train_classifier_8(xTrain_s, yTrain_s, kwargs)

    single_model = QDA(**kwargs)
    single_model.fit(xTrain_s, yTrain_s)
    return single_model
 def fit_model(self):
   """Fit a Quadratic Discriminant Analyser on the lagged return series.

   Rows dated before ``self.start_test`` train the model; rows dated on or
   after it are stored in ``self.predictors``."""
   lagged = create_lagged_series(
       self.symbol, self.start_train, self.end_period, lags=5)

   # Two lagged return columns are the predictors, direction the response
   features = lagged[["Lag1", "Lag2"]]
   direction = lagged["Direction"]
   cutoff = self.start_test

   # Training portion: everything strictly before the cutoff
   train_features = features[features.index < cutoff]
   train_direction = direction[direction.index < cutoff]

   # Forecasting portion: the cutoff date and everything after it
   self.predictors = features[features.index >= cutoff]

   # Build and fit the Quadratic Discriminant Analysis model
   self.model = QDA()
   self.model.fit(train_features, train_direction)
Esempio n. 15
0
def QDA_onFullDataset():
    """Fit QDA on the full UCI HAR training files and print the f-score.

    The printed value is the third item returned by ``common.checkAccuracy``
    for the test-set predictions and the activity labels 1..6.
    """
    # Training files
    train_features = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    train_labels = common.parseFile('../UCI HAR Dataset/train/y_train.txt')

    # Testing files
    test_features = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    test_labels = common.parseFile('../UCI HAR Dataset/test/y_test.txt')

    model = QDA()
    model.fit(train_features, train_labels.flatten())

    predictions = model.predict(test_features)
    _, _, fscore = common.checkAccuracy(predictions, test_labels, [1,2,3,4,5,6])
    print(fscore)
Esempio n. 16
0
    def train(self, classification_data, indices=None, settings_name=None, **kwargs):
        """Run the base-class ``train`` step, then fit a fresh QDA model.

        The feature columns are those listed in ``self.settings['indices']``
        and the targets are ``classification_data.are_hurr_actual``.
        Returns ``self``.
        """
        super(QDAClassifier, self).train(classification_data, indices, settings_name, **kwargs)
        columns = self.settings['indices']

        self.qda = QDA(**self.classifier_kwargs)

        features = classification_data.data[:, columns]
        targets = classification_data.are_hurr_actual
        self.qda.fit(features, targets)
        return self
Esempio n. 17
0
	def fit_model(self):
		"""Fit QDA on the lagged rows dated before ``self.start_test``.

		Rows dated on or after ``self.start_test`` are stored in
		``self.predictors``.
		"""
		lagged=create_lagged_series(self.symbol,self.start_train,self.end_period,lags=5)
		features=lagged[['Lag1','Lag2']]
		direction=lagged['Direction']
		cutoff=self.start_test
		train_features=features[features.index<cutoff]
		train_direction=direction[direction.index<cutoff]
		self.predictors=features[features.index>=cutoff]
		self.model=QDA()
		self.model.fit(train_features,train_direction)
def runQDA(fileNamaParam, trainizingSizeParam):
  """Split the loaded data, fit QDA on one part and evaluate on the other.

  ``trainizingSizeParam`` is the training fraction; its complement is passed
  as ``test_size`` to ``train_test_split``. Items 0 and 1 of the value
  returned by ``IO_.giveTestAndTrainingData`` are the two arrays being split.
  The true and predicted values of the held-out part go to ``evalClassifier``.
  """
  held_out_fraction = 1.0 - trainizingSizeParam
  loaded = IO_.giveTestAndTrainingData(fileNamaParam)
  first_array = loaded[0]
  second_array = loaded[1]

  # Fixed random_state keeps the split reproducible between runs.
  split = cross_validation.train_test_split(
      first_array, second_array, test_size=held_out_fraction, random_state=0)
  featureSpace_train, featureSpace_test, vScore_train, vScore_test = split

  qda_model = QDA()
  qda_model.fit(featureSpace_train, vScore_train)
  predicted_scores = qda_model.predict(featureSpace_test)

  evalClassifier(vScore_test, predicted_scores)
Esempio n. 19
0
    def create_symbol_forecast_model(self):
        """Fit and return a QDA model of direction on two lagged returns.

        The lagged series is built for ``self.symbol_list[0]`` between
        ``self.model_start_date`` and ``self.model_end_date``. Only rows
        dated before ``self.model_start_test_date`` are used for fitting.
        """
        # Create a lagged series of the S&P500 US stock market index
        snpret = create_lagged_series(self.symbol_list[0], self.model_start_date, self.model_end_date, lags=5)

        # Use the prior two days of returns as predictor
        # values, with direction as the response
        X = snpret[["Lag1", "Lag2"]]
        y = snpret["Direction"]

        # Only the training portion is needed here; the test portion that
        # used to be sliced out as well was never read.
        start_test = self.model_start_test_date
        X_train = X[X.index < start_test]
        y_train = y[y.index < start_test]

        model = QDA()
        model.fit(X_train, y_train)
        return model
Esempio n. 20
0
def QDAResult3D():
    """Train QDA on a random subset and return ``(sensitivity, specificity)``.

    ``randTestData`` supplies, for the normal and the cancer group, the
    number of training samples and a training mask. The samples not selected
    for training are used for testing: specificity is the fraction of
    held-out normal samples predicted as 0, sensitivity the fraction of
    held-out cancer samples predicted as 1.
    """
    norTrainNum, nor_isTraining = randTestData(t_data_perc, norDataNum)
    cnTrainNum, cn_isTraining = randTestData(t_data_perc, cnDataNum)
    isTraining = np.hstack((nor_isTraining, cn_isTraining))

    # Training QDA classifier
    trained_clf = QDA().fit(train_data[isTraining], labels[isTraining])

    # Using the remaining data for testing. ``== False`` is an elementwise
    # comparison on the mask array, not a scalar test.
    normal_pred = trained_clf.predict(normal_pt[nor_isTraining == False])
    trueneg_n = (normal_pred == 0).sum()
    # A float divisor forces true division; integer/integer floors to 0 or 1
    # under Python 2 without ``from __future__ import division``.
    specificity = trueneg_n / float(int(norDataNum - norTrainNum))

    cancer_pred = trained_clf.predict(cancer_pt[cn_isTraining == False])
    truepos_n = (cancer_pred == 1).sum()
    sensitivity = truepos_n / float(int(cnDataNum - cnTrainNum))

    return sensitivity, specificity
def qda(input_file,Output,test_size):
    """Train/test QDA on a CSV file and write metrics plus a confusion plot.

    Column 0 of the loaded data is the target and the remaining loaded
    columns are the features. The data is split with ``train_test_split``
    using ``test_size``. Metrics are printed and written to
    ``Output + "QDA_metrics_test.txt"`` followed by one line per test sample;
    a confusion-matrix image is produced through ``plot_confusion_matrix``.

    An ``AttributeError`` raised anywhere in that process is swallowed; a
    note is written to disk only when ``configuration.normalization`` equals
    ``'normalize'``.
    """
    lvltrace.lvltrace("LVLEntree dans qda split_test")
    try:
        ncol=tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:,1:]
        y = data[:,0]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        print("%s %s" % (X_train.shape, X_test.shape))
        model = QDA()
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)

        # Each metric is computed once, right before it is first reported,
        # and reused for the file output below.
        print("Quadratic Discriminant Analysis Accuracy ")
        accuracy = metrics.accuracy_score(y_test, y_pred)
        print("classification accuracy: %s" % (accuracy,))
        precision = metrics.precision_score(y_test, y_pred)
        print("precision: %s" % (precision,))
        recall = metrics.recall_score(y_test, y_pred)
        print("recall: %s" % (recall,))
        f1 = metrics.f1_score(y_test, y_pred)
        print("f1 score: %s" % (f1,))

        results = Output+"QDA_metrics_test.txt"
        # ``with`` closes the report even if a write fails part-way.
        with open(results, "w") as report:
            report.write("Quadratic Discriminant Analaysis estimator accuracy\n")
            report.write("Classification Accuracy Score: %f\n"%accuracy)
            report.write("Precision Score: %f\n"%precision)
            report.write("Recall Score: %f\n"%recall)
            report.write("F1 Score: %f\n"%f1)
            report.write("\n")
            report.write("True Value, Predicted Value, Iteration\n")
            for iteration, (truth, guess) in enumerate(zip(y_test, y_pred), 1):
                report.write("%f,%f,%i\n"%(truth,guess,iteration))
        title = "QDA %f"%test_size
        save = Output + "QDA_confusion_matrix"+"_%s.png"%test_size
        plot_confusion_matrix(y_test, y_pred,title,save)
    except (AttributeError):
        if configuration.normalization == 'normalize':
            # NOTE(review): the file name mentions Multinomial NB although
            # this routine runs QDA -- kept as is because other tooling may
            # look for this exact name; confirm before renaming.
            results = Output+"Multinomial_NB_metrics_test.txt"
            with open(results, "w") as report:
                report.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n")
    lvltrace.lvltrace("LVLSortie dans qda split_test")
Esempio n. 22
0
 def create_symbol_forecast_model(self):
     """Fit and return a QDA model of direction on two lagged returns.

     Only rows of the lagged series dated before
     ``self.model_start_test_date`` are used for fitting.
     """
     # Create a lagged series of the market index
     snpret = create_lagged_series( self.symbol_list[0],
         self.model_start_date, self.model_end_date, lags = 5
         )

     # Use the prior two days of returns as predictor values with direction
     # as the response.
     X = snpret[['Lag1','Lag2']]
     y = snpret["Direction"]

     # Only the training portion is needed here; the test portion that used
     # to be sliced out as well was never read.
     start_test = self.model_start_test_date
     X_train = X[X.index < start_test]
     y_train = y[y.index < start_test]

     # Model to use is Quadratic Discriminant Analysis
     model = QDA()
     model.fit(X_train, y_train)
     return model
Esempio n. 23
0
def performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Quadratic Discriminant Analysis binary Classification

    Entries of ``X_train`` / ``X_test`` whose absolute value is below 0.0001
    are replaced by 0.0001 before fitting. When ``savemodel`` equals True the
    fitted model is pickled to ``'<fout>-<current datetime>.pickle'``.
    ``parameters`` is accepted but not used.

    Returns the value of ``clf.score(X_test, y_test)``.
    """
    tiny = 0.0001
    # Vectorised replacement. The former per-element helper returned None and
    # was handed whole columns by ``apply``, so it could not do its job.
    # ``mask`` leaves NaN entries untouched because NaN < tiny is False.
    X_train = X_train.mask(X_train.abs() < tiny, tiny)
    X_test = X_test.mask(X_test.abs() < tiny, tiny)

    clf = QDA()
    clf.fit(X_train, y_train)

    if savemodel == True:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    return clf.score(X_test, y_test)
Esempio n. 24
0
File: ch4.py Progetto: syting/esl
def table_4_1():
    """Reproduce table 4.1 in ESLii: training and test error rates for
    classifying vowels with several classification techniques.

    The sklearn implementation of logistic regression uses OvA instead of a
    true multinomial, which likely accounts for the worse results.
    """
    vowels_train = eslii.read_vowel_data()
    train_X = vowels_train[vowels_train.columns[1:]]
    train_y = vowels_train['y']
    vowels_test = eslii.read_vowel_data(train=False)
    test_X = vowels_test[vowels_test.columns[1:]]
    test_y = vowels_test['y']

    def error_rates(fitted):
        # (training error, test error), each computed as 1 - score
        return (1 - fitted.score(train_X, train_y),
                1 - fitted.score(test_X, test_y))

    lda = LDA().fit(train_X, train_y)
    print("Linear discriminant analysis:  {:.2f} {:.2f}".format(*error_rates(lda)))
    qda = QDA().fit(train_X, train_y)
    print("Quadratic discriminant analysis:  {:.2f} {:.2f}".format(*error_rates(qda)))
    lr = LogisticRegression(C=1e30).fit(train_X, train_y)
    print("Logistic regression:  {:.2f} {:.2f}".format(*error_rates(lr)))
Esempio n. 25
0
class QDAClassifier(Classifier):
    '''Classifier backed by a Quadratic Discriminant Analysis model.'''

    def __init__(self):
        super(QDAClassifier, self).__init__()
        self.fig = 20
        self.is_trainable = True
        self.is_trained = False

    def train(self, classification_data, indices=None, settings_name=None, **kwargs):
        '''Run the base-class training step, then fit a fresh QDA model.

        Feature columns are taken from ``self.settings['indices']`` and the
        targets from ``classification_data.are_hurr_actual``. Returns self.
        '''
        super(QDAClassifier, self).train(classification_data, indices, settings_name, **kwargs)
        columns = self.settings['indices']

        self.qda = QDA(**self.classifier_kwargs)

        features = classification_data.data[:, columns]
        targets = classification_data.are_hurr_actual
        self.qda.fit(features, targets)
        return self

    def classify(self, classification_data):
        '''Predict with the fitted model; store and return the predictions.'''
        super(QDAClassifier, self).classify(classification_data)
        columns = self.settings['indices']

        features = classification_data.data[:, columns]
        self.are_hurr_pred = self.qda.predict(features)
        return self.are_hurr_pred
Esempio n. 26
0
def QDA_onNonDynamicData():
    """Fit and evaluate QDA separately on two activity subsets of UCI HAR.

    The subset with labels 4, 5, 6 (non-dynamic activities) is handled first
    and the subset with labels 1, 2, 3 (dynamic activities) second. For each,
    a QDA model is fitted on the training subset, the test predictions are
    passed to ``common.checkAccuracy`` and ``common.createConfusionMatrix``,
    and the f-score returned by ``checkAccuracy`` is printed.
    """
    def fit_and_report(X_fit, y_fit, X_eval, y_eval, activity_ids):
        # Fit on one subset and report on the matching test subset.
        model = QDA()
        model.fit(X_fit, y_fit.flatten())
        _, _, fscore = common.checkAccuracy(
            model.predict(X_eval), y_eval, list(activity_ids))
        common.createConfusionMatrix(
            model.predict(X_eval).flatten(), y_eval.flatten(), list(activity_ids))
        print(fscore)

    # Full training files
    XFull = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    YFull = common.parseFile('../UCI HAR Dataset/train/y_train.txt')

    # Full testing files
    XFullTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    YFullTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt')

    # Non-dynamic activities: training subset, then testing subset
    static_ids = (4, 5, 6)
    X_NonDynamic, Y_NonDynamic = common.getDataSubset(
        XFull, YFull.flatten(), list(static_ids))
    X_NonDynamicTest, Y_NonDynamicTest = common.getDataSubset(
        XFullTest, YFullTest.flatten(), list(static_ids))
    fit_and_report(X_NonDynamic, Y_NonDynamic,
                   X_NonDynamicTest, Y_NonDynamicTest, static_ids)

    # Dynamic activities: training subset, then testing subset
    moving_ids = (1, 2, 3)
    X_Dynamic, Y_Dynamic = common.getDataSubset(
        XFull, YFull.flatten(), list(moving_ids))
    X_DynamicTest, Y_DynamicTest = common.getDataSubset(
        XFullTest, YFullTest.flatten(), list(moving_ids))
    print("%s %s" % (len(X_DynamicTest), len(Y_DynamicTest)))
    fit_and_report(X_Dynamic, Y_Dynamic,
                   X_DynamicTest, Y_DynamicTest, moving_ids)
Esempio n. 27
0
def checkeachClassfier(train_x, train_y, test_x, test_y):
    """Fit a fixed list of classifiers, print metrics and write submissions.

    For every classifier the four values returned by ``get_Accs`` are
    printed for the training and the test predictions, and a file
    ``submission_<title>.csv`` is written with one ``ID,target`` line per
    test sample (the ID is the integer value of the first entry of the
    sample). A classifier that raises is reported as ``<title>: error`` and
    the loop continues with the next one.
    """
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(class_weight='auto'),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        DecisionTreeClassifier(class_weight='auto'),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        RandomForestClassifier(class_weight='auto'),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]

    classtitle = [
        "KNeighborsClassifier", "SVC", "SVC weighted", "SVC(gamma=2, C=1)",
        "DecisionTreeClassifier", "DecisionTreeClassifier weighted",
        "RandomForestClassifier", "RandomForestClassifier weighted",
        "AdaBoostClassifier", "GaussianNB", "LDA", "QDA"
    ]

    for ctitle, clf in zip(classtitle, classifiers):
        try:
            clf.fit(train_x, train_y)
            train_pdt = clf.predict(train_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
            print(ctitle + ":")
            print("MCC, Acc_p , Acc_n, Acc_all(train): ")
            print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n),
                                   str(Acc_all)))
            test_pdt = clf.predict(test_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
            print("MCC, Acc_p , Acc_n, Acc_all(test): ")
            print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n),
                                   str(Acc_all)))
            fn = "submission_%s.csv" % ctitle
            # ``with`` closes the file even when a row fails to format.
            with open(fn, 'w') as fout:
                fout.write("ID,target\n")
                for index, target in enumerate(test_pdt):
                    fout.write("%s,%s\n" %
                               (str(int(test_x[index][0])), str(target)))
        except Exception:
            # Best effort per classifier; KeyboardInterrupt and SystemExit
            # are no longer swallowed as they were by the bare ``except``.
            print(ctitle + ": error")
        print("")
Esempio n. 28
0
    def fit_model(self):
        """Fit a Quadratic Discriminant Analyser on the lagged return series
        built for ``self.symbol``.

        Rows dated before ``self.start_test`` train the model; rows dated on
        or after it are stored in ``self.predictors``."""
        lagged = create_lagged_series(
            self.symbol, self.start_train, self.end_period, lags=5)

        # Two lagged return columns are the predictors, direction the response
        features = lagged[["Lag1","Lag2"]]
        direction = lagged["Direction"]
        cutoff = self.start_test

        # Training portion: everything strictly before the cutoff
        train_features = features[features.index < cutoff]
        train_direction = direction[direction.index < cutoff]

        # Forecasting portion: the cutoff date and everything after it
        self.predictors = features[features.index >= cutoff]

        # Build and fit the Quadratic Discriminant Analysis model
        self.model = QDA()
        self.model.fit(train_features, train_direction)
def get_QDA(Xtrain, Xtest, Ytrain, Ytest):
    """Fit a QDA classifier, print its train/test scores and return it.

    The two values printed are ``qda.score`` on the training and on the test
    data, each multiplied by 100 and shown with two decimals.
    """
    qda = QDA()
    qda.fit(Xtrain, Ytrain)
    # Plain locals replace the former 4-slot ``np.empty`` scratch array, of
    # which only two entries were ever written.
    train_score = qda.score(Xtrain, Ytrain)
    test_score = qda.score(Xtest, Ytest)
    print('QDA, train: {0:.02f}% '.format(train_score * 100))
    print('QDA, test: {0:.02f}% '.format(test_score * 100))
    return qda
Esempio n. 30
0
def get_LDA_performance(test_df, X_std, y):
    """Plot train/test error of LDA, QDA and KNN on LDA-reduced features.

    For every ``d`` from 1 to 10 the training data ``X_std`` is reduced with
    ``LDA(n_components=d)``; LDA, QDA and a 10-nearest-neighbour classifier
    are fitted on the reduced data and their error (``1 - score``) on the
    training and test data is recorded. One figure per classifier is shown.

    Test features are columns ``'x.1'`` to ``'x.10'`` of ``test_df`` and test
    labels are its ``'y'`` column.
    """
    # ``.loc`` replaces the removed ``DataFrame.ix``; label slices are
    # inclusive in both.
    X_test = test_df.loc[:, 'x.1':'x.10'].values
    # NOTE(review): the test features are standardized with a scaler fitted
    # on the test data itself, not with the statistics behind ``X_std`` --
    # confirm this is intended.
    X_std_test = StandardScaler().fit_transform(X_test)
    y_test = test_df.loc[:, 'y'].values

    dims = list(range(1, 11))

    lda_scores_training, lda_scores_test = [], []
    qda_scores_training, qda_scores_test = [], []
    knn_scores_training, knn_scores_test = [], []

    for d in dims:
        reducer = LDA(n_components=d)
        Xred_training = reducer.fit_transform(X_std, y)
        Xred_test = reducer.transform(X_std_test)

        runs = (
            (LDA(), lda_scores_training, lda_scores_test),
            (QDA(), qda_scores_training, qda_scores_test),
            (KNeighborsClassifier(n_neighbors=10),
             knn_scores_training, knn_scores_test),
        )
        for model, train_errors, test_errors in runs:
            model.fit(Xred_training, y)
            train_errors.append(1 - model.score(Xred_training, y))
            test_errors.append(1 - model.score(Xred_test, y_test))

    def show_curves(title, train_errors, test_errors):
        # Plot against the real number of components (1..10) rather than
        # 0..9, and draw the legend so the line labels are visible.
        plt.plot(dims, train_errors, 'r--', label="Train data")
        plt.plot(dims, test_errors, 'b--', label="Test data")
        plt.title(title)
        plt.xlabel('k')
        # The plotted values are 1 - score; the axis text is kept unchanged.
        plt.ylabel('Score')
        plt.legend()
        plt.show()

    show_curves("LDA vs LDA", lda_scores_training, lda_scores_test)
    show_curves("QDA vs LDA", qda_scores_training, qda_scores_test)
    show_curves("KNN vs LDA", knn_scores_training, knn_scores_test)
def classifier_comparison(X, y):
    """
    Classifier comparison.

    Builds a fixed set of classifiers (including two RBF SVMs wrapped in an
    exhaustive and a randomized parameter search) and passes each one, with
    the data, to ``train_classifier`` after logging its name.

    Args:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]
    Returns:
        None
    """
    from sklearn import grid_search
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.lda import LDA
    from sklearn.qda import QDA
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` has been loaded.
    import scipy.stats

    # Exhaustive Grid Search
    exhaustive_parameters = {'kernel':['rbf'], 'C':[1, 10, 100, 1000], 'gamma':[1e-3, 1e-4]}
    clf_SVC_exhaustive = grid_search.GridSearchCV(SVC(), exhaustive_parameters)
    # Randomized Parameter Optimization
    randomized_parameter = {'kernel':['rbf'], 'C': scipy.stats.expon(scale=100), 'gamma': scipy.stats.expon(scale=.1)}
    clf_SVC_randomized = grid_search.RandomizedSearchCV(SVC(), randomized_parameter)

    # Name and estimator are kept side by side so they cannot drift apart.
    candidates = [
        ("Linear SVM", SVC(kernel="linear", C=0.025)),
        ("RBF SVM", SVC(gamma=2, C=1)),
        ("RBF SVM with Grid Search", clf_SVC_exhaustive),
        ("RBF SVM with Random Grid Search", clf_SVC_randomized),
        ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
        ("Random Forest",
         RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Naive Bayes", GaussianNB()),
        ("LDA", LDA()),
        ("QDA", QDA()),
    ]

    for name, clf in candidates:
        logger.info('Use %s:' % (name))
        train_classifier(clf, X, y)
    def supervised_classification(
            self,
            input,
            label,
            classification_method='RandomForestClassifier'):
        """Build the classifier named by ``classification_method``, fit it on
        ``input`` / ``label`` and return it.

        An ``AssertionError`` is raised for a name outside the supported set.
        The matching scikit-learn module is imported only when selected.
        """
        supported = frozenset((
            'KNeighborsClassifier', 'SVC', 'DecisionTreeClassifier',
            'RandomForestClassifier', 'AdaBoostClassifier', 'GaussianNB',
            'LDA', 'QDA',
        ))
        assert classification_method in supported

        def build(method):
            # Return a new, unfitted estimator for the requested method.
            if method == 'KNeighborsClassifier':
                from sklearn.neighbors import KNeighborsClassifier
                return KNeighborsClassifier(n_neighbors=10)
            if method == 'SVC':
                from sklearn.svm import SVC
                return SVC(gamma=2, C=1)
            if method == 'DecisionTreeClassifier':
                from sklearn.tree import DecisionTreeClassifier
                return DecisionTreeClassifier(max_depth=5)
            if method == 'AdaBoostClassifier':
                from sklearn.ensemble import AdaBoostClassifier
                return AdaBoostClassifier()
            if method == 'GaussianNB':
                from sklearn.naive_bayes import GaussianNB
                return GaussianNB()
            if method == 'LDA':
                from sklearn.lda import LDA
                return LDA()
            if method == 'QDA':
                from sklearn.qda import QDA
                return QDA()
            # Every remaining case uses the random forest.
            from sklearn.ensemble import RandomForestClassifier
            return RandomForestClassifier(max_depth=5,
                                          n_estimators=10,
                                          max_features=1)

        classifier = build(classification_method)

        # Train classifier
        classifier.fit(input, label)

        return classifier
Esempio n. 33
0
def checkeachClassfier(train_x, train_y, test_x, test_y):
    """Fit a fixed list of classifiers and print train/test metrics for each.

    For every classifier the four values returned by ``get_Accs`` are
    printed for the training and the test predictions. A classifier that
    raises is reported as ``<title>: error`` and the loop continues with the
    next one.
    """
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(class_weight='auto'),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        DecisionTreeClassifier(class_weight='auto'),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        RandomForestClassifier(class_weight='auto'),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]

    classtitle = [
        "KNeighborsClassifier", "SVC", "SVC weighted", "SVC(gamma=2, C=1)",
        "DecisionTreeClassifier", "DecisionTreeClassifier weighted",
        "RandomForestClassifier", "RandomForestClassifier weighted",
        "AdaBoostClassifier", "GaussianNB", "LDA", "QDA"
    ]

    for ctitle, clf in zip(classtitle, classifiers):
        try:
            clf.fit(train_x, train_y)
            train_pdt = clf.predict(train_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
            print(ctitle + ":")
            print("MCC, Acc_p , Acc_n, Acc_all(train): ")
            print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n),
                                   str(Acc_all)))
            test_pdt = clf.predict(test_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
            print("MCC, Acc_p , Acc_n, Acc_all(test): ")
            print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n),
                                   str(Acc_all)))
        except Exception:
            # Best effort per classifier; KeyboardInterrupt and SystemExit
            # are no longer swallowed as they were by the bare ``except``.
            print(ctitle + ": error")
        print("")
def main():
    """Train several classifiers on the MAGIC data and plot their ROC curves.

    The features returned by ``load_magic_data`` are standardised, split
    67/33 into train and test sets (unseeded, so the split differs between
    runs), and every classifier is fitted on the training part.  For each
    model the test accuracy is printed and its ROC curve, labelled with the
    AUC, is added to a single figure that is shown at the end.
    """
    (X, Y, _ynames) = load_magic_data()
    X = StandardScaler().fit_transform(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=None)
    C = 5.0

    classifiers = {
        'L1 logistic': LogisticRegression(C=C, penalty='l1'),
        'L2 logistic': LogisticRegression(C=C, penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=11),
        'NB': GaussianNB(),
        'RF5': RandomForestClassifier(n_estimators=5),
        'RF50': RandomForestClassifier(n_estimators=50),
        'AdaBoost': AdaBoostClassifier(),
        'LDA': LDA(),
        'QDA': QDA()
    }

    plt.figure(figsize=(8, 8))

    # ``items()`` works on both Python 2 and 3 (``iteritems`` is 2-only).
    for name, clf in classifiers.items():
        clf.fit(Xtrain, Ytrain)
        probs = clf.predict_proba(Xtest)
        # Column 1 of predict_proba is used as the score for the ROC curve.
        fpr, tpr, _thresholds = roc_curve(Ytest, probs[:, 1])
        roc_auc = auc(fpr, tpr)
        print('For model %s accuracy = %s' % (name, clf.score(Xtest, Ytest)))

        plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (name, roc_auc))

    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
def evaluate(data, targets):
    """Compare a fixed set of estimators on ``data``/``targets``.

    Builds the model list, converts a scipy-sparse ``data`` matrix to a dense
    array, and hands everything to ``ModelComparison`` (10 folds, 3 CV
    repetitions) whose ``evaluate()`` does the actual work.  Returns ``None``.
    """
    print("Creating models...")
    models = [
        LinearSVC(),
        SVC(kernel='rbf'),
        GaussianNB(),
        LDA(),
        QDA(),
        LogisticRegression(),
        # NOTE(review): a regressor in a list of classifiers - confirm this
        # is intentional.
        KNeighborsRegressor(),
        RandomForestClassifier(n_estimators=100,
                               criterion="entropy",
                               random_state=1234,
                               n_jobs=-1),
    ]

    # ModelComparison receives a dense array when the input was sparse.
    dense_data = data.toarray() if sparse.issparse(data) else data

    comparison = ModelComparison(dense_data,
                                 targets,
                                 folds=10,
                                 numCV=3,
                                 models=models)
    comparison.evaluate()
Esempio n. 36
0
def evaluate(data, targets):
    """Cross-validate a fixed set of classifiers and print their scores.

    Class priors for LDA/QDA are the empirical class frequencies of
    ``targets`` (which must support ``.astype(int)`` and hold non-negative
    integer labels, as required by ``numpy.bincount``).  The models are run
    through ``model_evaluation.TenFoldCrossValidation`` with scaling enabled;
    for each model the mean and standard deviation of the class-averaged
    accuracy are printed, ROC evaluation is triggered, and the total running
    time is reported.  Returns ``None``.
    """
    # The priors must come from the labels passed in; the original read an
    # undefined/outer ``y`` here.
    prior = numpy.bincount(targets.astype(int)) / float(len(targets))
    models = [
        LDA(priors=prior),
        SVC(probability=True, class_weight="auto", kernel="linear"),
        LogisticRegression(class_weight="auto"),
        GaussianNB(),
        KNeighborsClassifier(),
        QDA(priors=prior),
        RandomForestClassifier(n_estimators=100,
                               criterion="entropy",
                               n_jobs=-1,
                               random_state=123456),
        SVC(probability=True, class_weight="auto")
    ]

    # Display names, in the same order as ``models``.
    model_names = [
        "LDA", "Linear SVM", "Logistic Regression", "Naive Bayes", "k-NN",
        "QDA", "Random Forest", "SVM w/ RBF"
    ]

    # evaluate using ModelEvaluation class
    mevaluator = model_evaluation.TenFoldCrossValidation(
        data=data,
        targets=targets,
        models=models,
        model_names=model_names,
        scale=True)

    start = time.time()
    caa_eval = mevaluator.evaluate(metrics.class_averaged_accuracy_score)
    for key, value in caa_eval.items():
        # Keep only the part of the key before the first "(".
        model_str = key.split("(")[0]
        mean_score = numpy.around(numpy.mean(value), decimals=3)
        std_score = numpy.around(numpy.std(value), decimals=3)
        print("%s %s (%s)" % (model_str, mean_score, std_score))
    mevaluator.evaluate_roc()
    print("Overall running time: %s" % (time.time() - start))
Esempio n. 37
0
def test_feature_splitter(size=2000):
    """Smoke-test FeatureSplitter against a few baseline classifiers.

    Generates a sample of ``size`` rows with 10 features, turns ``column0``
    into an integer feature clipped to [-2, 2], and prints the test-set score
    of, in order: FeatureSplitter, a plain random forest, DumbSplitter, a
    QDA/LDA/RF ChainClassifiers, and a plain LDA.  Nothing is asserted.
    """
    X, y = commonutils.generate_sample(size, 10, distance=0.5)
    # ``numpy.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    X['column0'] = numpy.clip(
        numpy.array(X['column0']).astype(int), -2, 2)
    trainX, testX, trainY, testY = commonutils.train_test_split(X, y)
    base_estimators = {'rf': RandomForestClassifier()}
    splitter = FeatureSplitter('column0',
                               base_estimators=base_estimators,
                               final_estimator=RandomForestClassifier())
    splitter.fit(trainX, trainY)

    print(splitter.score(testX, testY))
    print(RandomForestClassifier().fit(trainX, trainY).score(testX, testY))
    print(
        DumbSplitter('column0', base_estimator=RandomForestClassifier()).fit(
            trainX, trainY).score(testX, testY))
    # OrderedDict fixes the order in which the chained classifiers are applied.
    chain = OrderedDict()
    chain['QDA'] = QDA()
    chain['LDA'] = LDA()
    chain['RF'] = RandomForestClassifier()
    print(ChainClassifiers(chain).fit(trainX, trainY).score(testX, testY))
    print(LDA().fit(trainX, trainY).score(testX, testY))
Esempio n. 38
0
 def __init__(self, training_path, testing_path):
     """Record the data locations, reset all caches and build the classifiers.

     ``training_path`` and ``testing_path`` are stored as given.  The image
     lists are populated by the two ``get_*_image_list`` calls at the end.
     """
     # Data locations.
     self.training_path, self.testing_path = training_path, testing_path

     # Nothing has been extracted yet.
     self.training_features = None
     self.testing_features = None

     # Containers filled in later.
     self.training_image_list = []
     self.testing_image_list = []
     self.training_labels = []
     self.testing_labels = []
     self.predicted_testing_labels = []
     self.class_map = {}

     # NOTE(review): the class count is read from the fixed directory
     # ./data/training rather than from ``training_path`` - confirm intended.
     class_root = os.path.join('.', 'data', 'training')
     self.n_classes = len(os.listdir(class_root))

     # Available models, keyed by short name.
     self.classifiers = {
         'knn': KNeighborsClassifier(3),
         'svm_linear': SVC(kernel="linear", C=0.025),
         'svm': SVC(gamma=2, C=1),
         'tree': DecisionTreeClassifier(max_depth=5),
         'rf': RandomForestClassifier(max_depth=5,
                                      n_estimators=10,
                                      max_features=1),
         'adb': AdaBoostClassifier(),
         'gauss': GaussianNB(),
         'lda': LDA(),
         'qda': QDA(),
         'ann': neuralNetwork(self.n_classes)
     }

     self.get_training_image_list()
     self.get_testing_image_list()
Esempio n. 39
0
def random_methods(data_train1,target_train1):
    """Fit a panel of estimators and print each one's test score and report.

    Each estimator is fitted once on ``(data_train1, target_train1)``; its
    ``score`` on the module-level ``data_test``/``target_test`` is printed,
    followed by ``metrics.classification_report`` for its predictions on
    ``data_test``.  Returns ``None``.
    """
    rng = np.random.RandomState(96235)
    names = ["SGD", "Nearest Neighbors", "ensembel","Decision Tree","Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
    classifiers = [
        SGDClassifier(loss='hinge', penalty='l2', alpha=0.0005, n_iter=200, random_state=42, n_jobs=-1, average=True),
        KNeighborsClassifier(10),
        # NOTE(review): this entry is a regressor, yet its predictions are
        # passed to classification_report below - confirm intentional.
        AdaBoostRegressor(DecisionTreeRegressor(max_depth=25),n_estimators=300, random_state=rng),
        DecisionTreeClassifier(max_depth=11),
        RandomForestClassifier(max_depth=21, n_estimators=21, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        print("Fitting " + name + "...")
        clf.fit(data_train1, target_train1)
        print("Predicting...")
        score = clf.score(data_test, target_test)
        print(score)
        # Predict with the model that was just scored.  Re-fitting here (as
        # the code used to) doubled the training time and, for estimators
        # with unseeded or shared random state, reported on a different model
        # than the one whose score was printed.
        predicted_test = clf.predict(data_test)
        print(metrics.classification_report(target_test, predicted_test))
Esempio n. 40
0
 def BuildModel(self, data, labels):
     """Return a new ``SQDA`` model fitted on ``data`` with ``labels``."""
     model = SQDA()
     # The model object itself is returned, not the result of ``fit``.
     model.fit(data, labels)
     return model
Esempio n. 41
0
# Silence deprecation, runtime and future warnings for the whole run.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Candidate models, each paired with the name used when reporting on it.
algorithms = [
    (
        RandomForestClassifier(max_depth=5, n_jobs=-1, n_estimators=10,
                               max_features=10),
        "Random Forest",
    ),
    (GaussianNB(), "Gaussian Naive Bayes"),
    (LogisticRegression(), "Logistic Regression"),
    (LinearSVC(), "Support Vector Machine"),
    (DecisionTreeClassifier(max_depth=10), "Decision Tree"),
    (QDA(), "QDA"),
    (GradientBoostingClassifier(), "BOOSTING!!!"),
    (
        # Two-step pipeline: Bernoulli RBM followed by logistic regression.
        Pipeline(steps=[
            ('rbm', BernoulliRBM()),
            ('logistic', LogisticRegression()),
        ]),
        "Bernoulli Neural Network Combo Logit",
    ),
]


# Build the (X, y) dataset: X is a list of (answer, question) pairs and y
# holds the labels saying whether each answer is correct for its question.
def generateDataset(datafile):
    """Return the cached distance dataset if it has already been generated.

    Only the cache lookup is visible here: when the pickle exists it is
    loaded and returned, otherwise the function falls through and returns
    ``None``.  ``datafile`` is not used by the visible code.
    """
    cache_path = "../data/cayman_distance_data/distanceDataset.pickle"

    # A dataset generated by an earlier run is reused as-is.
    if isfile(cache_path):
        return loadPickle(cache_path)
Esempio n. 42
0
from varplot import *
from sklearn.qda import QDA
import numpy as np
import pickle

# Training and held-out feature arrays with their ground-truth labels.
data = np.load("sd.npy")
truth = np.load("truth.npy")

testdata = np.load("sd_test.npy")
testtruth = np.load("truth_test.npy")

print(len(data))

clf = QDA()
clf.fit(data,truth)

# Persist the fitted model; the context manager closes the file even if
# pickling fails.
with open("qda.pkl",'wb') as output:
    pickle.dump(clf,output)

# Mean accuracy on the training set, then on the test set.
print(clf.score(data,truth))
print(clf.score(testdata,testtruth))

# Row indices per label value: 2 is reported as "Stars" below; the meaning of
# label 1 is not shown here (presumably galaxies - TODO confirm).
s = np.where(truth == 2)[0]
st = np.where(testtruth == 2)[0]
g = np.where(truth == 1)[0]
gt = np.where(testtruth == 1)[0]
print("Stars")
print(clf.score(data[s],truth[s]))
Esempio n. 43
0
class Classifier(BiPlot):
    '''
    To hold methods and data to support classification of measurements in a STOQS database.
    See http://scikit-learn.org/stable/auto_examples/plot_classifier_comparison.html
    '''
    # Available scikit-learn estimators, keyed by name.
    classifiers = {
        'Nearest_Neighbors':
        KNeighborsClassifier(3),
        'Linear_SVM':
        SVC(kernel="linear", C=0.025),
        'RBF_SVM':
        SVC(gamma=2, C=1),
        'Decision_Tree':
        DecisionTreeClassifier(max_depth=5),
        'Random_Forest':
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'AdaBoost':
        AdaBoostClassifier(),
        'Naive_Bayes':
        GaussianNB(),
        'LDA':
        LDA(),
        'QDA':
        QDA()
    }

    def getActivity(self, mpx, mpy):
        '''
        Return activity object which MeasuredParameters mpx and mpy belong to.

        mpx and mpy are MeasuredParameter ids.  The first distinct Activity
        reachable from either id is returned; an Exception carrying the SQL
        of the query is raised when no Activity matches.
        '''
        acts = Activity.objects.using(self.args.database).filter(
            instantpoint__measurement__measuredparameter__id__in=(
                mpx, mpy)).distinct()
        if not acts:
            print("acts = %s" % acts)
            raise Exception('Not exactly 1 activity returned with SQL = \n%s' %
                            str(acts.query))
        return acts[0]

    def saveCommand(self, doOption):
        '''
        Save the command executed to a Resource and return it for the doXxxx() method to associate it with the resources it creates.

        doOption is accepted for the caller's convenience but is not used here.
        '''

        rt, _ = ResourceType.objects.using(
            self.args.database).get_or_create(name=LABEL,
                                              description='metadata')
        r, _ = Resource.objects.using(self.args.database).get_or_create(
            name=COMMANDLINE, value=self.commandline, resourcetype=rt)

        return r

    def saveLabelSet(self, clResource, label, x_ids, y_ids, description,
                     typeName, typeDescription):
        '''
        Save the set of labels in MeasuredParameterResource. Accepts 2 input vectors. (TODO: generalize to N input vectors);
        description is used to describe the criteria for assigning this label. The typeName and typeDescription may be used to
        refer to the grouping, and associate via the grouping the other labels made in the heuristic applied.

        clResource is the command line Resource (as returned by saveCommand) that the label Resource is linked to.
        x_ids and y_ids are parallel sequences of MeasuredParameter primary keys.
        '''
        try:
            # Label
            rt, _ = ResourceType.objects.using(
                self.args.database).get_or_create(name=typeName,
                                                  description=typeDescription)
            r, _ = Resource.objects.using(
                self.args.database).get_or_create(name=LABEL,
                                                  value=label,
                                                  resourcetype=rt)
            # Label's description
            rdt, _ = ResourceType.objects.using(
                self.args.database).get_or_create(name=LABEL,
                                                  description='metadata')
            rd, _ = Resource.objects.using(
                self.args.database).get_or_create(name=DESCRIPTION,
                                                  value=description,
                                                  resourcetype=rdt)
            rr = ResourceResource(fromresource=r, toresource=rd)
            rr.save(using=self.args.database)
            # Associate with commandlineResource
            ResourceResource.objects.using(self.args.database).get_or_create(
                fromresource=r, toresource=clResource)

        except IntegrityError as e:
            # Deliberately best-effort: report the error and carry on.
            print(e)
            print("Ignoring")

        # Associate MeasuredParameters with Resource
        if self.args.verbose:
            print("  Saving %d values of '%s' with type '%s'" % (
                len(x_ids), label, typeName))
        for x_id, y_id in zip(x_ids, y_ids):
            a = self.getActivity(x_id, y_id)
            mp_x = MeasuredParameter.objects.using(
                self.args.database).get(pk=x_id)
            mp_y = MeasuredParameter.objects.using(
                self.args.database).get(pk=y_id)
            # get_or_create is called for its side effect; the returned
            # objects are not needed.
            MeasuredParameterResource.objects.using(
                self.args.database).get_or_create(activity=a,
                                                  measuredparameter=mp_x,
                                                  resource=r)
            MeasuredParameterResource.objects.using(
                self.args.database).get_or_create(activity=a,
                                                  measuredparameter=mp_y,
                                                  resource=r)

    def removeLabels(self,
                     labeledGroupName,
                     label=None,
                     description=None,
                     commandline=None):
        '''
        Delete labeled MeasuredParameterResources that have ResourceType.name=labeledGroupName (such as 'Labeled Plankton').  
        Restrict deletion to the other passed in options, if specified: label is like 'diatom', description is like 
        'Using Platform dorado, Parameter {'salinity': ('33.65', '33.70')} from 20130916T124035 to 20130919T233905'
        (commandline is too long to show in this doc string - see examples in usage note).  Note: Some metadata
        ResourceTypes will not be removed even though the Resources that use them will be removed.
        '''
        # Remove MeasuredParameter associations with Resource (Labeled data)
        mprs = MeasuredParameterResource.objects.using(
            self.args.database).filter(
                resource__resourcetype__name=labeledGroupName).select_related(
                    depth=1)
        if label:
            mprs = mprs.filter(resource__name=LABEL, resource__value=label)

        if self.args.verbose > 1:
            print("  Removing MeasuredParameterResources with type = '%s' and label = %s" % (
                labeledGroupName, label))

        # Remember the Resources of the deleted rows for the cleanup below.
        rs = []
        for mpr in mprs:
            rs.append(mpr.resource)
            mpr.delete(using=self.args.database)

        # Remove Resource associations with Resource (label metadata), make rs list distinct with set() before iterating on the delete()
        if label and description and commandline:
            rrs = ResourceResource.objects.using(self.args.database).filter(
                (Q(fromresource__name=LABEL) & Q(fromresource__value=label))
                & ((Q(toresource__name=DESCRIPTION)
                    & Q(toresource__value=description))
                   | (Q(toresource__name=COMMANDLINE)
                      & Q(toresource__value=commandline))))

            if self.args.verbose > 1:
                print("  Removing ResourceResources with fromresource__value = '%s' and toresource__value = '%s'" % (
                    label, description))

            for rr in rrs:
                rr.delete(using=self.args.database)

        else:
            if self.args.verbose > 1:
                print("  Removing Resources associated with labeledGroupName = %s'" % labeledGroupName)

            for r in set(rs):
                r.delete(using=self.args.database)

    def createLabels(self, labeledGroupName):
        '''
        Using discriminator, mins, and maxes label MeasuredParameters in the database so that we can do supervised learning.

        One label is created per (label, min, max) triple taken from self.args.labels, self.args.mins and self.args.maxes.
        A label for which _getPPData raises NoPPDataException is reported and skipped.
        '''
        sdt = datetime.strptime(self.args.start, '%Y%m%dT%H%M%S')
        edt = datetime.strptime(self.args.end, '%Y%m%dT%H%M%S')

        commandlineResource = self.saveCommand('createLabels')

        # min_val/max_val rather than min/max so the builtins are not shadowed.
        for label, min_val, max_val in zip(self.args.labels, self.args.mins,
                                           self.args.maxes):
            # Multiple discriminators are possible...
            pvDict = {self.args.discriminator: (min_val, max_val)}
            if self.args.verbose:
                print("Making label '%s' with discriminator %s" % (label,
                                                                   pvDict))

            try:
                # Only the id vectors are needed; the remaining three return
                # values are discarded.
                x_ids, y_ids, _, _, _ = self._getPPData(
                    sdt,
                    edt,
                    self.args.platform,
                    self.args.inputs[0],
                    self.args.inputs[1],
                    pvDict,
                    returnIDs=True,
                    sampleFlag=False)
            except NoPPDataException as e:
                print(e)
                # Without data there is nothing to label.  Falling through
                # would hit a NameError on the first label, or save the
                # previous label's ids under this label on later ones.
                continue

            if self.args.verbose:
                print("  (%d, %d) MeasuredParameters returned from database %s" % (
                    len(x_ids), len(y_ids), self.args.database))

            description = 'Using Platform %s, Parameter %s from %s to %s' % (
                self.args.platform, pvDict, self.args.start, self.args.end)

            if self.args.clobber:
                self.removeLabels(labeledGroupName, label, description,
                                  commandlineResource.value)

            self.saveLabelSet(
                commandlineResource, label, x_ids, y_ids, description,
                labeledGroupName,
                'Labeled with %s as discriminator' % self.args.discriminator)
Esempio n. 44
0
def cal_val_analysis(cols=None):
    '''Calibrates/validates classifiers from 1990 to 2009

    Calibrates on even years, validates on odd
    Trains all variants of the scikit.learn SGD classifier, as well as LDA/QDA.
    SGD pre-scales data
    Calc's FP/FN/TP/TN + sens, ppv and sens*ppv.

    cols: column names used as features; defaults to the module-level COLS.
    Any exception raised while working with one classifier is printed and
    the analysis continues with the next classifier.
    '''
    # Load the data.
    # ``is None`` rather than ``== None``: the latter compares element-wise
    # when an array-like of column names is passed.
    if cols is None:
        cols = COLS

    cal_years = range(1990, 2010, 2)
    val_years = range(1991, 2010, 2)

    cal_cfm = get_results(cal_years, settings.RESULTS)
    val_cfm = get_results(val_years, settings.RESULTS)

    # Set up classifers.
    # ``scalers`` runs parallel to ``classifiers``; None means "no scaling".
    classifiers = []
    scalers = []
    for sgd_loss in SGD_LOSSES:
        for sgd_penalty in SGD_PENALTY:
            classifiers.append(
                SGDClassifier(loss=sgd_loss, penalty=sgd_penalty))
            scalers.append(StandardScaler())

    classifiers.append(LDA())
    scalers.append(None)

    classifiers.append(QDA())
    scalers.append(None)

    # Perform classification.
    for classifier, scaler in zip(classifiers, scalers):
        print('Analysing with classifier {}'.format(classifier))

        try:
            # Calibration must run first: it fits the scaler and classifier
            # that the validation pass then reuses.
            for cfm, is_cal in [(cal_cfm, True), (val_cfm, False)]:
                print('CAL' if is_cal else 'VAL')

                data = cfm[cols].values.astype(np.float32)
                # Target: rows with a non-null bt_wind that are flagged is_hurr.
                are_hurr = ~cfm.bt_wind.isnull() & (cfm.is_hurr)

                if scaler is not None:
                    if is_cal:
                        # The scaler is fitted on calibration data only.
                        scaler.fit(data)
                    scaled_data = scaler.transform(data)
                else:
                    scaled_data = data

                if is_cal:
                    fit(classifier, scaled_data, are_hurr)

                print(', '.join(cols))
                predict(classifier, scaled_data, are_hurr)
            print('')
        except Exception as e:
            print('Error with classifier {}'.format(classifier))
            print(e)
Esempio n. 45
0
import numpy as np
import pickle
import os
from preprocessing import *

# Available models, keyed by short name.
classifiers = dict(
    knn=KNeighborsClassifier(3),
    svm_linear=SVC(kernel="linear", C=0.025),
    svm=SVC(gamma=2, C=1),
    tree=DecisionTreeClassifier(max_depth=5),
    rf=RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    adb=AdaBoostClassifier(),
    etc=ExtraTreesClassifier(),
    gauss=GaussianNB(),
    lda=LDA(),
    qda=QDA(),
    # Disabled entry kept for reference: 'ann': neuralNetwork( 16 )
)


def feature_selection(training_data, target_data, test_data):
    """Convert the inputs to float arrays and drop low-variance features.

    ``training_data`` must expose ``.columns`` (e.g. a pandas DataFrame); the
    column index is printed.  A ``VarianceThreshold`` with threshold
    ``.8 * (1 - .8)`` is fitted on the training matrix and applied to both
    the training and the test matrix.

    NOTE(review): only the variance filter is visible here; ``y``,
    ``features`` and ``X_index`` are prepared but not used in this part and
    nothing is returned - presumably the rest of the function is missing.
    """
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin gives the same float64 dtype.
    X1 = np.array(training_data).astype(float)
    y = np.array(target_data).astype(float)
    X1_test = np.array(test_data).astype(float)
    features = training_data.columns
    print(features)
    X_index = np.arange(X1.shape[-1])
    # Variance Threshold
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X1 = sel.fit_transform(X1)
    # The test matrix is reduced with the selector fitted on training data.
    X1_test = sel.transform(X1_test)
Esempio n. 46
0
File: QDA.py Progetto: dmtrkl/Labs
__author__ = 'chaouki'

from sklearn.qda import QDA
import numpy as np
from sklearn import cross_validation

# Each line of the feature file is comma-separated: every field but the last
# is a numeric feature, the last field is the class label.
X = []
y = []
with open(".\\File_Features.txt", "r") as g:
    # Iterating the file object reads it line by line until EOF.
    for line in g:
        tmp = line.split(",")
        # A real list (not a lazy ``map`` object) so each row is numeric
        # data on Python 3 as well.
        X.append([float(value) for value in tmp[:-1]])
        y.append(tmp[-1].replace("\n", ""))

# Features matrix
X = np.array(X)  # Data vector
y = np.array(y)  # Label vector
n_samples = len(X)

# Mean weighted-F1 over 10-fold cross-validation, as a percentage with two
# decimals.
rate = "{0:.2f}".format(
    np.mean(
        cross_validation.cross_val_score(
            QDA(), X, y, cv=10, scoring='f1_weighted') * 100))
print("Recognition Rate with Quadratic Discriminant Analysis (QDA) classifier : %s %%" % rate)
from sklearn.lda import LDA
from sklearn.qda import QDA
from supervised_pca import SupervisedPCAClassifier

# Exclusive upper bound of the experiment loop that follows this setup.
total_range = 100
# Per-classifier result lists, keyed by the entries of ``names``.
performances = {}

# Display names; same order as ``classifiers`` below.
names = [
    "LDA", "QDA", "SuperPCA thres=0", "SuperPCA thres=0.3",
    "SuperPCA thres=0.7"
]
# One list per SuperPCA variant (entries 2-4 of ``names``).
ncomponents = {names[2]: [], names[3]: [], names[4]: []}

classifiers = [
    LDA(),
    QDA(),
    SupervisedPCAClassifier(threshold=0),
    SupervisedPCAClassifier(threshold=0.3),
    SupervisedPCAClassifier(threshold=0.7)
]

# Start every classifier with an empty result list.
for name in names:
    performances[name] = []

    # iterate over classifiers

for i in range(1, total_range):
    X, y = make_classification(n_features=i * 10,
                               n_redundant=i * 5,
                               n_informative=i,
                               random_state=1,
Esempio n. 48
0
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(0.5)
    splot.add_artist(ell)
    splot.set_xticks(())
    splot.set_yticks(())

def plot_lda_cov(lda, splot):
    """Draw an ellipse for each of the first two LDA class means on ``splot``.

    Both ellipses use the single covariance ``lda.covariance_``; the first
    class is drawn in red, the second in blue.
    """
    for class_index, colour in enumerate(('red', 'blue')):
        plot_ellipse(splot, lda.means_[class_index], lda.covariance_, colour)

def plot_qda_cov(qda, splot):
    """Draw an ellipse for each of the first two QDA classes on ``splot``.

    Each class uses its own entry of ``qda.covariances_``; the first class
    is drawn in red, the second in blue.
    """
    for class_index, colour in enumerate(('red', 'blue')):
        plot_ellipse(splot, qda.means_[class_index],
                     qda.covariances_[class_index], colour)

# For each of the two datasets, fit LDA and QDA on the full data, predict on
# the same data, and draw the result plus the covariance ellipses.  LDA uses
# figure indices 1 and 3, QDA uses 2 and 4.
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # LDA
    # store_covariance=True so that lda.covariance_ exists for plot_lda_cov.
    lda = LDA(solver='svd', store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index = 2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # QDA
    # Here store_covariances is passed to fit() rather than the constructor.
    qda = QDA()
    y_pred = qda.fit(X, y, store_covariances=True).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index = 2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.suptitle('LDA vs QDA')
plt.show()
Esempio n. 49
0
# Sample counts.  N_st counts the entries of y equal to 0 and N_rr is the
# remainder (presumably standard stars vs. RR Lyrae - TODO confirm).
N_tot = len(y)
N_st = np.sum(y == 0)
N_rr = N_tot - N_st
N_train = len(y_train)
N_test = len(y_test)
N_plot = 5000 + N_rr

#----------------------------------------------------------------------
# perform QDA
classifiers = []
predictions = []
# 1, 2, ..., number of columns in X.
Ncolors = np.arange(1, X.shape[1] + 1)

# Fit one QDA per feature count, each using only the first nc columns.
for nc in Ncolors:
    clf = QDA()
    clf.fit(X_train[:, :nc], y_train)
    y_pred = clf.predict(X_test[:, :nc])

    classifiers.append(clf)
    predictions.append(y_pred)

# One row of test-set predictions per entry of Ncolors.
predictions = np.array(predictions)

completeness, contamination = completeness_contamination(predictions, y_test)

print "completeness", completeness
print "contamination", contamination

#------------------------------------------------------------
# Compute the decision boundary

# will hold all ids and all predictions
all_ids = []
all_predictions_lda = []
all_predictions_qda = []
all_predictions_lr = []
all_predictions_avg = []
subsample = 100
# Used as exclusive upper bounds in the range() calls below, so subject ids
# 1..12 and series ids 1..8 are visited.
num_subjects = 13
num_series = 9
human_labels = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff', 'Replace', 'BothReleased']


# One instance of each model, created once before the subject loop.
lr = LogisticRegression()
qda = QDA()
lda = LDA()

for subject_id in range(1,num_subjects):
  # Per-subject accumulators, reset for every subject.
  y_raw = []
  raw = []

  # read in training data
  for series_id in range(1,num_series):
    data,labels = prepare_data_train(subject_id, series_id)
    raw.append(data)
    y_raw.append(labels)

  # concatenate the data sets into one dataframe
  X = pd.concat(raw)
  y = pd.concat(y_raw)
Esempio n. 51
0
    # Sample counts.  N_st counts the entries of y equal to 0 and N_rr is
    # the remainder (presumably standard stars vs. RR Lyrae - TODO confirm).
    N_tot = len(y)
    N_st = np.sum(y == 0)
    N_rr = N_tot - N_st
    N_train = len(y_train)
    N_test = len(y_test)
    N_plot = 5000 + N_rr

    #----------------------------------------------------------------------
    # perform QDA
    classifiers = []
    predictions = []
    # 1, 2, ..., number of columns in X.
    Ncolors = np.arange(1, X.shape[1] + 1)

    # Fit one QDA per feature count, each using only the first nc columns.
    for nc in Ncolors:
        clf = QDA()
        clf.fit(X_train[:, :nc], y_train)
        y_pred = clf.predict(X_test[:, :nc])

        classifiers.append(clf)
        predictions.append(y_pred)

    # One row of test-set predictions per entry of Ncolors.
    predictions = np.array(predictions)

    completeness, contamination = completeness_contamination(
        predictions, y_test)

    print "completeness", completeness
    print "contamination", contamination

    #------------------------------------------------------------
Esempio n. 52
0
# Second sheet of the workbook; its two selected feature columns become the
# rows of normal_pt (one row per sample, two columns).
df2 = pd.read_excel('feat.xlsx', sheetname=1, header=1)
x2 = np.array(df2[feature_x]).reshape(-1, 1)
y2 = np.array(df2[feature_y]).reshape(-1, 1)
normal_pt = np.hstack([x2, y2])

# In[48]:

#Sort given training data with corresponding labels
# Number of samples = number of rows.  (``size / ndim`` gave the same value
# only because these arrays happen to have exactly two columns.)
n_normal = normal_pt.shape[0]
n_cancer = cancer_pt.shape[0]
# Label 0 for every normal sample, 1 for every cancer sample.
nor_n = np.zeros(n_normal)
can_n = np.ones(n_cancer)
labels = np.hstack((nor_n, can_n))
train_data = np.vstack((normal_pt, cancer_pt))

# In[49]:

clf = QDA()
trained_clf = clf.fit(train_data, labels)
normal_pred = trained_clf.predict(normal_pt)
trueneg_n = (normal_pred == 0).sum()
# float() keeps this a true ratio under Python 2 integer division as well.
specificity = trueneg_n / float(n_normal)

# In[50]:

cancer_pred = trained_clf.predict(cancer_pt)
truepos_n = (cancer_pred == 1).sum()
sensitivity = truepos_n / float(n_cancer)

# In[51]:

#Generate grids for the entire plot
if inRedox:
Esempio n. 53
0
from sklearn.qda import QDA

h = .02  # step size in the mesh

# Display names; same order as ``classifiers`` below.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()]

# Two-feature synthetic problem; seeded random offsets are then added to
# every coordinate.
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

# The three datasets every classifier is evaluated on.
datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = pl.figure(figsize=(27, 9))
# Running counter starting at 1 (presumably the subplot index - its use is
# not visible here).
i = 1
# iterate over datasets
Esempio n. 54
0
# Similar to LDA, but does not need to assume the same covariance between classes.
from sklearn.qda import QDA
import numpy as np
# Six 2-D points: the first three carry label 1, the last three label 2.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

# Visualization
import matplotlib.pyplot as plt
plt.figure(1)
# Class 1 in green, class 2 in blue.
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='g')
plt.scatter(X[y == 2, 0], X[y == 2, 1], color='b')
plt.title('X Data Set Visualization')

# Classification
clf = QDA()
clf = clf.fit(X, y)

# Predict the class of one new point, then display the scatter plot.
print(clf.predict([[-0.8, -1]]))
plt.show()
h = .02  # step size in the mesh

# Display names; same order as ``classifiers`` below.
names = [
    "Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
    "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"
]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()
]

# Two-feature synthetic problem; seeded random offsets are then added to
# every coordinate.
X, y = make_classification(n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           random_state=1,
                           n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

# The three datasets every classifier is evaluated on.
datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1), linearly_separable
]
Esempio n. 56
0
    splot.set_yticks(())


def plot_lda_cov(lda, splot):
    """Draw the LDA covariance ellipse at the first two class means.

    The same ``lda.covariance_`` is used for both ellipses: red for the
    first class, blue for the second.
    """
    shared_cov = lda.covariance_
    first_mean, second_mean = lda.means_[0], lda.means_[1]
    plot_ellipse(splot, first_mean, shared_cov, 'red')
    plot_ellipse(splot, second_mean, shared_cov, 'blue')


def plot_qda_cov(qda, splot):
    """Draw the per-class QDA covariance ellipses for the first two classes.

    Each ellipse pairs a class mean with that class's own covariance: red
    for the first class, blue for the second.
    """
    coloured_classes = ((0, 'red'), (1, 'blue'))
    for class_index, colour in coloured_classes:
        plot_ellipse(splot, qda.means_[class_index],
                     qda.covariances_[class_index], colour)

###############################################################################
# For each of the two datasets, fit LDA and QDA on the full data, predict on
# the same data, and draw the result plus the covariance ellipses.  LDA uses
# figure indices 1 and 3, QDA uses 2 and 4.
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # LDA
    # store_covariance=True so that lda.covariance_ exists for plot_lda_cov.
    lda = LDA(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # QDA
    # Here store_covariances is passed to fit() rather than the constructor.
    qda = QDA()
    y_pred = qda.fit(X, y, store_covariances=True).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.suptitle('LDA vs QDA')
# Save before showing: once show() returns the figure may already have been
# torn down, in which case savefig() writes an empty image.
plt.savefig('image.png')
plt.show()
Esempio n. 57
0
def QuadDA(X_train, Y_train):
    """Return a ``QDA`` model fitted on ``X_train`` with labels ``Y_train``."""
    model = QDA()
    # The model object itself is returned, not the result of ``fit``.
    model.fit(X_train, Y_train)
    return model
def quadraticDiscriminantAnalysis(dataFile, outputFolder, regParam,parameters):
	"""Train and apply QDA on every train/test split listed in ``dataFile``.

	``dataFile`` is a YAML file with at least the keys ``training`` and
	``testing`` (parallel lists of CSV paths), ``inputFile``, ``label`` (the
	name of the label column) and ``split_params``.  For split ``i`` a QDA
	with ``reg_param=regParam`` is fitted on the training CSV and applied to
	the testing CSV; the true labels and a ``predictions`` column are written
	to ``<outputFolder>/result<i+1>.csv``.  A summary describing the run is
	written to ``<outputFolder>/results.yaml``.

	``parameters`` is recorded as ``algo_params``; when it is empty it is
	filled in place with ``{'parameter': 'default'}``.  ``outputFolder`` is
	created if it does not exist.  Returns ``None``.
	"""
	# NOTE(review): yaml.load can construct arbitrary Python objects; if
	# dataFile may come from an untrusted source use yaml.safe_load instead.
	with open(dataFile) as dataHandle:
		inputData = yaml.load(dataHandle)
	trainingSet = inputData['training']
	testingSet = inputData['testing']
	inputFile = inputData['inputFile']
	label = inputData['label']
	resultSet = []
	if not os.path.exists(outputFolder):
		try:
			os.makedirs(outputFolder)
		except OSError as exc:
			# Someone else creating the folder in the meantime is fine;
			# anything else is a real error.
			if exc.errno != errno.EEXIST:
				raise
	for i, trainPath in enumerate(trainingSet):
		train_df = pd.read_csv(trainPath)
		train_labels = train_df[label]
		train_features = train_df.drop(label,axis=1)
		test_df = pd.read_csv(testingSet[i])
		# Start the output frame from the true labels; predictions are added
		# as a second column.
		test_predictions = pd.DataFrame(test_df[label])
		test_features = test_df.drop(label,axis=1)

		qda = QDA(reg_param=regParam)
		qda.fit(train_features, train_labels)
		test_predictions['predictions'] = qda.predict(test_features)
		resultFile = outputFolder + '/result' + str(i + 1) + '.csv'
		test_predictions.to_csv(resultFile,index=False)
		resultSet.append(resultFile)
	resultDict = dict()
	resultDict['results'] = resultSet
	resultDict['label'] = label
	if not parameters:
		parameters['parameter']='default'
	resultDict['algo_params'] = parameters
	resultDict['split_params'] = inputData['split_params']
	# Optional upstream metadata is passed through unchanged when present.
	if 'feature_selection_parameters' in inputData:
		resultDict['feature_selection_parameters'] = inputData['feature_selection_parameters']
		resultDict['feature_selection_algorithm'] = inputData['feature_selection_algorithm']
	if 'feature_extraction_parameters' in inputData:
		resultDict['feature_extraction_parameters'] = inputData['feature_extraction_parameters']
		resultDict['feature_extraction_algorithm'] = inputData['feature_extraction_algorithm']
	if 'preprocessing_params' in inputData:
		resultDict['preprocessing_params'] = inputData['preprocessing_params']
	resultDict['inputFile'] = inputFile
	resultDict['algorithm'] = "QuadraticDiscriminantAnalysis"
	with open(outputFolder + '/results.yaml', 'w') as summaryHandle:
		yaml.dump(resultDict, summaryHandle)

def main(args):
	"""Parse command-line options and launch the QDA run.

	Options (getopt spec "i:o:p:"):
	  -i <inputFile>     forwarded unchanged as the input file argument
	  -o <outputFolder>  forwarded unchanged as the output folder argument
	  -p <regParam>      converted to float and forwarded as regParam; the raw
	                     string is also stored under 'parameter.p' in the
	                     parameters dict that is forwarded

	args is the argument list without the program name (e.g. sys.argv[1:]).

	Prints the usage line and exits with status 2 when the options cannot be
	parsed or when the -p value is not a valid float.
	"""
	usage = 'QuadraticDiscriminantAnalysis.py -i <inputFile> -o <outputFolder> -p <regParam>'
	inputFile = ''
	outputFolder = ''
	parameters = dict()
	# float; regularizes the covariance estimate as
	# [(1-reg_param)*Sigma + reg_param*np.eye(n_features)]
	regParam = 0.0
	try:
		opts, _positional = getopt.getopt(args, "i:o:p:", [])
	except getopt.GetoptError:
		# Call form of print parses on both Python 2 and Python 3.
		print(usage)
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-i':
			inputFile = arg
		elif opt == '-o':
			outputFolder = arg
		elif opt == '-p':
			try:
				regParam = float(arg)
			except ValueError:
				# A malformed number is a usage error, same as a bad option.
				print(usage)
				sys.exit(2)
			# Keep the value exactly as typed for the results metadata.
			parameters['parameter.p'] = arg
	quadraticDiscriminantAnalysis(inputFile, outputFolder, regParam, parameters)


if __name__ == "__main__":
	main(sys.argv[1:])
Esempio n. 59
0
########################### Instantiate Classifiers ############################


# Registry of ready-to-fit estimators, looked up by short name when the
# job list is assembled.
classifiers = {}

classifiers["Logistic"] = LogisticRegression()
classifiers["NearestNeighbors"] = KNeighborsClassifier(100)

# Two SVC configurations: a linear kernel and the default kernel with gamma=2.
classifiers["LinearSVM"] = SVC(kernel="linear", C=0.025)
classifiers["RBFSVM"] = SVC(gamma=2, C=1)

classifiers["DecisionTree"] = DecisionTreeClassifier(max_depth=32)

# The two forests differ only in max_depth (unbounded vs. 8).
classifiers["RandomForest"] = RandomForestClassifier(
    max_depth=None,
    n_estimators=200,
    max_features="auto",
    random_state=0,
    n_jobs=4
)
classifiers["RandomForest2"] = RandomForestClassifier(
    max_depth=8,
    n_estimators=200,
    max_features="auto",
    random_state=0,
    n_jobs=4
)

classifiers["AdaBoost"] = AdaBoostClassifier(n_estimators=500, random_state=0)
classifiers["GradientBoost"] = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=1.0,
    max_depth=None,
    random_state=0
)

classifiers["NaiveBayes"] = GaussianNB()
classifiers["LDA"] = LDA()
classifiers["QDA"] = QDA()

# Jobs to run. Each entry is a tuple of
# (classifier instance, suffix, varlist CSV file name).
# Only the first entry is active; the commented-out entries are alternative
# classifier/varlist combinations kept for reference.
joblist = [
        (classifiers["RandomForest"],'RandomForest_signal','model_var_list_signal.csv'), # suffix and varlist
        #(classifiers["RandomForest"],'RandomForest_tmxpayer','model_var_list_tmxpayer.csv'),
        #(classifiers["RandomForest"],'RandomForest_tmxpayee','model_var_list_tmxpayee.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_tmxpayer','model_var_list_signal_tmxpayer.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_tmxpayee','model_var_list_signal_tmxpayee.csv'),
        #(classifiers["RandomForest"],'RandomForest_tmxpayer_tmxpayee','model_var_list_tmxpayer_tmxpayee.csv'),
        #(classifiers["RandomForest"],'RandomForest_tmxpayerpayee_comp','model_var_list_tmxpayerpayee_comp.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_tmxboth','model_var_list_signal_tmxboth.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_tmxboth_120','model_var_list_signal_tmxboth_120.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_tmxboth_800','model_var_list_signal_tmxboth_800.csv'),
        #(classifiers["RandomForest2"],'RandomForest_signal_tmxboth_RF2','model_var_list_signal_tmxboth.csv'),
        #(classifiers["RandomForest"],'RandomForest_signal_107','model_var_list_signal_107.csv'),
]
# NOTE(review): the original list literal was never closed, so nothing after
# it could be parsed. It is closed here after the last visible entry; any
# entries that followed in the upstream script are not recoverable from this
# file and should be restored from the original source if needed.
def _scoreQDAOnSubset(XTrain, YTrain, XTest, YTest, labels):
    """Fit a default QDA on one activity subset and return its f-score.

    XTrain/YTrain and XTest/YTest are the subsets returned by
    common.getDataSubset; labels is the sequence of activity labels the
    subset was restricted to. The confusion matrix is produced through
    common.createConfusionMatrix as a side effect.
    """
    clf = QDA()
    clf.fit(XTrain, YTrain.flatten())

    # Predict once and reuse the result for both the accuracy figures and the
    # confusion matrix instead of running the classifier over the test set twice.
    predicted = clf.predict(XTest)

    # Each helper gets its own fresh label list, as in the original calls.
    # Precision and recall are returned by checkAccuracy but not used here.
    _precision, _recall, fscore = common.checkAccuracy(predicted, YTest, list(labels))
    common.createConfusionMatrix(predicted.flatten(), YTest.flatten(), list(labels))
    return fscore


def QDA_onNonDynamicData():
    """Evaluate QDA separately on the non-dynamic and dynamic activity subsets.

    Loads the full UCI HAR train/test files, evaluates a QDA classifier on the
    non-dynamic activities (labels 4, 5, 6) and prints its f-score, then does
    the same for the dynamic activities (labels 1, 2, 3), additionally printing
    the sizes of the dynamic test subset before fitting. Despite the name, both
    groups are processed. Returns nothing.
    """
    nonDynamicLabels = (4, 5, 6)
    dynamicLabels = (1, 2, 3)

    # Parsing full training dataset
    XFull = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    YFull = common.parseFile('../UCI HAR Dataset/train/y_train.txt')

    # Parsing full testing dataset
    XFullTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    YFullTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt')

    # Non-dynamic activities: training subset, then testing subset
    X_NonDynamic, Y_NonDynamic = common.getDataSubset(
        XFull, YFull.flatten(), list(nonDynamicLabels))
    X_NonDynamicTest, Y_NonDynamicTest = common.getDataSubset(
        XFullTest, YFullTest.flatten(), list(nonDynamicLabels))

    print(_scoreQDAOnSubset(X_NonDynamic, Y_NonDynamic,
                            X_NonDynamicTest, Y_NonDynamicTest,
                            nonDynamicLabels))

    # Dynamic activities: training subset, then testing subset
    X_Dynamic, Y_Dynamic = common.getDataSubset(
        XFull, YFull.flatten(), list(dynamicLabels))
    X_DynamicTest, Y_DynamicTest = common.getDataSubset(
        XFullTest, YFullTest.flatten(), list(dynamicLabels))
    print(len(X_DynamicTest),len(Y_DynamicTest))

    print(_scoreQDAOnSubset(X_Dynamic, Y_Dynamic,
                            X_DynamicTest, Y_DynamicTest,
                            dynamicLabels))