Code Example #1
File: main.py Project: rjgsousa/sentiment_analysis
def main(args):

    if args.analyse is not None:
        train_data_x, test_data_x,train_data_y, test_data_y  = process_data(args.analyse)

        RT = RandomForestClassifier(n_estimators=100)
        RT.fit(train_data_x, train_data_y)
        print(RT.score(test_data_x, test_data_y))

    return
Code Example #2
class Model(BaseModel):
    """Antares implementation of scikit learn random forest classifier

    """
    def __init__(self,
                 categorical_features=None,
                 n_estimators=50,
                 n_jobs=-1,
                 max_depth=10):
        '''
        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get confidence of every prediction
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        This method applies the model that this object represents to the given dataset using
        the response variable y. It is a measure of the accuracy of the trained model. Usually
        the original dataset should be split into training and testing subsets to cross-validate
        the model.
        '''
        return self.model.score(X, y)
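A short usage sketch for the class above, assuming a small synthetic dataset with purely numeric features so that no categorical encoding is involved; the shapes and values are illustrative only:

import numpy as np
from madmex.modeling.supervised.rf import Model

X = np.random.rand(100, 5)             # 100 samples, 5 numeric features (made up)
y = np.random.randint(0, 3, 100)       # 3 classes (made up)

rf = Model()
rf.fit(X, y)
labels = rf.predict(X)
confidence = rf.predict_confidence(X)  # max class probability per sample
print(rf.score(X, y))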
Code Example #3
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def decision_first():

    data = datasets.load_iris()
    x = data["data"]
    y = data["target"]

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    des = DecisionTreeClassifier(max_leaf_nodes=3)
    des.fit(X_train, y_train)
    print(des.predict(X_test))
    print(des.score(X_test, y_test))

    rom = RandomForestClassifier()
    rom.fit(X_train, y_train)
    print(rom.predict(X_test))
    print(rom.score(X_test, y_test))
Code Example #4
    def getSentiment(self, path):
        dirs = os.listdir(path)
        for file in dirs:
            filename = path + file
            print(filename)

            df = pd.read_csv(filename, header=None)
            data_original = df.to_numpy()  # as_matrix() was removed in pandas 1.0
            data = data_original[:, 0:len(data_original[0]) - 1]
            score = data_original[:, len(data_original[0]) - 1]

            vectorizer = TfidfVectorizer(min_df=5,
                                         max_df=0.8,
                                         sublinear_tf=True,
                                         use_idf=True,
                                         decode_error='ignore')

            # 70/30 split into train and test portions
            totalLen = len(data)
            train = int(totalLen * 0.7)

            train_data = data[0:train, :]
            train_labels = score[0:train]
            print(len(train_data))
            print(len(train_labels))

            # start at `train` (not `train + 1`) so the boundary sample is not skipped
            test_data = data[train:totalLen, :]
            test_labels = score[train:totalLen]
            print(len(test_data))
            print(len(test_labels))

            classifier_rbf = svm.SVC()
            classifier_rbf.fit(train_data, train_labels)
            accuracy = classifier_rbf.score(test_data, test_labels)
            print(accuracy)

            classifier_rmf = RandomForestClassifier(n_estimators=1500)
            classifier_rmf.fit(train_data, train_labels)
            accuracy = classifier_rmf.score(test_data, test_labels)
            print(accuracy)
Code Example #5
    def rfc(self):
        N = [10, 50, 100, 1000, 10**4]
        self.bestScoreN = 0
        bestScore = 0
        for arg in N:
            score = 0
            for train_ind, test_ind in self.skf.split(self.X, self.Y):
                self.X_train = [self.X[ind] for ind in train_ind]
                self.Y_train = [self.Y[ind] for ind in train_ind]
                self.X_test = [self.X[ind] for ind in test_ind]
                self.Y_test = [self.Y[ind] for ind in test_ind]
                clf = RandomForestClassifier(n_estimators=arg).fit(self.X_train, self.Y_train)
                score = score + clf.score(self.X_test, self.Y_test)
            if score > bestScore:
                bestScore = score
                self.bestScoreN = arg
            # division by 5 assumes self.skf was created with 5 folds
            print("Mean score for parameter (", arg, ")", (score / float(5)) * 100, "%")
Code Example #6
def models(type='logreg', X=None, y=None, Xtest=None, ytest=None):

    # Logistic Regression -----
    if type == 'logreg':
        logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
        logreg.fit(X, y)
        score_logit = logreg.score(Xtest, ytest)
        return score_logit

    # CART -----
    if type == 'cart':
        cart_tree = tree.DecisionTreeClassifier(random_state=100)
        cart_tree.fit(X, y)
        score_cart = cart_tree.score(Xtest, ytest)
        return score_cart

    # Random Forest -----
    if type == 'rf':
        forest = RandomForestClassifier(n_estimators=100, max_features='auto', random_state=100)
        forest.fit(X, y)
        score_forest = forest.score(Xtest, ytest)
        return score_forest

    if type == 'xgboost':
        xgbooster = XGBClassifier(n_estimators=100, max_depth=4, random_state=100)
        xgbooster.fit(X, y)
        score_xgboost = xgbooster.score(Xtest, ytest)
        return score_xgboost

    if type == 'nn':
        nnet = MLPClassifier(solver='adam',
                             hidden_layer_sizes=(5, 5),
                             max_iter=500,
                             early_stopping=True,
                             random_state=100)
        nnet.fit(X, y)
        score_nnet = nnet.score(Xtest, ytest)
        return score_nnet
Code Example #7
File: rf.py Project: makeling/antares
class Model(BaseModel):
    '''
    Random forest classifier wrapper around scikit-learn's RandomForestClassifier.
    '''

    def __init__(self, path):
        '''
        Constructor
        '''
        self.path = path
        self.model = RandomForestClassifier(n_estimators=150,n_jobs=8)
        self.model_name = 'rf'

    def fit(self, X, y):
        self.model.fit(X,y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        return self.model.predict(X)

    def save(self, filepath):
        '''
        Persists the trained model to a file.
        '''
        joblib.dump(self.model, create_filename(filepath,'%s.pkl' % self.model_name)) 

    def load(self, filepath):
        '''
        Loads an already trained model from a file to perform predictions.
        '''
        self.model = joblib.load(create_filename(filepath,'%s.pkl' % self.model_name))

    def score(self, X, y):
        '''
        Tests the trained model against the given dataset and target vector.
        '''
        return self.model.score(X,y)
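A brief usage sketch for this wrapper, assuming the class is importable from the project and that create_filename simply joins the directory and the file name; the synthetic arrays and the /tmp/models directory are placeholders:

import numpy as np

X = np.random.rand(200, 4)        # 200 samples, 4 features (made up)
y = np.random.randint(0, 2, 200)  # binary labels (made up)

model = Model(path='/tmp/models')
model.fit(X, y)
model.save('/tmp/models')         # persists rf.pkl via joblib
model.load('/tmp/models')         # restores the persisted classifier
print(model.score(X, y))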
Code Example #8
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(iris.data, iris.target)
knn.predict(iris.data)

len(iris.target)
sum(iris.target == knn.predict(iris.data))
knn.score(iris.data, iris.target)
help(cross_val_predict)
cross_val_predict(knn, iris.data, iris.target, cv=20)
cross_val_score(knn, iris.data, iris.target, cv=20).mean()


rf = RandomForestClassifier(n_estimators=3)
rf.fit(iris.data, iris.target)
rf.predict_proba(iris.data)
rf.score(iris.data, iris.target)
sum(iris.target == rf.predict(iris.data))
cross_val_score(rf, iris.data, iris.target, cv=20).mean()


from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
'''
https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/data/multilabel.py
'''
mcr = OneVsRestClassifier(LogisticRegression())
mcr.fit(iris.data, iris.target)
mcr.predict(iris.data)
mcr.predict_proba(iris.data)

Code Example #9
holdout = test  # from now on we will refer to this
# dataframe as the holdout data

all_X = train[columns]
all_y = train['Survived']

train_X, test_X, train_y, test_y = train_test_split(all_X,
                                                    all_y,
                                                    test_size=0.20,
                                                    random_state=0)

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(train_X, train_y)
Y_pred = random_forest.predict(test_X)
acc_random_forest = round(random_forest.score(train_X, train_y) * 100, 2)
print('train accuracy: ' + str(acc_random_forest))

scores = cross_val_score(random_forest, all_X, all_y, cv=10)
accuracy = np.mean(scores)
print('Cross_val scores : ' + str(scores))
print('Cross_val accuracy : ' + str(accuracy))

random_forest.fit(all_X, all_y)
holdout_predictions = random_forest.predict(holdout[columns])

holdout_ids = holdout["PassengerId"]
submission_df = {"PassengerId": holdout_ids, "Survived": holdout_predictions}
submission = pd.DataFrame(submission_df)
submission.to_csv("submission.csv", index=False)
Code Example #10
sub1 = pd.read_csv(
    '/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/kaggle/everyone_dies/submission_2018-07-13 09:14:21.csv')
sub2 = pd.read_csv(
    '/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/kaggle/everyone_dies/submission_2018-07-11 11:06:07.csv')
sub3 = pd.read_csv(
    '/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/kaggle/everyone_dies/submission_2018-07-13 09:17:57.csv')

sub_all = sub3.join(sub1, lsuffix='_1', rsuffix='_2')
pd.crosstab(sub_all['Survived_1'], sub_all['Survived_2'])
cv_model.cv_results_

### ASSESS BEST PARAMS TREE AND SCORE
tree_model = RandomForestClassifier(random_state=297,
                                    **cv_model.best_params_)  ####ONLY IF THE PREVIOUS MODEL IS A SearchCV
tree_model = tree_model.fit(trainX, trainY.values.ravel())
tree_model.score(trainX, trainY)
tree_model.score(testX, testY)

### CHECK IMPORTANCE OF FEATURES
feature_importance = pd.DataFrame(tree_model.feature_importances_, index=trainX.columns, columns=['Imp']).reset_index()
feature_importance['pk'] = 1
plot_scatter(feature_importance, 'index', 'Imp', 'index')
plot_bar(feature_importance, 'index', 'Imp', 'index')

### PREDICT
prediction = tree_model.predict(features_all)
tree_model.predict_proba(features_all)

#### VISUALIZE TREE
### ONLY FOR SIMPLE DECISION TREE
# tree.export_graphviz(tree_model,
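For context, cv_model in this snippet is expected to be an already fitted SearchCV object. A minimal sketch of how it might have been built, where the parameter grid is an assumption and trainX/trainY are the frames used above:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hypothetical grid search that would expose cv_results_ and best_params_
param_grid = {'n_estimators': [100, 300], 'max_depth': [4, 8, None]}
cv_model = GridSearchCV(RandomForestClassifier(random_state=297),
                        param_grid, cv=5, scoring='accuracy')
cv_model.fit(trainX, trainY.values.ravel())
print(cv_model.best_params_)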
Code Example #11
    if len(sys.argv) > 1 and sys.argv[1] == '--skip-train':
        results = pd.read_csv("./data/results.csv")
    else:
        for classification_dataset in classification_dataset_names:
            print("Starting", classification_dataset)

            X, y = fetch_data(classification_dataset, return_X_y=True, local_cache_dir='./data/')
            train_X, test_X, train_y, test_y = train_test_split(X, y)

            rf = RandomForestClassifier()
            lexRF = LexicaseForestClassifier()

            rf.fit(train_X, train_y)
            lexRF.fit(train_X, train_y)

            rf_score = rf.score(test_X, test_y)
            lexRF_score = lexRF.score(test_X, test_y)

            results['problem'] = results['problem'] + ([classification_dataset] * 2)
            results['method'] = results['method'] + ['RF', 'LexRF']
            results['score'].append(rf_score)
            results['score'].append(lexRF_score)

        results = pd.DataFrame(results)
        results.to_csv("./data/results.csv", index=False)

    problems = (
        results
        .groupby("problem")
        .apply(lambda x: x.score.max() - x.score.min())
        .where(lambda x: x > 0.05)
        .dropna()
        .index
        .values
    )
Code Example #12
#Check that the algorithm used gives good accuracy by using part of the training set to validate
train_train, train_test = train_test_split(train, test_size=0.3)

#Train model
model = RandomForestClassifier(n_estimators=100,
                               oob_score=True,
                               random_state=10,
                               max_features="auto",
                               min_samples_leaf=20)
#model=KNeighborsClassifier(n_neighbors=6)

#if getting this error, it is because a matrix with 1 column
#is being passed in when a 1d array is expected. ravel() will work.
#DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). if name == 'main':
#To resolve this error, convert label values to int or str as float is not a valid label-type
#raise ValueError("Unknown label type: %r" % y) ValueError: Unknown label type: array
#model.fit(train_train.ix[:,'pixel0':'pixel783'], np.asarray(train_train.ix[:,'label'].astype(int)))
#print "model.score:", model.score(train_test.ix[:,'pixel0'0:'pixel783'], np.asarray(train_test.ix[:,'label'].astype(int)))
#print "cross validation score:", cross_validation.cross_val_score(model, train_train.ix[:,'pixel0':'pixel783'], train_train.ix[:,'label'], cv=3)
model.fit(train_train.loc[:, 'pixel0':'pixel783'],
          train_train.loc[:, 'label'].values.ravel())
print(
    "model.score",
    model.score(train_test.loc[:, 'pixel0':'pixel783'],
                train_test.loc[:, 'label'].values.ravel()))

#Predict output
#predicted=model.predict(train_test.ix[:,'pixel0':'pixel783'])
#print predicted
#print "Accuracy: ", accuracy_score(train_test.ix[:,'label'].astype(int), predicted)
Code Example #13
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()



########################## DECISION TREE #################################


### your code goes here--now create 2 decision tree classifiers,
### one with min_samples_split=2 and one with min_samples_split=50
### compute the accuracies on the testing data and store
### the accuracy numbers to acc_min_samples_split_2 and
### acc_min_samples_split_50, respectively

clf2 = RandomForestClassifier(min_samples_split=2)
clf2.fit(features_train, labels_train)

clf50 = RandomForestClassifier(min_samples_split=50)
clf50.fit(features_train, labels_train)


acc_min_samples_split_2 = clf2.score(features_test, labels_test)
acc_min_samples_split_50 = clf50.score(features_test, labels_test)

def submitAccuracies():
  return {"acc_min_samples_split_2":round(acc_min_samples_split_2,3),
          "acc_min_samples_split_50":round(acc_min_samples_split_50,3)}
Code Example #14
#Check if there is linear correlation between pixel<x> columns and label
#If yes, we should dive into the columns with correlation. Linear / logistic regression may work well with the data.
#In this case, makes sense that there is no correlation - higher pixel values does not mean that label value will be higher
#print "Correlation:", train.corr()["label"]

#Check that the algorithm used gives good accuracy by using part of the training set to validate
train_train, train_test=train_test_split(train, test_size=0.3)

#Train model
model=RandomForestClassifier(n_estimators = 100, oob_score = True, random_state =10, max_features = "auto", min_samples_leaf = 20)
#model=KNeighborsClassifier(n_neighbors=6)


#if getting this error, it is because a matrix with 1 column
#is being passed in when a 1d array is expected. ravel() will work.
#DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). if name == 'main':
#To resolve this error, convert label values to int or str as float is not a valid label-type
#raise ValueError("Unknown label type: %r" % y) ValueError: Unknown label type: array
#model.fit(train_train.ix[:,'pixel0':'pixel783'], np.asarray(train_train.ix[:,'label'].astype(int)))
#print "model.score:", model.score(train_test.ix[:,'pixel0':'pixel783'], np.asarray(train_test.ix[:,'label'].astype(int)))
#print "cross validation score:", cross_validation.cross_val_score(model, train_train.ix[:,'pixel0':'pixel783'], train_train.ix[:,'label'], cv=3)
model.fit(train_train.loc[:, 'pixel0':'pixel783'], train_train.loc[:, 'label'].values.ravel())
print("model.score", model.score(train_test.loc[:, 'pixel0':'pixel783'], train_test.loc[:, 'label'].values.ravel()))


#Predict output
#predicted=model.predict(train_test.ix[:,'pixel0':'pixel783'])
#print predicted
#print "Accuracy: ", accuracy_score(train_test.ix[:,'label'].astype(int), predicted)
Code Example #15
dataset.columns = feature_names

#split into train and test set
train, test = train_test_split(dataset, test_size=0.3)

#normalize data
df_scaled_train = pd.DataFrame(preprocessing.scale(train),
                               columns=feature_names)
df_scaled_test = pd.DataFrame(preprocessing.scale(test), columns=feature_names)

model = RandomForestClassifier(n_estimators=100,
                               oob_score=True,
                               random_state=10,
                               max_features="auto",
                               min_samples_leaf=20)

#train model
#if getting this error, it is because a matrix with 1 column
#is being passed in when a 1d array is expected. ravel() will work.
#DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). if name == 'main':
#To resolve this error, convert label values to int or str as float is not a valid label-type
#raise ValueError("Unknown label type: %r" % y) ValueError: Unknown label type: array
model.fit(df_scaled_train.loc[:, 'times pregnant':'age (years)'],
          np.asarray(df_scaled_train.loc[:, 'target'].astype(int)))
print("Accuracy:", model.score(
    df_scaled_test.loc[:, 'times pregnant':'age (years)'],
    np.asarray(df_scaled_test.loc[:, 'target'].astype(int))))

#predict output
predicted = model.predict(df_scaled_test.loc[:, 'times pregnant':'age (years)'])
print(predicted)
Code Example #16
import dataProcessing
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
data = dataProcessing.readJson("activity1.json") + dataProcessing.readJson(
    "activity2.json") + dataProcessing.readJson(
        "activity3.json") + dataProcessing.readJson("activity4.json")

Data = dataProcessing.generateDataSet(data)

X_train, X_test, y_train, y_test = train_test_split(Data,
                                                    dataProcessing.target,
                                                    test_size=0.3)
rfc = RandomForestClassifier()
rfc = rfc.fit(X_train, y_train)
result = rfc.score(X_test, y_test)

print(result)
print(rfc.predict(X_test))
joblib.dump(rfc, "rfcModel.pkl")
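To reuse the persisted classifier later, it can be restored with joblib; a minimal sketch:

# In a later session: reload the dumped model and score it again
loaded_rfc = joblib.load("rfcModel.pkl")
print(loaded_rfc.score(X_test, y_test))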
Code Example #17
#download the file
raw_data = urllib.request.urlopen(url)  # Python 3: urllib.request replaces urllib.urlopen

#get data, add column names and index
feature_names=["times pregnant", "plasma glucose conc.", "distolic blood pressure (mm Hg)", "triceps skin fold thickness (mm)", "2-hour serum insulin (mu U/ml)", "body mass index (kg/m^2)", "diabetes pedigree function", "age (years)", "target"]
dataset = pd.read_csv(raw_data, header=None)  # DataFrame.from_csv was removed from pandas
dataset.columns = feature_names

#split into train and test set
train, test=train_test_split(dataset, test_size=0.3)

#normalize data
df_scaled_train=pd.DataFrame(preprocessing.scale(train), columns=feature_names)
df_scaled_test=pd.DataFrame(preprocessing.scale(test), columns=feature_names)

model=RandomForestClassifier(n_estimators = 100, oob_score = True, random_state =10, max_features = "auto", min_samples_leaf = 20)

#train model
#if getting this error, it is because a matrix with 1 column
#is being passed in when a 1d array is expected. ravel() will work.
#DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). if name == 'main':
#To resolve this error, convert label values to int or str as float is not a valid label-type
#raise ValueError("Unknown label type: %r" % y) ValueError: Unknown label type: array
model.fit(df_scaled_train.loc[:, 'times pregnant':'age (years)'], np.asarray(df_scaled_train.loc[:, 'target'].astype(int)))
print("Accuracy:", model.score(df_scaled_test.loc[:, 'times pregnant':'age (years)'], np.asarray(df_scaled_test.loc[:, 'target'].astype(int))))

#predict output
predicted = model.predict(df_scaled_test.loc[:, 'times pregnant':'age (years)'])
print(predicted)
Code Example #18
    else:
        for classification_dataset in classification_dataset_names:
            print("Starting", classification_dataset)

            X, y = fetch_data(classification_dataset,
                              return_X_y=True,
                              local_cache_dir='./data/')
            train_X, test_X, train_y, test_y = train_test_split(X, y)

            rf = RandomForestClassifier()
            lexRF = LexicaseForestClassifier()

            rf.fit(train_X, train_y)
            lexRF.fit(train_X, train_y)

            rf_score = rf.score(test_X, test_y)
            lexRF_score = lexRF.score(test_X, test_y)

            results['problem'] = results['problem'] + (
                [classification_dataset] * 2)
            results['method'] = results['method'] + ['RF', 'LexRF']
            results['score'].append(rf_score)
            results['score'].append(lexRF_score)

        results = pd.DataFrame(results)
        results.to_csv("./data/results.csv", index=False)

    problems = (
        results
        .groupby("problem")
        .apply(lambda x: x.score.max() - x.score.min())
        .where(lambda x: x > 0.05)
        .dropna()
        .index
        .values
    )
    viz_data = results[[x in problems for x in results.problem]]
Code Example #19
from prep_terrain_data import makeTerrainData
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()



#################################################################################


########################## DECISION TREE #################################



#### your code goes here
clf = RandomForestClassifier()
clf.fit(features_train, labels_train)


acc = clf.score(features_test, labels_test)
### be sure to compute the accuracy on the test set


    
def submitAccuracies():
  return {"acc":round(acc,3)}