Code example #1
import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier as rfc

# load data into pandas data frame
trdata, testdata = mg.loadData()

# get the id's for the test set
testid = np.array(testdata.UserID)

testdata = testdata.drop('UserID', axis=1)

# try a range of tree depths for the classifier
depthlist = [3, 5, 10, 15, 20, 50, 100]

for i in depthlist:

    model = rfc(n_estimators=10,
                oob_score=True,
                max_features=None,
                max_depth=i)

    model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])

    accur = model.oob_score_

    print('Out-of-bag accuracy at max_depth %d: %f\n' % (i, accur))

# generate predictions (note: this uses the model from the last loop iteration)
preds = np.array(model.predict_proba(testdata))[:, 1]
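
The snippet ends after computing the class-1 probabilities; a minimal sketch of writing them out alongside the saved ids might look like the following (the output path and column names are assumptions, not from the original):

import pandas as pd

# Sketch only: pair each test id with its predicted probability and save;
# the path and column names are illustrative assumptions.
pd.DataFrame({'UserID': testid, 'prob': preds}).to_csv(
    'predictions/rfcpreds.csv', index=False)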
Code example #2
File: rfcmodel.py Project: kjford/Titanic
import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier as rfc

'''
Use a random forest classifier to predict Titanic survivors.
Uses the training data in train.csv (found in the data subfolder),
predicts from test.csv, and writes the predictions to a .csv in the
predictions subfolder.
As is, this gives ~77% accuracy on the test set; with some tweaking
it can hit ~79% (currently it overfits).
'''

# load data into pandas data frame
trdata, testdata = mg.loadData()

# get the id's for the test set
testid = np.array(testdata.PassengerId)

# determine if each passenger has a known surviving family member
trdata, testdata = mg.addFamSurvivors(trdata, testdata)

# munge the data to generate one-hot labels for gender, titles, ticket departments
trdata = mg.mungeData(trdata)
testdata = mg.mungeData(testdata)


# initialize classifier (compute_importances has been removed from
# scikit-learn; feature_importances_ is always available after fitting)
model = rfc(n_estimators=1000, oob_score=True)
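
The example stops at the classifier initialization; a hedged sketch of the remaining steps the docstring describes (fit, check the out-of-bag score, write predictions), assuming the Survived label sits in column 0 as in example #1:

import pandas as pd

# Sketch only: the column layout and output path are assumptions based on
# the docstring, not code from the original file.
model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
print('Out-of-bag accuracy: %f' % model.oob_score_)

preds = model.predict(testdata)
pd.DataFrame({'PassengerId': testid, 'Survived': preds.astype(int)}).to_csv(
    'predictions/rfcpreds.csv', index=False)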
Code example #3

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
# from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline
import mungetools as mg

n_estimator = 10
# X, y = make_classification(n_samples=10)

# print X
# print y

trdata = mg.loadData()

X_train, X_test = trdata.iloc[:500, 1:], trdata.iloc[500:, 1:]
y_train, y_test = trdata.iloc[:500, 0], trdata.iloc[500:, 0]
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model, to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr = X_train.iloc[:250], X_train.iloc[250:]
y_train, y_train_lr = y_train.iloc[:250], y_train.iloc[250:]

print(X_train)
print(y_train)

# Unsupervised transformation based on totally random trees
# (the constructor call is completed here following scikit-learn's
# reference example; random_state=0 is an assumption)
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)
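
The snippet cuts off at this constructor; the imports match scikit-learn's "feature transformations with ensembles of trees" recipe, so a hedged sketch of the usual continuation:

# Sketch of the standard continuation (not in the original snippet):
# feed the unsupervised leaf embedding into a logistic regression, then
# evaluate the combination with an ROC curve on the held-out rows.
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)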
Code example #4
            meanscore = np.mean(scorei)  # scorei holds the per-fold scores
            scores[counter] = meanscore
            paramholder[counter, 0] = c
            paramholder[counter, 1] = g
            if meanscore > bestscore:
                bestscore = meanscore
                bestmodel = model
            counter += 1
            print('Score = %f with c: %f, g: %f' % (meanscore, c, g))
    bestc = paramholder[scores.argmax(), 0]
    bestg = paramholder[scores.argmax(), 1]
    print('Best score of %f with c: %f, g: %f' % (bestscore, bestc, bestg))
    return bestmodel

    

trdata = mg.loadData()

# testid = np.array(testdata.UserID)

# trdata = trdata.drop(['coursecount'],axis=1)

# testdata = testdata.drop(['UserID'],axis=1)

# print trdata
# print testdata

# set up the search grid:
# try several values of c (prediction error weight) and g (kernel width)
testc = [0.05, 0.1, 0.3, 0.6, 1, 3, 5, 10]
testg = [0, 0.01, 0.05, 0.1, 0.5, 1, 1.5]
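
The function body above starts mid-loop; a hedged sketch of the outer search it implies, assuming an RBF-kernel SVC scored by cross-validation (the function name and fold count are illustrative, not from the original):

import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

def svmgridsearch(X, y, testc, testg, folds=5):
    # Illustrative outer loops around the bookkeeping shown above.
    # Note: current scikit-learn requires gamma > 0, so the 0 entry in
    # testg (a holdover from releases where gamma=0 meant 1/n_features)
    # would need to be dropped or mapped to 'auto'.
    bestscore = -np.inf
    bestmodel = None
    for c in testc:
        for g in testg:
            model = SVC(C=c, gamma=g)
            scorei = cross_val_score(model, X, y, cv=folds)
            meanscore = np.mean(scorei)
            print('Score = %f with c: %f, g: %f' % (meanscore, c, g))
            if meanscore > bestscore:
                bestscore = meanscore
                bestmodel = model
    return bestmodel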
Code example #5
features_list = []

# collect feature names from ARFF-style header lines
# (lines of the form '@attribute name {...}')
for line in file_data:
    if '@' in line and '{' in line:
        feature = line.split()[1]
        features_list.append(feature)

# print features_list

features_list = np.asarray(features_list)

print(type(features_list))

input_df = mg.loadData()
X = input_df.values[:, 1:]
y = input_df.values[:, 0]
survived_weight = .75
# down-weight samples with label 0 (note the weight is applied to s == 0)
y_weights = np.array([survived_weight if s == 0 else 1 for s in y])
 
print "Rough fitting a RandomForest to determine feature importance..."
forest = RandomForestClassifier(oob_score=True, n_estimators=10)
forest.fit(X, y, sample_weight=y_weights)
feature_importance = forest.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())

print(feature_importance)

fi_threshold = 30  # keep features scoring above 30% of the max importance
important_idx = np.where(feature_importance > fi_threshold)[0]
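
A short follow-up sketch of how important_idx is typically used: keep only the columns above the threshold and refit (this assumes features_list lines up with the columns of X, which the original does not guarantee):

# Sketch only: report the surviving features and refit on them.
important_features = features_list[important_idx]  # assumes aligned ordering
print('Features above the %d%% threshold: %s' % (fi_threshold, important_features))

X_reduced = X[:, important_idx]
forest = RandomForestClassifier(oob_score=True, n_estimators=10)
forest.fit(X_reduced, y, sample_weight=y_weights)
print('OOB score on reduced features: %f' % forest.oob_score_)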