def random_forest():
    """Train a random forest on the iris data and cross-tabulate predictions.

    Splits the iris rows into a random ~75/25 train/test partition, fits a
    RandomForestClassifier on the four feature columns, and evaluates on the
    held-out rows.

    Returns:
        pandas.DataFrame: crosstab of actual species (rows) vs. predicted
        species (columns) on the test split.  (Bug fix: the original built
        this table and discarded it, returning None.)

    Relies on module-level imports of load_iris, RandomForestClassifier,
    pandas as pd and numpy as np.
    """
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    # Each row independently has a ~75% chance of landing in the train split.
    df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
    # pd.Factor was removed from pandas long ago; Categorical.from_codes is
    # the modern equivalent (integer codes -> labelled categories).
    # (Also dropped: a stray no-op `pd.factorize` expression and a discarded
    # `df.head()` call left over from a notebook.)
    df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
    train = df[df['is_train']]
    test = df[~df['is_train']]
    features = df.columns[:4]
    clf = RandomForestClassifier(n_jobs=2)
    y, _ = pd.factorize(train['species'])
    clf.fit(train[features], y)
    preds = iris.target_names[clf.predict(test[features])]
    return pd.crosstab(test['species'], preds,
                       rownames=['actual'], colnames=['preds'])
# -*- coding: utf-8 -*-
# __author__ = 'cjweffort'
#
# Fit a random forest on the iris data set and print a confusion-style
# crosstab of actual vs. predicted species on a held-out ~25% split.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Each row independently has a ~75% chance of being a training row.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
# pd.Factor was removed from pandas; Categorical.from_codes maps the
# integer targets to their species names.
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# Bug fix: the original evaluated df.head() and silently discarded it --
# in a script the preview must be printed to be seen.
print(df.head())

train = df[df['is_train']]
test = df[~df['is_train']]
features = df.columns[:4]

clf = RandomForestClassifier(n_jobs=2)
# Encode the categorical species labels as integer codes for sklearn.
y, _ = pd.factorize(train['species'])
clf.fit(train[features], y)

preds = iris.target_names[clf.predict(test[features])]
# Same fix: print the crosstab instead of discarding it.
print(pd.crosstab(test['species'], preds,
                  rownames=['actual'], colnames=['preds']))
# Query the iris data with SQL via pandasql, then run a per-species
# aggregation over sepal width.
from sklearn.datasets import load_iris
import pandas as pd
from pandasql import sqldf
from pandasql import load_meat, load_births
import re

births = load_births()
meat = load_meat()

iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
# pd.Factor was removed from pandas (and never accepted a `levels=`
# keyword in that position); Categorical.from_codes is the replacement.
iris_df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# Strip parentheses and spaces so the column names are valid SQL identifiers
# (e.g. "sepal width (cm)" -> "sepalwidthcm").
iris_df.columns = [re.sub("[() ]", "", col) for col in iris_df.columns]

print(sqldf("select * from iris_df limit 10;", locals()))
print(sqldf("select sepalwidthcm, species from iris_df limit 10;", locals()))

q = """
select species , avg(sepalwidthcm) , min(sepalwidthcm) , max(sepalwidthcm)
from iris_df group by species;
"""
print("*" * 80)
print("aggregation")
print("-" * 80)
# Bug fix: the aggregation query above was built but never executed,
# leaving the "aggregation" banner with nothing under it.
print(sqldf(q, locals()))
# Min-max normalise the Ctrip (xiecheng) hotel scoring columns to [0, 1]:
# hotel class, lowest price, comment count, user-recommendation ratio,
# hygiene score, environment score, ...  (chunk truncated mid-file).
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

data = pd.read_excel("xiechengscore.xlsx")
# Random ~75/25 train/test split flag for later modelling.
data['is_train'] = np.random.uniform(0, 1, len(data)) <= .75
# Bug fix: removed two lines pasted from the iris RF example --
# `pd.DataFrame(data, columns=data.sheet_names)` and
# `pd.Factor(data.target, data.target_names)` would both raise
# AttributeError (a DataFrame has no .sheet_names / .target).


def _min_max(col):
    """Scale a numeric Series linearly onto [0, 1] (min -> 0, max -> 1)."""
    lo = col.min()
    return (col - lo) / (col.max() - lo)


# Columns normalised here: hotel class, lowest price, comment count,
# user-recommendation ratio, hygiene score, environment score.
for _name in ('hotelClass', 'hotelLowestprice', 'hotelComment',
              'userRecommended', 'healthScore', 'surroundingsScore'):
    data[_name] = _min_max(data[_name])
# Service score (normalisation of the remaining columns continues below).
# Integer-encode the categorical state1/domain1 columns of the credit-card
# data so the ensemble models below can consume them.
# NOTE(review): this chunk uses `pd` but does not import pandas -- presumably
# imported earlier in the file; confirm.
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn import ensemble
from credit_card_data import read_data
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report

df = read_data()


def _sorted_codes(col):
    """Map each value of *col* to its index among the sorted unique values.

    Equivalent to the removed pd.Factor(...) + per-row .levels.get_loc()
    construction, but vectorized via Categorical codes.
    """
    return pd.Categorical(col, categories=sorted(col.unique())).codes


# Replace the string state with its code and drop the original column.
df["state_factor"] = _sorted_codes(df.state1)
df = df.drop("state1", axis=1)

# Same encoding for the e-mail domain column.
df["domain_factor"] = _sorted_codes(df.domain1)
df = df.drop("domain1", axis=1)
# Chunk of a larger script: `bigdata` and `trainedData` are defined earlier
# in the file.  Rows with meta >= 15 are the unlabelled/held-back set.
untrained = bigdata[bigdata['meta'] >= 15]
print('trained data')
print(trainedData[:5])

# Extract the 'ratio' and 'area' columns as numpy feature matrices.
# Fix: .ix and .as_matrix() were removed from pandas; column selection
# plus .to_numpy() is the modern equivalent.
features = trainedData[['ratio', 'area']].to_numpy()
test_features = untrained[['ratio', 'area']].to_numpy()
print('features')
print(features[:5])
print('features shape', features.shape)
print('features type', type(features))

# Labels are strings (single, touching, nuclei, dust); convert them to
# integer codes for sklearn.  pandas.Factor and its .labels attribute no
# longer exist; factorize(sort=True) reproduces the old sorted-level codes.
print('labels convertion')
lab1 = trainedData['type']
print('lab1', type(lab1))
labels, _levels = pandas.factorize(lab1, sort=True)
print('labels', labels[:5])
print('labels type', type(labels))
print('labels shape', labels.shape)

# Classify with sklearn.
classifier = svm.SVC()
model = classifier.fit(features, labels)
predicted = classifier.predict(test_features)

# Match predictions against the held-back labels.
# NOTE(review): factorizing the hidden labels independently only yields
# codes consistent with the training encoding if both splits contain the
# same set of classes -- present in the original too; verify.
hiddenlab1 = untrained['type']
hidden_labels, _ = pandas.factorize(hiddenlab1, sort=True)
match = (predicted == hidden_labels)
print("prediction")