Code example #1
def random_forest(train_vec, train_label, n_estimators=10, min_samples_split=2, min_samples_leaf=1, criterion="entropy"):
    # Pass the keyword arguments through instead of hard-coding different values.
    model = RandomForestClassifier(n_estimators=n_estimators, min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf, criterion=criterion)
    model.fit(train_vec, train_label)

    print('Random Forest classification accuracy: ' + str(model.score(train_vec, train_label)))

    return model
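A minimal usage sketch for the helper above, not part of the original project: it assumes RandomForestClassifier has been imported from sklearn.ensemble and fabricates a toy dataset purely for illustration.

from sklearn.datasets import make_classification

# Toy data; any (n_samples, n_features) matrix with matching labels works.
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
model = random_forest(X, y, n_estimators=50, criterion="gini")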
Code example #2
def predict_on_test_set(label_group):
  sample = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_train.csv"))
  test = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_test.csv"))

  preprocessed = Preprocess(sample, which_labels = label_group)

  rf = RandomForestClassifier(n_estimators = 80, criterion = "entropy", bootstrap = True, max_features = 'sqrt', max_depth = 40)
  rf.fit(X = preprocessed.features, y = preprocessed.labels.values.ravel())
  test_preprocessed = Preprocess(test, which_labels = label_group)
  predicted_labels = rf.predict(test_preprocessed.features)
  error_rate, _ = benchmark(predicted_labels.ravel(), test_preprocessed.labels.values)

  plot_feature_importances(preprocessed.features.columns.values, rf.feature_importances_, label_group)
Code example #3
File: baseline.py Project: arvs/carlton
class RandomForestModel(FreshnessModel):
  def __init__(self, trainfile, testfile, extra_features_file = None):
    super(RandomForestModel, self).__init__(trainfile, testfile, extra_features_file)
    self.clf = RandomForestClassifier(n_estimators=10, max_depth=None)

  def train(self, data = None, target = None):
    if data is None:
      data = self.data
    if target is None:
      target = self.target
    self.clf.fit(data, target)

  def pred(self, X):
    return self.clf.predict(X)
Code example #4
def rf_selection(x, y, n, max_depth, k):
    '''
    :param x: features
    :param y: target
    :param n: number of trees (n_estimators) in the random forest
    :param max_depth: max_depth of the random forest
    :param k: number of features to retain
    :return: names of the retained features
    '''
    rf = RandomForestClassifier(n_estimators=n, max_depth=max_depth)
    rf.fit(x.values, y.values)
    features_name = list(x.columns)
    importance = list(rf.feature_importances_)
    df = pd.DataFrame({'feature': features_name, 'importance': importance})
    df.sort_values(by='importance', ascending=False, inplace=True)
    return list(df['feature'].iloc[:k])
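A hedged usage sketch for rf_selection: it expects a pandas DataFrame x and a Series y, and the column names and data below are invented for illustration.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = pd.DataFrame(rng.normal(size=(100, 5)), columns=list("abcde"))  # hypothetical features
y = pd.Series((x["a"] + x["b"] > 0).astype(int))                    # hypothetical target
print(rf_selection(x, y, n=50, max_depth=5, k=2))                   # e.g. ['a', 'b']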
Code example #5
def main():
    #Loading the training set and test set
    path1 = "C:\Python32\A2PW1.csv"
    path2 = "C:\Python32\A2PW3.csv"
    train = read_csv(path1, has_header = True)
    target = [x[0] for x in train]
    train = [x[1:] for x in train]
    test = read_csv(path2, has_header = True)
    test = [x[1:] for x in test]
    print('The training set is:')
    print(train)
    print('The test set is:')
    print(test)

    #create the model
    rf = RandomForestClassifier(n_estimators = 100)
    #throw the data into model
    rf.fit(train, target)
    predicted_probs = rf.predict_log_proba(test)
    print(predicted_probs)
    output_file_path = "C:\Python32\pythontoday.txt"
    numpy.savetxt(output_file_path, predicted_probs,delimiter=',',fmt='%1.4e')

    # fit_transform(test, target) would refit the forest on the test set; keep the
    # train-fitted model and reduce the features with SelectFromModel instead
    # (from sklearn.feature_selection import SelectFromModel).
    newArr = SelectFromModel(rf, prefit=True).transform(test)
    print('newArr becomes: ', newArr)
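Most fit_transform calls in these examples rely on an old scikit-learn API in which classifiers doubled as feature selectors; it was deprecated in scikit-learn 0.17 and removed in 0.19. A minimal sketch of the modern SelectFromModel equivalent, on toy data with illustrative names:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
selector = SelectFromModel(rf)            # default threshold is the mean importance,
X_reduced = selector.fit_transform(X, y)  # matching the removed API's default
print(X_reduced.shape)                    # fewer columns than X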
Code example #6
def cross_validate_number_of_trees(label_group):

  sample = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_train.csv"))
  test = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_test.csv"))

  preprocessed = Preprocess(sample, which_labels = label_group)

  n_trees = (5, 10, 30, 60, 80)
  oob_scores = []
  for n_tree in n_trees:
    rf = RandomForestClassifier(n_estimators = n_tree, criterion = "entropy", oob_score = True, bootstrap = True, max_features = 'sqrt', max_depth = 40)
    rf.fit(X = preprocessed.features, y = preprocessed.labels.values.ravel())

    score = 1.0 - rf.oob_score_
    
    oob_scores.append(score)
    print "Out-of-Bag Error for Number of Trees %s: %s" % (n_tree, score)

  plot_oob_error_n_tress(n_trees, oob_scores, label_group)
Code example #7
def RFClassify(trainData, trainLabel, testData):
    rfClf = RandomForestClassifier(n_estimators=10,
                                   criterion='gini',
                                   max_depth=None,
                                   min_samples_split=2,
                                   min_samples_leaf=1,
                                   min_weight_fraction_leaf=0.0,
                                   max_features='sqrt',  # 'auto' was removed from scikit-learn; 'sqrt' is the classifier equivalent
                                   max_leaf_nodes=None,
                                   bootstrap=True,
                                   oob_score=False,
                                   n_jobs=1,
                                   random_state=None,
                                   verbose=0,
                                   warm_start=False,
                                   class_weight=None)
    rfClf.fit(trainData, trainLabel)
    testlabel = rfClf.predict(testData)
    return testlabel
Code example #8
def cross_validate_depth(label_group):

  sample = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_train.csv"))
  test = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_test.csv"))

  preprocessed = Preprocess(sample, which_labels = label_group)

  depths = (2, 40, 60, 80)
  oob_scores = []
  for depth in depths:
    rf = RandomForestClassifier(n_estimators = 80, criterion = "entropy", oob_score = True, bootstrap = True, max_features = 'sqrt', max_depth = depth)
    rf.fit(X = preprocessed.features, y = preprocessed.labels.values.ravel())

    score = 1.0 - rf.oob_score_
    
    oob_scores.append(score)
    print "Out-of-Bag Error for Depth %s: %s" % (depth, score)

  plot_oob_error_depth(depths, oob_scores, label_group)
Code example #9
File: RFC.py Project: SVAI-Dream/dreamchallenge
def rfc_with_smote(X, ycl, n_estimators=32, min_samples_leaf=5, max_depth=3):
    try:
        sm = SMOTETomek(k=5)  # k is the number of nearest neighbours
        X_smt, y_smt = sm.fit_sample(X, ycl)  # returns the resampled matrix and label vector
    except Exception:
        # Fall back to fewer neighbours, then to the raw data; the original ran the
        # k=2 attempt unconditionally, clobbering a successful k=5 result.
        try:
            sm = SMOTETomek(k=2)
            X_smt, y_smt = sm.fit_sample(X, ycl)
        except Exception:
            X_smt, y_smt = X, ycl

    X_train, X_test, y_train, y_test = train_test_split(X_smt, y_smt, test_size=0.20, random_state=42)
    ycl_train = (y_train > 0).astype(int)
    ycl_test = (y_test > 0).astype(int)

    sfm = SelectFromModel(RandomForestClassifier(
        n_estimators = n_estimators,
        min_samples_leaf = min_samples_leaf,
        max_depth = max_depth
    )) # select features

    sfm.fit(X_train, ycl_train) # fit the model using training set
    X_train = sfm.transform(X_train) # reduce the matrix into selected features
    X_test = sfm.transform(X_test)

    rfc = RandomForestClassifier(
        n_estimators = n_estimators,
        min_samples_leaf = min_samples_leaf,
        max_depth = max_depth
    )

    rfc.fit(X_train, ycl_train)
    precision, recall, f1, support = evaluate_model(rfc, X_test, ycl_test, threshold=0.5) # return some performance statistics of the model

    return rfc, precision, recall, f1, support
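A caveat on the imbalanced-learn calls above: fit_sample was later renamed fit_resample, and current SMOTETomek no longer takes a bare k; the neighbour count moves onto an explicit SMOTE object. A sketch against the current API, with illustrative parameter values:

from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE

sm = SMOTETomek(smote=SMOTE(k_neighbors=5), random_state=42)  # k_neighbors plays the old k role
X_smt, y_smt = sm.fit_resample(X, ycl)                        # X, ycl as in rfc_with_smote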
Code example #10
class RandomForest(predictor.IBayesDBForeignPredictor):
    """A Random Forest foreign predictor.

    The `targets` must be a single categorical stattype.  The `conditions`
    may be arbitrary numerical or categorical columns.
    """
    @classmethod
    def create(cls, bdb, table, targets, conditions):
        cols = [c for c, _ in targets + conditions]
        df = bdbcontrib.bql_utils.table_to_df(bdb, table, cols)
        rf = cls()
        rf.train(df, targets, conditions)
        rf.prng = bdb.np_prng
        return rf

    @classmethod
    def serialize(cls, _bdb, pred):
        state = {
            'targets': pred.targets,
            'conditions_numerical': pred.conditions_numerical,
            'conditions_categorical': pred.conditions_categorical,
            'rf_full': pred.rf_full,
            'rf_partial': pred.rf_partial,
            'categories_to_val_map': pred.categories_to_val_map
        }
        return pickle.dumps(state)

    @classmethod
    def deserialize(cls, bdb, binary):
        state = pickle.loads(binary)
        rf = cls(targets=state['targets'],
                 conditions_numerical=state['conditions_numerical'],
                 conditions_categorical=state['conditions_categorical'],
                 rf_full=state['rf_full'],
                 rf_partial=state['rf_partial'],
                 categories_to_val_map=state['categories_to_val_map'])
        rf.prng = bdb.np_prng
        return rf

    @classmethod
    def name(cls):
        return 'random_forest'

    def __init__(self,
                 targets=None,
                 conditions_numerical=None,
                 conditions_categorical=None,
                 rf_full=None,
                 rf_partial=None,
                 categories_to_val_map=None):
        self.targets = targets
        self.conditions_numerical = conditions_numerical
        self.conditions_categorical = conditions_categorical
        if (conditions_numerical is not None
                and conditions_categorical is not None):
            self.conditions = self.conditions_numerical + \
                self.conditions_categorical
        self.rf_full = rf_full
        self.rf_partial = rf_partial
        self.categories_to_val_map = categories_to_val_map

    def train(self, df, targets, conditions):
        # Obtain the targets column.
        if len(targets) != 1:
            raise BLE(
                ValueError('RandomForest requires exactly one column in '
                           'targets. Received {}'.format(targets)))
        if targets[0][1].lower() != 'categorical':
            raise BLE(
                ValueError('RandomForest can only classify CATEGORICAL '
                           'columns. Received {}'.format(targets)))
        self.targets = [targets[0][0]]
        # Obtain the condition columns.
        if len(conditions) < 1:
            raise BLE(
                ValueError('RandomForest requires at least one column in '
                           'conditions. Received {}'.format(conditions)))
        self.conditions_categorical = []
        self.conditions_numerical = []
        for c in conditions:
            if c[1].lower() == 'categorical':
                self.conditions_categorical.append(c[0])
            else:
                self.conditions_numerical.append(c[0])
        self.conditions = self.conditions_numerical + \
            self.conditions_categorical
        # The dataset.
        self.dataset = pd.DataFrame()
        # Lookup for categoricals to code.
        self.categories_to_val_map = dict()
        # Training set (regressors and labels)
        self.X_numerical = np.ndarray(0)
        self.X_categorical = np.ndarray(0)
        self.Y = np.ndarray(0)
        # Random Forests.
        self.rf_partial = RandomForestClassifier(n_estimators=100)
        self.rf_full = RandomForestClassifier(n_estimators=100)
        # Preprocess the data.
        self.dataset = utils.extract_sklearn_dataset(self.conditions,
                                                     self.targets, df)
        self.categories_to_val_map = utils.build_categorical_to_value_map(
            self.conditions_categorical, self.dataset)
        self.X_categorical = utils.extract_sklearn_features_categorical(
            self.conditions_categorical, self.categories_to_val_map,
            self.dataset)
        self.X_numerical = utils.extract_sklearn_features_numerical(
            self.conditions_numerical, self.dataset)
        self.Y = utils.extract_sklearn_univariate_target(
            self.targets, self.dataset)
        # Train the random forest.
        self._train_rf()

    def _train_rf(self):
        """Trains the random forests classifiers.

        We train two classifiers, `partial` which is just trained on
        `conditions_numerical`, and `full` which is trained on
        `conditions_numerical+conditions_categorical`.

        This safeguard is critical for querying: without it, sklearn would
        crash whenever it was passed a categorical value that exists in df
        but, due to filtering, was never seen during training.
        """
        # pylint: disable=no-member
        self.rf_partial.fit(self.X_numerical, self.Y)
        self.rf_full.fit(
            np.hstack((self.X_numerical, self.X_categorical)), self.Y)

    def _compute_targets_distribution(self, conditions):
        """Given conditions dict {feature_col:val}, returns the
        distribution and (class mapping for lookup) of the random label
        self.targets|conditions.
        """
        if not set(self.conditions).issubset(set(conditions.keys())):
            raise BLE(
                ValueError('Must specify values for all the conditionals.\n'
                           'Received: {}\n'
                           'Expected: {}'.format(
                               conditions, self.conditions_numerical +
                               self.conditions_categorical)))

        # Are there any category values in conditions which never appeared during
        # training? If yes, we need to run the partial RF.
        unseen = any([
            conditions[cat] not in self.categories_to_val_map[cat]
            for cat in self.conditions_categorical
        ])

        X_numerical = [conditions[col] for col in self.conditions_numerical]
        if unseen:
            distribution = self.rf_partial.predict_proba(X_numerical)
            classes = self.rf_partial.classes_
        else:
            X_categorical = [
                conditions[col] for col in self.conditions_categorical
            ]
            X_categorical = utils.binarize_categorical_row(
                self.conditions_categorical, self.categories_to_val_map,
                X_categorical)
            distribution = self.rf_full.predict_proba(
                np.hstack((X_numerical, X_categorical)))
            classes = self.rf_full.classes_
        return distribution[0], classes

    def simulate(self, n_samples, conditions):
        distribution, classes = self._compute_targets_distribution(conditions)
        draws = self.prng.multinomial(1, distribution, size=n_samples)
        return [classes[np.where(d == 1)[0][0]] for d in draws]

    def logpdf(self, value, conditions):
        distribution, classes = self._compute_targets_distribution(conditions)
        if value not in classes:
            return -float('inf')
        return np.log(distribution[np.where(classes == value)[0][0]])
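One portability caveat for the class above: _compute_targets_distribution hands plain 1-D lists to predict_proba, which old scikit-learn tolerated but current releases reject. A hedged sketch of the reshape that modern versions require (rf_partial and the conditions variables stand in for the class attributes):

import numpy as np

row = [conditions[col] for col in conditions_numerical]  # 1-D list, as built in the class
row = np.asarray(row, dtype=float).reshape(1, -1)        # make it shape (1, n_features)
distribution = rf_partial.predict_proba(row)[0]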
Code example #11
File: SVM&RF.py Project: doriszyj/SVM-RF
def RFClassify(trainData,trainLabel,testData):
    rfClf=RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)
    rfClf.fit(trainData, trainLabel)
    testlabel=rfClf.predict(testData)
    return testlabel
Code example #12
File: classificator.py Project: junk2112/detector
def make_test(train_source, test_source, light_type=None, validation=False, v_size=0.5, estimators=85):
	train = read_csv(train_source)
	feature_count = train.shape[1]  # column count; no need to re-read the raw file

	trainX = np.asarray(train[list(range(1, feature_count))])
	trainY = np.asarray(train[[0]]).ravel()
	# print "All Data size: " + str(len(trainX))
	testX = None
	testY = None

	if validation:
		# --- CROSS VALIDATION ---
		# sklearn.cross_validation was removed; the split now lives in sklearn.model_selection
		trainX, testX, trainY, testY = model_selection.train_test_split(
			trainX, trainY, test_size=v_size, random_state=0)
	else:
		# --- TEST DATA ---
		test = read_csv(test_source)
		testX = np.asarray(test[list(range(1, feature_count))])
		testY = np.asarray(test[[0]]).ravel()
	if len(testX) < 100:
		return 0
	print "Train size: " + str(len(trainX))
	print "Test size: " + str(len(testX))

	# --- KNN ---
	# clf = KNeighborsClassifier(metric='minkowski', n_neighbors=1, p=2)

	# --- SVM ---
	# clf = svm.SVC()
	# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
	# gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None,
	# shrinking=True, tol=0.001, verbose=False)

	# --- Random Forest ---
	clf = RandomForestClassifier(n_estimators=estimators)


	clf.fit(trainX, trainY)

	true_false = 0
	true_true = 0
	false_true = 0
	false_false = 0
	true = 0
	false = 0
	for i in range(len(testY)):
		answer = clf.predict(testX[i].reshape(1, -1))  # predict expects a 2-D array
		if testY[i] == True:
			true += 1
		else:
			false += 1
		# print str(answer[0]) + " " + str(testY[i])
		if answer[0] == True and testY[i] == False:
			true_false += 1
		if answer[0] == True and testY[i] == True:
			true_true += 1
		if answer[0] == False and testY[i] == False:
			false_false += 1
		if answer[0] == False and testY[i] == True:
			false_true += 1
	if validation:
		if true > 0:
			print light_type + " true_true (precision): " + str(float(true_true)/float(true))
			print light_type + " false_true: " + str(float(false_true)/float(true))
		if false > 0:
			print light_type + " true_false: " + str(float(true_false)/float(false))
			print light_type + " false_false (precision): " + str(float(false_false)/float(false))

	result = clf.score(testX, testY)
	print("Main accuracy for " + light_type + ": " + str(result))
	return result
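The manual counting loop above re-implements a confusion matrix; a sketch of the same bookkeeping with scikit-learn's metrics, using the same clf, testX and testY:

from sklearn.metrics import confusion_matrix

pred = clf.predict(testX)  # vectorised, no per-row loop needed
tn, fp, fn, tp = confusion_matrix(testY, pred, labels=[False, True]).ravel()
print("recall: %s  specificity: %s" % (tp / (tp + fn), tn / (tn + fp)))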
Code example #13
# Bin the model matrix; the binned matrix is `stat`
stat, bin_edges, binnum = stats.binned_statistic(range(X.shape[1]), X, 'median', bins=int(bin_num))

# MODULE 3
# Apply the model and print the result

if model == 'svm.LinearSVC()':
    clf=svm.LinearSVC(C=parameter)
if model == 'RandomForestClassifier()': 
    clf=RandomForestClassifier(n_estimators=parameters,n_jobs=-1)
if model == 'LinearDiscriminantAnalysis()':
    clf=LinearDiscriminantAnalysis()

out = clf.fit(stat, y)
# The classifier fit_transform API was removed; SelectFromModel yields the reduced matrix
# (requires: from sklearn.feature_selection import SelectFromModel).
output = SelectFromModel(clf, prefit=True).transform(stat)

'''
# Plot SVM contour
# Can't plot it because it is 160 dimensions
h = .02  # step size in the mesh
x_min, x_max = output[:, 0].min() - 1, output[:, 0].max() + 1
y_min, y_max = output[:, 1].min() - 1, output[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))
arr_conc = np.c_[xx.ravel(), yy.ravel()] # concatenate two arrays together
#print (arr_conc)
#print (arr_conc.shape)
Z = clf.predict(arr_conc)
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
'''
Code example #14
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)  # fit the scaler on the training set only; reuse it for test

# _________________________________________________

# Matrix
from sklearn.metrics import accuracy_score, log_loss, classification_report, confusion_matrix, roc_curve, auc

from pandas_ml import ConfusionMatrix  # pandas_ml offers a richer confusion matrix than scikit-learn's
# ______________________________________________________________________________________________________________

# Machine Learning Classifiers
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_artificial = model.predict(X_test)  # Predicted

# Evaluation

TN, FP, FN, TP = confusion_matrix(y_true=y_test, y_pred=y_artificial).ravel()
print('_' * 43)
print('Classifier : RandomForestClassifier')
print('Accuracy : {0:.3f} %'.format(
    accuracy_score(y_true=y_test, y_pred=y_artificial) * 100.0))
# print('_'*40)

print()
print('Confusion Matrix :')
CM = ConfusionMatrix(y_true=y_test, y_pred=y_artificial)
print(CM)
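pandas_ml has long been unmaintained and no longer works against current pandas and scikit-learn; a labelled confusion matrix can be built from pandas alone (same y_test and y_artificial as above):

import pandas as pd

cm = pd.crosstab(pd.Series(y_test, name='Actual'),
                 pd.Series(y_artificial, name='Predicted'))
print(cm)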
Code example #15
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from feature_set import get_all_feature_sets

setts = get_all_feature_sets()

keys = []
scores = []

for key in setts:

    keys.append(key)

    print("Loading %sand associated labels..." % key)

    examples = setts[key]['train_X']
    labels = setts[key]['train_Y']
    test_examples = setts[key]['test_X']

    test_labels = setts[key]['test_Y']
    Forest = RandomForestClassifier(100)

    Forest.fit(examples, labels)

    score = Forest.score(test_examples, test_labels)
    print(score)

    scores.append(score)

with open("../data/random_forest_all_feature_combinations.txt", "wb") as f:
    for i in range(len(keys)):
        f.write("%s, %d\n" % (keys[i], scores[i]))
Code example #16
    X = scale_data(X)
    print("Features Data scaled")

#    SGD = SGDClassifier(penalty='elasticnet',class_weight='auto',n_jobs=-1,n_iter=35,l1_ratio =0.2)
    svc = LinearSVC(class_weight='balanced')  # 'auto' was renamed 'balanced'
    model_rf = RandomForestClassifier(n_jobs=-1, bootstrap=True, n_estimators=180,
                                        min_samples_leaf=3, min_samples_split=3,
                                        criterion='gini', max_depth=6)  # compute_importances was removed from scikit-learn

    SVC_RBF= SVC(kernel="rbf", class_weight="auto", cache_size=2600, shrinking=True)
    SVC_linear= SVC(kernel="poly", cache_size=2700, shrinking=True)


    # model_rf.fit(X,y)
    # X_SGD = model_rf.transform(X, threshold='1.5*mean') # forests!
    # (the classifier fit_transform API was removed; SelectFromModel replaces it,
    #  via: from sklearn.feature_selection import SelectFromModel)
    model_rf.fit(X, y)
    X_SGD = SelectFromModel(model_rf, prefit=True).transform(X)
    print('X Reduced (by RF) features amount:')
    print(X_SGD.shape)

    def ReducedFeaturesDF(X,y):
        '''
        Returns a dataframe with only a subset of features/columns retained
        '''
        from sklearn.feature_selection import RFE
        est = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, class_weight='balanced')  # 'l2' loss and 'auto' were renamed
#        selectK = SelectKBest(score_func = f_classif, k=45)
        selectRFE = RFE(estimator=est, n_features_to_select=22, step=0.15)
        selectK=selectRFE

        selectK.fit(X,y)
        selectK_mask=selectK.get_support()
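The excerpt ends just after the support mask is computed; presumably the mask is then used to subset the feature columns. A hypothetical continuation, assuming X is a pandas DataFrame:

        # Hypothetical continuation: keep only the columns RFE selected.
        return X.loc[:, selectK_mask]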
Code example #17
File: random_forest.py Project: jayelm/bdbcontrib
class RandomForest(predictor.IBayesDBForeignPredictor):
    """A Random Forest foreign predictor.

    The `targets` must be a single categorical stattype.  The `conditions`
    may be arbitrary numerical or categorical columns.
    """

    @classmethod
    def create(cls, bdb, table, targets, conditions):
        cols = [c for c,_ in targets+conditions]
        df = bdbcontrib.table_to_df(bdb, table, cols)
        rf = cls()
        rf.train(df, targets, conditions)
        rf.prng = bdb.np_prng
        return rf

    @classmethod
    def serialize(cls, _bdb, pred):
        state = {
            'targets': pred.targets,
            'conditions_numerical': pred.conditions_numerical,
            'conditions_categorical': pred.conditions_categorical,
            'rf_full': pred.rf_full,
            'rf_partial': pred.rf_partial,
            'categories_to_val_map': pred.categories_to_val_map
        }
        return pickle.dumps(state)

    @classmethod
    def deserialize(cls, bdb, binary):
        state = pickle.loads(binary)
        rf = cls(targets=state['targets'],
            conditions_numerical=state['conditions_numerical'],
            conditions_categorical=state['conditions_categorical'],
            rf_full=state['rf_full'], rf_partial=state['rf_partial'],
            categories_to_val_map=state['categories_to_val_map'])
        rf.prng = bdb.np_prng
        return rf

    @classmethod
    def name(cls):
        return 'random_forest'

    def __init__(self, targets=None, conditions_numerical=None,
            conditions_categorical=None, rf_full=None, rf_partial=None,
            categories_to_val_map=None):
        self.targets = targets
        self.conditions_numerical = conditions_numerical
        self.conditions_categorical = conditions_categorical
        if (conditions_numerical is not None
                and conditions_categorical is not None):
            self.conditions = self.conditions_numerical + \
                self.conditions_categorical
        self.rf_full = rf_full
        self.rf_partial = rf_partial
        self.categories_to_val_map = categories_to_val_map

    def train(self, df, targets, conditions):
        # Obtain the targets column.
        if len(targets) != 1:
            raise BLE(ValueError('RandomForest requires exactly one column in '
                'targets. Received {}'.format(targets)))
        if targets[0][1].lower() != 'categorical':
            raise BLE(ValueError('RandomForest can only classify CATEGORICAL '
                'columns. Received {}'.format(targets)))
        self.targets = [targets[0][0]]
        # Obtain the condition columns.
        if len(conditions) < 1:
            raise BLE(ValueError('RandomForest requires at least one column in '
                'conditions. Received {}'.format(conditions)))
        self.conditions_categorical = []
        self.conditions_numerical = []
        for c in conditions:
            if c[1].lower() == 'categorical':
                self.conditions_categorical.append(c[0])
            else:
                self.conditions_numerical.append(c[0])
        self.conditions = self.conditions_numerical + \
            self.conditions_categorical
        # The dataset.
        self.dataset = pd.DataFrame()
        # Lookup for categoricals to code.
        self.categories_to_val_map = dict()
        # Training set (regressors and labels)
        self.X_numerical = np.ndarray(0)
        self.X_categorical = np.ndarray(0)
        self.Y = np.ndarray(0)
        # Random Forests.
        self.rf_partial = RandomForestClassifier(n_estimators=100)
        self.rf_full = RandomForestClassifier(n_estimators=100)
        # Preprocess the data.
        self.dataset = utils.extract_sklearn_dataset(self.conditions,
            self.targets, df)
        self.categories_to_val_map = utils.build_categorical_to_value_map(
            self.conditions_categorical, self.dataset)
        self.X_categorical = utils.extract_sklearn_features_categorical(
            self.conditions_categorical, self.categories_to_val_map,
            self.dataset)
        self.X_numerical = utils.extract_sklearn_features_numerical(
            self.conditions_numerical, self.dataset)
        self.Y = utils.extract_sklearn_univariate_target(self.targets,
            self.dataset)
        # Train the random forest.
        self._train_rf()

    def _train_rf(self):
        """Trains the random forests classifiers.

        We train two classifiers, `partial` which is just trained on
        `conditions_numerical`, and `full` which is trained on
        `conditions_numerical+conditions_categorical`.

        This safeguard is critical for querying: without it, sklearn would
        crash whenever it was passed a categorical value that exists in df
        but, due to filtering, was never seen during training.
        """
        # pylint: disable=no-member
        self.rf_partial.fit(self.X_numerical, self.Y)
        self.rf_full.fit(
            np.hstack((self.X_numerical, self.X_categorical)), self.Y)

    def _compute_targets_distribution(self, conditions):
        """Given conditions dict {feature_col:val}, returns the
        distribution and (class mapping for lookup) of the random label
        self.targets|conditions.
        """
        if not set(self.conditions).issubset(set(conditions.keys())):
            raise BLE(ValueError(
                'Must specify values for all the conditionals.\n'
                'Received: {}\n'
                'Expected: {}'.format(conditions, self.conditions_numerical +
                self.conditions_categorical)))

        # Are there any category values in conditions which never appeared during
        # training? If yes, we need to run the partial RF.
        unseen = any([conditions[cat] not in self.categories_to_val_map[cat]
            for cat in self.conditions_categorical])

        X_numerical = [conditions[col] for col in self.conditions_numerical]
        if unseen:
            distribution = self.rf_partial.predict_proba(X_numerical)
            classes = self.rf_partial.classes_
        else:
            X_categorical = [conditions[col] for col in
                self.conditions_categorical]
            X_categorical = utils.binarize_categorical_row(
                self.conditions_categorical, self.categories_to_val_map,
                X_categorical)
            distribution = self.rf_full.predict_proba(
                np.hstack((X_numerical, X_categorical)))
            classes = self.rf_full.classes_
        return distribution[0], classes

    def simulate(self, n_samples, conditions):
        distribution, classes = self._compute_targets_distribution(conditions)
        draws = self.prng.multinomial(1, distribution, size=n_samples)
        return [classes[np.where(d==1)[0][0]] for d in draws]

    def logpdf(self, value, conditions):
        distribution, classes = self._compute_targets_distribution(conditions)
        if value not in classes:
            return -float('inf')
        return np.log(distribution[np.where(classes==value)[0][0]])
Code example #18
# Import metrics for scoring
#==============================================================================

from sklearn.metrics import classification_report

print(classification_report(test_y, bnb_preds))
print(classification_report(test_y, gnb_preds))
print(classification_report(test_y, mnb_preds))

#%% Random Forest
from sklearn.ensemble import RandomForestClassifier

# Initialize a random forest
rc = RandomForestClassifier(n_estimators=100,
                            max_features='sqrt',  # 'auto' was removed from scikit-learn
                            max_depth=None,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            verbose=1,
                            warm_start=False,
                            class_weight=None)

# Train the forest
rc.fit(train_X, train_y)

# Score the forest
rc_preds = rc.predict(test_X)

# Assess
print(classification_report(test_y, rc_preds))
Code example #19
File: randomforest1.py Project: datu925/codomator
StartRow = 1
StartTest = 12000
EndTest = 13000

train_file = [fin[x] for x in range(StartRow, StartTest) if operating[x] == 'PreK-12 Operating']
train_targets = [targets[x] for x in range(StartRow, StartTest) if operating[x] == 'PreK-12 Operating']
test_file = [fin[x] for x in range(StartTest, EndTest)]


#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1, decode_error='ignore')  # charset_error was renamed decode_error

X_train = vectorizer.fit_transform(train_file)
X_train = X_train.todense()

#from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
#clf = MultinomialNB()
#clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
clf = RandomForestClassifier(n_estimators = 500)  # compute_importances was removed from scikit-learn
#MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
clf.fit(X_train, train_targets)

testdata = vectorizer.transform(test_file)
testdata = testdata.todense()


output_predictions('predictions.csv',testdata, clf)
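output_predictions is a project-specific helper that is not shown. A hypothetical stand-in (name, signature and CSV layout are assumptions) that writes one predicted label per row:

import csv

def output_predictions(path, testdata, clf):
    preds = clf.predict(testdata)
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        for p in preds:
            writer.writerow([p])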
Code example #20
File: benchmark1.py Project: brandoncwn/Gartner
from sklearn.metrics import classification_report

print(classification_report(test_y, bnb_preds))
print(classification_report(test_y, gnb_preds))
print(classification_report(test_y, mnb_preds))

#%% Random Forest
from sklearn.ensemble import RandomForestClassifier

# Initialize a random forest
rc = RandomForestClassifier(n_estimators = 100,
                            max_features = 'sqrt',  # 'auto' was removed from scikit-learn
                            max_depth = None,
                            min_samples_split = 2,
                            min_samples_leaf = 1,
                            verbose = 1,
                            warm_start = False,
                            class_weight = None)

# Train the forest
rc.fit(train_X, train_y)

# Score the forest
rc_preds = rc.predict(test_X)

# Assess
print(classification_report(test_y, rc_preds))


Code example #21
test_columns = ['id', 'k1k2', 'locks_signal', 'emergency_signal', 'access_signal', 'THDV_M', 'THDI_M']
# Read in the data
train = pd.read_csv("/home/zhoujifa/competition/baidu_elecrticity/data_train.csv", names=train_columns)
test = pd.read_csv("/home/zhoujifa/competition/baidu_elecrticity/data_test.csv", names=test_columns)

train = shuffle(train)

# Split the training data with train_test_split; here the train:validation ratio is 7:3, adjust as needed
train_xy, val = train_test_split(train, test_size=0.3, random_state=1)

y = train_xy.label
X = train_xy.drop(['label', 'id', 'THDV_M', 'THDI_M'], axis=1)
val_y = val.label
val_X = val.drop(['label', 'id', 'THDV_M', "THDI_M"], axis=1)

classifier = RandomForestClassifier()
classifier.fit(X, y)
predict_y = classifier.predict(val_X)

acc = accuracy_score(val_y, predict_y)  # don't shadow the accuracy_score function

y_score = classifier.score(X, y)
# auc_score = roc_auc_score(val_y, y_score)

print(acc)
test = test.drop(['id', 'THDV_M', "THDI_M"], axis=1)
preds = classifier.predict(test)

np.savetxt('/home/zhoujifa/competition/baidu_elecrticity/rf_submission.csv', np.c_[range(1, len(test) + 1), preds], delimiter=',', header='ImageId,Label', comments='', fmt='%d')

print(classifier.feature_importances_)
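The commented-out roc_auc_score line above would be wrong even if enabled: classifier.score returns a single accuracy figure, whereas ROC AUC needs a per-sample score. A sketch of the usual call, assuming a binary label:

from sklearn.metrics import roc_auc_score

val_scores = classifier.predict_proba(val_X)[:, 1]  # probability of the positive class
print(roc_auc_score(val_y, val_scores))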
Code example #22
File: Model_Parameters_CV.py Project: zjx1230/ProFET
                                      n_estimators=180,
                                      min_samples_leaf=3,
                                      min_samples_split=3,
                                      criterion='gini',
                                      max_depth=6)  # compute_importances was removed from scikit-learn

    SVC_RBF = SVC(kernel="rbf",
                  class_weight="auto",
                  cache_size=2600,
                  shrinking=True)
    SVC_linear = SVC(kernel="poly", cache_size=2700, shrinking=True)

    # model_rf.fit(X,y)
    # X_SGD = model_rf.transform(X, threshold='1.5*mean') # forests!
    # (the classifier fit_transform API was removed; SelectFromModel replaces it,
    #  via: from sklearn.feature_selection import SelectFromModel)
    model_rf.fit(X, y)
    X_SGD = SelectFromModel(model_rf, prefit=True).transform(X)
    print('X Reduced (by RF) features amount:')
    print(X_SGD.shape)

    def ReducedFeaturesDF(X, y):
        '''
        Returns a dataframe with only a subset of features/columns retained
        '''
        from sklearn.feature_selection import RFE
        est = LinearSVC(penalty='l1',
                        loss='squared_hinge',  # 'l2' was renamed 'squared_hinge'
                        dual=False,
                        class_weight='balanced')  # 'auto' was renamed 'balanced'
        #        selectK = SelectKBest(score_func = f_classif, k=45)
        selectRFE = RFE(estimator=est, n_features_to_select=22, step=0.15)
        selectK = selectRFE
Code example #23
File: getfea.py Project: ybdesire/malware_analysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel  # replacement for the removed fit_transform API
import pickle
import sys
import numpy as np

X1=np.array(pickle.load(open('X2g_train.p', 'rb')))
X2=np.array(pickle.load(open('X3g_train.p', 'rb')))
X3=np.array(pickle.load(open('X4g_train.p', 'rb')))
X4=np.array(pickle.load(open('Xhead_train.p', 'rb')))

X=np.hstack((X2,X1,X3,X4))
y=np.array(pickle.load(open('y.p', 'rb')))
rf=RandomForestClassifier(n_estimators=200)
selector=SelectFromModel(rf)
Xr=selector.fit_transform(X,y)
pickle.dump(Xr,open('X33_train_reproduce.p','wb'))
print(Xr.shape)
del X,X1,X2,X3,X4,Xr

X1=np.array(pickle.load(open('X2g_test.p', 'rb')))
X2=np.array(pickle.load(open('X3g_test.p', 'rb')))
X3=np.array(pickle.load(open('X4g_test.p', 'rb')))
X4=np.array(pickle.load(open('Xhead_test.p', 'rb')))
X=np.hstack((X2,X1,X3,X4))
Xr=selector.transform(X)
pickle.dump(Xr,open('X33_test_reproduce.p','wb'))
print(Xr.shape)
Code example #24
D = pairwise_distances(X)
D.shape

#visualize
plt.imshow(D, zorder=2, cmap='Blues', interpolation='nearest')
plt.colorbar()

#distance matrix for rotated and translated data
D2 = pairwise_distances(X2)
np.allclose(D, D2)
#This distance matrix gives us a representation of our data that is invariant to rotations and translations

#Recovering x and y coordinates from only a distance matrix is not always intuitive; MDS does exactly that
from sklearn.manifold import MDS
model = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
out = model.fit_transform(D)
plt.scatter(out[:, 0], out[:, 1], **colorize)
plt.axis('equal')


#This becomes more useful once you notice that a distance matrix can be computed from data in any dimension
def random_projection(X, dimension=3, rseed=42):
    assert dimension >= X.shape[1]
    rng = np.random.RandomState(rseed)
    C = rng.randn(dimension, dimension)
    e, V = np.linalg.eigh(np.dot(C, C.T))
    return np.dot(X, V[:X.shape[1]])


X3 = random_projection(X, 3)
X3.shape
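The payoff of that construction is that the random rotation into three dimensions leaves every pairwise distance intact, which a quick check confirms (same D and X3 as above):

D3 = pairwise_distances(X3)
print(np.allclose(D, D3))  # True: the embedding preserves the distance matrix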
Code example #25
            title_cluster = 'Chess KM Clusters'
            file_cluster = 'plots/chess_km_cluster.png'
        km = KMeans(n_clusters=clusters, random_state=0).fit(dr_chess_trgX)
        # Plot histogram of cluster purities
        plot.plot_tsne(tx_data, km.predict(dr_chess_trgX), title_cluster,
                       file_cluster)
        plot.plot_tsne(tx_data, chess_trgY, "Chess KM Real Labels",
                       "plots/chess_km_cluster_real_labels.png")
    if run_em:
        clusters = data_funcs.best_cluster_count("chess", "em")
        if best_dr:
            comp_count = data_funcs.best_comp_count("chess", "PCA")
            title_cluster = 'Chess EM Clusters with PCA DR'
            file_cluster = 'plots/chess_em_cluster_w_PCA_DR.png'
            dr = PCA(n_components=comp_count, random_state=0)
            dr_chess_trgX = dr.fit_transform(chess_trgX)
        else:
            dr_chess_trgX = chess_trgX
            title_cluster = 'Chess EM Clusters'
            file_cluster = 'plots/chess_em_cluster.png'
        em = GaussianMixture(n_components=clusters,
                             random_state=0).fit(dr_chess_trgX)
        plot.plot_tsne(tx_data, em.predict(dr_chess_trgX), title_cluster,
                       file_cluster)
        plot.plot_tsne(tx_data, chess_trgY, "Chess EM Real Labels",
                       "plots/chess_em_cluster_real_labels.png")
if run_fmnist:
    tx_data = TSNE(random_state=0).fit_transform(fmnist_trgX)
    if run_kmeans:
        clusters = data_funcs.best_cluster_count("fmnist", "km")
        if best_dr:
Code example #26
del labels

# Parameters for the random forest / extra-trees classifiers
random_state = 5342
n_jobs = 8
verbose = 2
clf1 = ExtraTreesClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf2 = ExtraTreesClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf3 = RandomForestClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)
clf4 = RandomForestClassifier(criterion='entropy', random_state=random_state, n_jobs=n_jobs, verbose=verbose)

# Start training
print('training started')
clf1.fit(train[:, :-1], train[:, -1])
# The old ensemble transform/fit_transform API was removed; SelectFromModel replaces it
# (via: from sklearn.feature_selection import SelectFromModel).
X_new1 = SelectFromModel(clf1, prefit=True).transform(train[:, :-1])
clf3.fit(train[:, :-1], train[:, -1])
X_new2 = SelectFromModel(clf3, prefit=True).transform(train[:, :-1])
# print('importances', clf1.feature_importances_)
clf2.fit(X_new1, train[:, -1])
clf4.fit(X_new2, train[:, -1])
print('training completed')

print('n_components = ', len(X_new1[0]), len(X_new2[0]))

# We don't need training set now
del train

# Dimensions for train set
ntest = 10873
nfeature = 16 ** 2 + 1  # For two_byte_codes, no_que_marks
test = np.zeros((ntest, nfeature), dtype=int)
Ids = []  # Required test set ids
Code example #27
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel  # replacement for the removed fit_transform API
import pickle
import sys
import numpy as np

X1 = np.array(pickle.load(open('X2g_train.p', 'rb')))
X2 = np.array(pickle.load(open('X3g_train.p', 'rb')))
X3 = np.array(pickle.load(open('X4g_train.p', 'rb')))
X4 = np.array(pickle.load(open('Xhead_train.p', 'rb')))

X = np.hstack((X2, X1, X3, X4))
y = np.array(pickle.load(open('y.p', 'rb')))
rf = RandomForestClassifier(n_estimators=200)
selector = SelectFromModel(rf)
Xr = selector.fit_transform(X, y)
pickle.dump(Xr, open('X33_train_reproduce.p', 'wb'))
print(Xr.shape)
del X, X1, X2, X3, X4, Xr

X1 = np.array(pickle.load(open('X2g_test.p', 'rb')))
X2 = np.array(pickle.load(open('X3g_test.p', 'rb')))
X3 = np.array(pickle.load(open('X4g_test.p', 'rb')))
X4 = np.array(pickle.load(open('Xhead_test.p', 'rb')))
X = np.hstack((X2, X1, X3, X4))
Xr = selector.transform(X)
pickle.dump(Xr, open('X33_test_reproduce.p', 'wb'))
print(Xr.shape)