def loadData(path="../data/",k=5,log='add',pca_n=0,SEED=34):
	from pandas import DataFrame, read_csv
	from numpy import log as ln
	from sklearn.model_selection import KFold
	from sklearn.preprocessing import LabelEncoder
	from sklearn.preprocessing import StandardScaler
	train = read_csv(path+"train.csv")
	test = read_csv(path+"test.csv")
	id = test.id
	target = train.target
	encoder = LabelEncoder()
	target_nnet = encoder.fit_transform(target).astype('int32')
	feat_names = [x for x in train.columns if x.startswith('feat')]
	train = train[feat_names].astype(float)
	test = test[feat_names]
	if log == 'add':
		for v in train.columns:
			train[v+'_log'] = ln(train[v]+1)
			test[v+'_log'] = ln(test[v]+1)
	elif log == 'replace':
		for v in train.columns:
			train[v] = ln(train[v]+1)
			test[v] = ln(test[v]+1)      
	if pca_n > 0:
		from sklearn.decomposition import PCA
		pca = PCA(pca_n)
		train = pca.fit_transform(train)
		test = pca.transform(test)
	scaler = StandardScaler()
	scaler.fit(train)
	train = DataFrame(scaler.transform(train),columns=['feat_'+str(x) for x in range(train.shape[1])])
	test = DataFrame(scaler.transform(test),columns=['feat_'+str(x) for x in range(train.shape[1])])
	cv = list(KFold(n_splits=k, shuffle=True, random_state=SEED).split(train))
	return train, test, target, target_nnet, id, cv, encoder
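# Usage sketch (an assumption, not part of the original source): calling loadData and
# iterating the returned CV splits.
train, test, target, target_nnet, test_id, cv, encoder = loadData(path="../data/", k=5, log='add')
for fold, (tr_idx, va_idx) in enumerate(cv):
    X_tr, X_va = train.iloc[tr_idx], train.iloc[va_idx]
    y_tr, y_va = target.iloc[tr_idx], target.iloc[va_idx]
    print(fold, X_tr.shape, X_va.shape)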
Example 2
def prepare_items_features(user_items_csv, out_dir):
    array = np.loadtxt(user_items_csv, delimiter='|',
            dtype=np.dtype(np.uint64))

    le = LabelEncoder()
    col1 = le.fit_transform(array[:, 1].T)
    col2 = le.fit_transform(array[:, 2].T)
    col3 = le.fit_transform(array[:, 3].T)
    col4 = le.fit_transform(array[:, 4].T)

    columns = np.array([col1, col2, col3, col4]).T
    enc = OneHotEncoder()
    print(array[:10])
    encoded = np.c_[array[:, 0], enc.fit_transform(columns).toarray()]
    print(encoded[:10])
    print(encoded.shape)

    user_id = encoded[0][0]
    rows = []
    current = np.zeros(encoded.shape[1]-1)
    for i in range(encoded.shape[0]):
        if encoded[i][0] != user_id:
            rows.append(np.concatenate([[user_id], current]))
            user_id = encoded[i][0]
            current = np.zeros(encoded.shape[1]-1)
        # accumulate the current row as well, so the first item of a new user is not dropped
        current = np.sum([current, encoded[i, 1:]], axis=0)
    rows.append(np.concatenate([[user_id], current]))

    array = np.array(rows)
    print(array.shape)

    # let's serialize array
    np.save(os.path.join(out_dir, "user_items"), array)
Example 3
    def transformTestData(self, train_data, test_data):
        #Select the right features for both training and testing data
        X_train, y_train = self.__selectRelevantFeatures(train_data)
        X_test, y_test = self.__selectRelevantFeatures(test_data)

        #Transform categorical variables into integer labels
        marital_le = LabelEncoder()
        occupation_le = LabelEncoder()
        relationship_le = LabelEncoder()
        race_le = LabelEncoder()
        sex_le = LabelEncoder()
        transformers = [marital_le, occupation_le, relationship_le, race_le, sex_le]

        for i in range(len(transformers)):
            X_train[:,i] = transformers[i].fit_transform(X_train[:,i])
            X_test[:,i] = transformers[i].transform(X_test[:,i])

        #Dummy code categorical variables
        dummy_code = OneHotEncoder(categorical_features = range(5))
        X_train = dummy_code.fit_transform(X_train).toarray()
        X_test = dummy_code.transform(X_test).toarray()

        #Normalize all features
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        #Encode y
        class_le = LabelEncoder()
        y_train = class_le.fit_transform(y_train)
        y_test = class_le.transform(y_test)
        #print class_le.transform(["<=50K", ">50K"])

        return X_train, X_test, y_train, y_test
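# Side note (illustrative values, not from the original data): LabelEncoder sorts the classes
# seen during fit and raises on unseen labels at transform time, which is why the encoders
# above are fit on the training split only.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(["Married", "Single", "Divorced"])   # classes_ -> ['Divorced', 'Married', 'Single']
print(le.transform(["Single", "Married"]))  # [2 1]
print(le.inverse_transform([0, 2]))         # ['Divorced' 'Single']
# le.transform(["Widowed"])                 # ValueError: y contains previously unseen labels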
Example 4
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'], weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
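# A hedged sketch (file and column names are assumptions) of how the returned top-3 labels
# could be written in the usual space-separated submission format:
import pandas as pd

pred_labels, row_ids = process_one_cell(df_train, df_test, grid_id=0, th=5)
sub = pd.DataFrame(index=row_ids)
sub['place_id'] = [' '.join(str(p) for p in row) for row in pred_labels]
sub.to_csv('submission_cell_0.csv', index_label='row_id')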
Example 5
    def train(cls, X, y, word_sim_metric, classifier=LinearSVC,
              feature_num=10, feature_type='sim', verbose=True):

        if isinstance(classifier, type):
            classifier = classifier()

        labels = LabelEncoder()
        y_train = labels.fit_transform(y)

        @timeit
        def build():

            corpus = zip(X, y)
            model = Pipeline([
                ('preprocessor', TextPreprocessor(corpus, word_sim_metric, feature_num, feature_type)),
                ('vectorizer', DictVectorizer()),
                ('classifier', classifier),
            ])

            model.fit(X, y_train)
            return model

        if verbose: print("Building the model")
        model, secs = build()
        if verbose: print("Complete model building in {:0.3f} seconds".format(secs))

        return cls(labels, model)
Example 6
def buildTreeClassifier(predictorColumns, structurestable = 'structures.csv',  targetcolumn = 'pointGroup', md = None):
    """
    Build a random forest classifier to predict a structure feature from compositional data. Returns the model trained on all data, a confusion matrix from a held-out split, the average cross-validated accuracy score, and the label encoder object.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    s = StandardScaler()
    le = LabelEncoder()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = le.fit_transform(df[targetcolumn].values)

    rfc = RandomForestClassifier(max_depth = md)
    acc = mean(cross_val_score(rfc, X, y))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfc.fit(X_train,y_train)
    y_predict = rfc.predict(X_test)
    cm = confusion_matrix(y_test, y_predict)
    
    cm = pd.DataFrame(cm, columns=le.classes_, index=le.classes_)

    rfc.fit(X, y)

    return rfc, cm, round(acc,2), le
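# A minimal call sketch; the predictor column names below are hypothetical placeholders.
predictors = ['fracA', 'fracB', 'density']
rfc, cm, acc, le = buildTreeClassifier(predictors, structurestable='structures.csv',
                                       targetcolumn='pointGroup', md=10)
print(acc)
print(cm)   # confusion matrix, rows/columns labelled with le.classes_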
def load_train_data(path):
    print("Loading Train Data")
    df = pd.read_csv(path)
    
    
    # Remove line below to run locally - Be careful you need more than 8GB RAM 
    rows = np.random.choice(df.index.values, 40000, replace=False)
    df = df.loc[rows]
    # df = df.sample(n=40000)
    # df = df.loc[df.index]
    
    labels = df.target

    df = df.drop('target',1)
    df = df.drop('ID',1)
    
    # Junk cols - Some feature engineering needed here
    df = df.fillna(-1)

    X = df.values.copy().astype(np.float32)

    # shuffle features and labels together so the rows stay aligned
    perm = np.random.permutation(len(X))
    X = X[perm]
    labels = labels.values[perm]
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y, encoder, scaler
Example 8
def load_otto_group():
    """
    Loads and returns several variables for the data set from Kaggle's Otto Group Product Classification competition.
    Link: https://www.kaggle.com/c/otto-group-product-classification-challenge

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.

    X : array-like
        Training input samples.

    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'otto_group.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('id')

    # move the label to the first position
    cols = data.columns.tolist()
    cols = cols[-1:] + cols[0:-1]
    data = data[cols]

    X = data.iloc[:, 1:].values

    y = data.iloc[:, 0].values

    # transform the labels from strings to integers
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    return data, X, y
Example 9
    def auto_alpha2num(self, col):
        from sklearn.preprocessing import LabelEncoder

        le = LabelEncoder()
        for i in col:
            self.df[i] = le.fit_transform(self.df[i])
        return
def process_one_cell(df_cell_train, df_cell_test):
    
    #Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]
    
    #Working on df_test
    row_ids = df_cell_test.index
    
    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= 500.0
    df_cell_train.loc[:,'y'] *= 1000.0
    df_cell_test.loc[:,'x'] *= 500.0
    df_cell_test.loc[:,'y'] *= 1000.0
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance, 
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) 
    
    return pred_labels, row_ids
Example 11
class Classifier(BaseEstimator):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.clf = None        
 
    def fit(self, X, y):        
        X = self.scaler.fit_transform(X.astype(np.float32))              
        y = self.label_encoder.fit_transform(y).astype(np.int32)
        dtrain = xgb.DMatrix( X, label=y.astype(np.float32))
        
        param = {'objective':'multi:softprob', 'eval_metric':'mlogloss'}
        param['nthread'] = 4
        param['num_class'] = 9
        param['colsample_bytree'] = 0.55
        param['subsample'] = 0.85
        param['gamma'] = 0.95
        param['min_child_weight'] = 3.0
        param['eta'] = 0.05
        param['max_depth'] = 12
        num_round = 400 # to be faster ??  
        #num_round = 820
        
        self.clf = xgb.train(param, dtrain, num_round)  
 
    def predict(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)       
        label_index_array = np.argmax(self.clf.predict(dtest), axis=1)
        return self.label_encoder.inverse_transform(label_index_array)
 
    def predict_proba(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
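# A hedged usage sketch of the xgboost-backed Classifier above (X_train/X_valid/y_train are
# placeholders for a 9-class dataset, matching param['num_class'] = 9):
clf = Classifier()
clf.fit(X_train, y_train)              # y_train may hold the original string labels
proba = clf.predict_proba(X_valid)     # (n_samples, 9) class probabilities
labels = clf.predict(X_valid)          # labels mapped back through the internal LabelEncoder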
Example 12
def test_multiclass_classifier_class_weight():
    """tests multiclass with classweights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]

    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    for cl in classes:
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1

        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)

    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)

    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
Example 13
def load_kernel_matrix(data_path='data', study='wl_kernel', verbose=True):
    """Loading already computed kernel matrix.
    Parameters:
    ---------
    data_path: string
        Path to the data folder.
    study: string
        Name of the folder containing the study, e.g. 'wl_kernel', which
        contains the WL kernel matrix.
    verbose: bool
    """
    path_k_matrix = os.path.join(data_path, 'precomputed_kernels',
                                 study, 'k_matrix.csv')
    path_cls = os.path.join(data_path, 'precomputed_kernels', study,
                            'class_labels.csv')

    K = np.loadtxt(path_k_matrix)
    y = np.loadtxt(path_cls)

    le = LabelEncoder()
    y = le.fit_transform(y)

    if verbose:
        print('n_samples: %s, n_samples_by_class: (%s - %s)' % (len(y),
                                                                 len(y[y == 0]),
                                                                 len(y[y == 1])))

    return K, y
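# A hedged sketch of one way the precomputed kernel matrix could be consumed:
# an SVM with kernel='precomputed' (scikit-learn slices K consistently during CV).
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

K, y = load_kernel_matrix(data_path='data', study='wl_kernel', verbose=False)
clf = SVC(kernel='precomputed')
print(cross_val_score(clf, K, y, cv=5).mean())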
Example 14
def main():
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')

    enc = LabelEncoder()
    joined = pd.concat((train['Product_Info_2'],
                        test['Product_Info_2']), axis=0)
    enc.fit(joined)
    train['Product_Info_2'] = enc.transform(train['Product_Info_2'])
    test['Product_Info_2'] = enc.transform(test['Product_Info_2'])


    X_train = train.drop('Response', axis=1).values
    y_train = train['Response'].values
    X_test = test.values

    mdl = xgb.XGBRegressor(learning_rate=0.05,
                           n_estimators=200,
                           subsample=0.5,
                           max_depth=6,
                           silent=False)
    mdl.fit(X_train, y_train)

    preds = mdl.predict(X_test)
    preds = [min(max(1, int(round(pred))), 8) for pred in preds]

    sub = pd.DataFrame({'Id': test['Id'], 'Response': preds})
    sub.to_csv('submissions/xgb.csv', index=False)
Example 15
def main(X_fname, Y_fname, result_fname=None): 
    le = LabelEncoder()
    moves = pandas.read_csv(Y_fname, index_col=0)
    Y = moves.values.ravel()
    Y = le.fit_transform(Y)
    X = io.mmread(X_fname)
    print(X.shape, Y.shape, len(le.classes_))

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

    xg_train = xgboost.DMatrix( X_train, label=y_train)
    xg_test = xgboost.DMatrix(X_test, label=y_test)

    param = {}
    # use softmax multi-class classification
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.002
    param['max_depth'] = 7
    param['nthread'] = 7
    param['num_class'] = len(le.classes_)
    param['eval_metric'] = 'merror'
    evals = [ (xg_train, 'train'), (xg_test, 'eval') ]

    # Train xgboost
    print "Training"
    t1 = time.time()
    bst = xgboost.train(param, xg_train, 500, evals, early_stopping_rounds=3)
    t2 = time.time()
    print(t2 - t1)

    if result_fname is None:
        result_fname = str(datetime.now())

    bst.save_model("%s.bst"%result_fname)
def plot_model_decision_surface(clf, train_features, train_labels,
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None):
    
    if train_features.shape[1] != 2:
        raise ValueError("X_train should have exactly 2 columnns!")
    
    x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
    y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    clf_est = clone(clf)
    clf_est.fit(train_features,train_labels)
    if hasattr(clf_est, 'predict_proba'):
        Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,1]
    else:
        Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()])    
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=cmap)
    
    le = LabelEncoder()
    y_enc = le.fit_transform(train_labels)
    n_classes = len(le.classes_)
    plot_colors = ''.join(colors) if colors else [None] * n_classes
    label_names = le.classes_
    markers = markers if markers else [None] * n_classes
    alphas = alphas if alphas else [None] * n_classes
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y_enc == i)
        plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
                    label=label_names[i], cmap=cmap, edgecolors='black', 
                    marker=markers[i], alpha=alphas[i])
    plt.legend()
    plt.show()
Example 17
    def load_input_files(self, **kwargs):
        
        """
        Loads both files containing training data and data for prediction. 
        
        Encodes the target labels to integers. 
        
        If it is training data, it will return in the output args the
        LabelEncoder used to encode the target labels to integers. We return it
        instead of storing it directly, because it should only be saved if the training
        ends without errors.

        Inputs:
        - files_paths (string): path to the input files.
        - training_data (bool): specifies whether the files contain training
        data or data for making predictions.
        
        Outputs:
        - LabelEncoder (LabelEncoder) (optional): Encodes the labels of the target
        variables to integers.
        
        """
        
        input_data = kwargs['input_data']
        input_files_dir = kwargs['input_files_dir']

        input_file_path = input_files_dir + input_data['database']
        df = pd.read_csv(input_file_path)
        
        training_data = kwargs.pop('training_data', False)
        
        # if we are loading training data, we have to assign an integer to each possible
        # target label in the dataset. We do it by fitting a LabelEncoder,
        if training_data:

            le = LabelEncoder()
            col_name = df.columns[4]
            df[col_name] = le.fit_transform(df[col_name])

            data = {}
            data['features'] = df[df.columns[0:4]].values
            data['targets'] = df[df.columns[4]].values

            self.feature_names = list(df.columns[0:4])
            self.target_name = df.columns[4]    

            out_args = {}
            out_args['LabelEncoder'] = le
            
            return data, out_args

        # if the data is for making predictions
        else:
            data = {}
            # ensure that the columns are in the correct order
            data['features'] = df[self.feature_names].values
            
            out_args = {}
        
            return data, out_args
Example 18
def ml_target(dataset):
    """ Takes a dataset and retuns the target in a numpy.array ready for
    machine learning.
    Mainly transforms non-numerical variables(columns) to numbers.

    Parameters
    ----------
    copper.Dataset

    Returns
    -------
    (label_encoder, np.array)

    Notes
    -----
    If dataset has more than one variable with role=TARGET then the first one
    is selected.
    """
    cols = dataset.filter_cols(role=dataset.TARGET)
    assert len(cols) > 0, 'No target variables on Dataset'
    if len(cols) > 1:
        import warnings
        warnings.warn("Dataset contains more than one target, %s was choosed" % cols[0])

    if dataset[cols[0]].dtype in (int, float):
        return None, dataset[cols[0]].values
    else:
        le = LabelEncoder()
        encoded = le.fit_transform(dataset[cols[0]].values)
        return le, encoded
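# A quick round-trip sketch (dataset is whatever copper.Dataset instance is in scope):
le, encoded = ml_target(dataset)
if le is not None:
    print(le.classes_)                        # original target labels
    print(le.inverse_transform(encoded[:5]))  # first five rows decoded back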
Example 19
    def __call__(self, X_train, X_test, y_train, y_test):
        X = np.vstack([X_train, X_test])
        y = np.hstack([y_train, y_test])
        le = LabelEncoder()
        y = le.fit_transform(y)

        kmeans = KMeans(
            n_clusters=len(np.unique(y)),
            n_init=self.kmeans__n_init,
            random_state=self.random_state,
        )
        kmeans.fit(X)

        r = distance.cdist(kmeans.cluster_centers_, kmeans.cluster_centers_)
        h = np.exp(-r / (self.sig**2))

        N = confusion_matrix(y, kmeans.labels_)

        wN = np.zeros(h.shape)
        for l in range(wN.shape[0]):  # label
            for c in range(wN.shape[0]):  # cluster
                for j in range(wN.shape[0]):
                    wN[l, c] += h[l, c] * N[l, j]

        return wN.max(axis=0).sum() / wN.sum()
Example 20
def multicol_fit_transform(dframe, columns):

	if isinstance(columns, list):
		columns = np.array(columns)

	encoder_dict = {}
	# columns are provided, iterate through and get `classes_`
	# ndarray to hold LabelEncoder().classes_ for each
	# column; should match the shape of specified `columns`
	all_classes_ = np.ndarray(shape=columns.shape, dtype=object)
	all_encoders_ = np.ndarray(shape=columns.shape, dtype=object)
	all_labels_ = np.ndarray(shape=columns.shape, dtype=object)
	for idx, column in enumerate(columns):
		# instantiate LabelEncoder
		le = LabelEncoder()
		# fit and transform labels in the column
		dframe.loc[:, column] = le.fit_transform(dframe.loc[:, column].values)
		encoder_dict[column] = le
		# append the `classes_` to our ndarray container
		all_classes_[idx] = (column, np.array(le.classes_.tolist(), dtype=object))
		all_encoders_[idx] = le
		all_labels_[idx] = le

	multicol_dict = {"encoder_dict":encoder_dict, "all_classes_":all_classes_,"all_encoders_":all_encoders_,"columns": columns}
	return dframe, multicol_dict
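# Usage sketch with a small hypothetical DataFrame (column names are made up):
import pandas as pd

df = pd.DataFrame({'city': ['NY', 'LA', 'NY'], 'sex': ['M', 'F', 'F'], 'age': [30, 25, 41]})
df_enc, info = multicol_fit_transform(df, columns=['city', 'sex'])
print(df_enc)                                  # 'city' and 'sex' are now integer codes
print(info['encoder_dict']['city'].classes_)   # original labels for 'city'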
Example 21
def train(args):
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).values[:, 1]
    labels = list(map(itemgetter(1),
                      map(os.path.split,
                          map(os.path.dirname, labels))))  # Get the directory name.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).values
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)

    param_grid = [
        {'C': [1, 10, 100, 1000],
            'kernel': ['linear']},
        {'C': [1, 10, 100, 1000],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']}
    ]
    svm = GridSearchCV(
        SVC(probability=True),
        param_grid, verbose=4, cv=5, n_jobs=16
    ).fit(embeddings, labelsNum)
    print("Best estimator: {}".format(svm.best_estimator_))
    print("Best score on left out data: {:.2f}".format(svm.best_score_))

    with open("{}/classifier.pkl".format(args.workDir), 'w') as f:
        pickle.dump((le, svm), f)
Example 22
def load_data(filename="Feat_normalized.csv") :
    '''
    Load training data from csv file.  Load labels from it.
    Return matrix, training labels, encoder for labels.
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
    http://stackoverflow.com/questions/21589177/using-multiple-features-with-scikit-learn?rq=1

    Labels could just be the names? : http://stackoverflow.com/questions/13300160/non-integer-class-labels-scikit-learn?rq=1
    '''
    df = pd.read_csv(filename, index_col=0)
    lb = LabelEncoder()
    labels = lb.fit_transform((df.index.values))

    print ("labels: %s %s" %(type(labels),labels))
    features = df.values
    # labels = LabelEncoder.transform(np.asarray(df['labels'].values))
    'This could be done more elegantly. Check index num for later filtering!!'
    'TODO: Is pop needed? List of col.values??  '

    feature_names=df.columns.values  #No pop. (nd array, no labels index here)
    print("%s features: " % (len(feature_names)))

    # classes = label_encoder.transform(np.asarray(df['labels']))
    print('encoded labels: %s' % (set(labels)))
    # print("feature_names: %s" %(feature_names))
    return (features, labels, lb,feature_names)
def prep_data(df_train,df_test,test_size=0.2):
    print(" ---- Start data prep")
    df_train = df_train.dropna(subset=['X1'])
    df_train['X1'] = (df_train['X1'].replace( '[\%,)]','',regex=True).replace( '[(]','-',   regex=True ).astype(float))
    labels = df_train['X1'].values
    id_test = df_test['X2']
    piv_train = df_train.shape[0]
    df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
    del df_all['X1'], df_all['X2'], df_all['X3'], df_all['X10'], df_all['X16'], df_all['X18']
    df_all['X23'] = df_all['X23'].map(lambda x: str(x)[:-3])
    df_all['X15'] = df_all['X15'].map(lambda x: str(x)[:-3])
    df_all['X4'] = (df_all['X4'].replace( '[\$,)]','', regex=True).replace( '[(]','-',   regex=True ).astype(float))
    df_all['X5'] = (df_all['X5'].replace( '[\$,)]','', regex=True).replace( '[(]','-',   regex=True ).astype(float))
    df_all['X6'] = (df_all['X6'].replace( '[\$,)]','', regex=True).replace( '[(]','-',   regex=True ).astype(float))
    df_all['X30'] = (df_all['X30'].replace( '[\%,)]','', regex=True).replace( '[(]','-',   regex=True ).astype(float))
    df_f = feature_engineering(df_all)
    vals = df_f.values
    X = vals[:piv_train]
    le = LabelEncoder()
    y = le.fit_transform(labels)   
    y = labels 
    X_test = vals[piv_train:]
    X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(X, y, test_size=test_size)
    print(" ---- end data prep")
    return X_train, X_valid, y_train, y_valid, X_test, id_test
Example 24
	def to_numeric(self, columns=[]):
		le = LabelEncoder()
		for c in columns:
			self.M[:, c] = le.fit_transform(self.M[:, c])
		self.M = self.M.astype(float)
		return self
def test_vote_soft():
    X,y,test_X,test_Y =get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))
    #train_attr = reduce(lambda a,b:a+b,train_probs)
    from functools import reduce
    test_attr = reduce(lambda a, b: a + b, test_probs)

    pred = test_attr.idxmax(1)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y,pred))
Example 26
    def train(self):
        input_dir = get_config().get('Classification', 'TrainingInputPath')
        self.logger.info("Loading features")
        file_name = os.path.join(input_dir, 'labels.csv')
        labels = pd.read_csv(file_name, header=None).values[:, 1]
        labels = map(itemgetter(1),
                     map(os.path.split,
                         map(os.path.dirname, labels)))
        labels = list(labels)
        label_encoder = LabelEncoder().fit(labels)
        labels_encoded = label_encoder.transform(labels)
        num_classes = len(label_encoder.classes_)

        file_name = os.path.join(input_dir, 'reps.csv')
        features = pd.read_csv(file_name, header=None).values

        self.logger.info("Training for {} classes.".format(num_classes))

        clf = SVC(C=1, kernel='linear', probability=True)

        # TODO: Try a previous LDA
        try:
            lda = int(get_config().get("Classification", "LDADim"))
        except ValueError:
            lda = None
        if lda:
            clf_final = clf
            clf = Pipeline([('lda', LDA(n_components=lda)),
                            ('clf', clf_final)])
        clf.fit(features, labels_encoded)

        file_name = os.path.join(input_dir, 'classifier.pkl')
        self.logger.info("Saving classifier to '{}'".format(file_name))
        with open(file_name, 'wb') as f:
            pickle.dump((label_encoder, clf), f)
def labele(tbl,cols='all'):
    from sklearn.preprocessing import LabelEncoder as LE
    if cols == 'all':
        cols = tbl.columns
    le = LE()
    for ac in cols:
        tbl.loc[:, ac] = le.fit_transform(tbl[ac])  # might have to return le as well
    return tbl
Example 28
def prepare_labels(y):
    # From here: https://www.kaggle.com/pestipeti/keras-cnn-starter
    values = np.array(y)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)

    return integer_encoded, label_encoder
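# Round-trip sketch (toy labels, not from the competition data):
y = ['whale_a', 'whale_b', 'whale_a', 'new_whale']
integer_encoded, label_encoder = prepare_labels(y)
print(integer_encoded)                                    # e.g. [1 2 1 0]
print(label_encoder.inverse_transform(integer_encoded))   # back to the original strings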
Example 29
    def customEncode(df):
        global labelencoder
        le = LabelEncoder()
        le.fit(df['OutcomeType'])
        df['OutcomeType'] = le.transform(df['OutcomeType'])
        labelencoder = le
        return df
def test_hard_vote():
    X,y,test_X,test_Y =get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    #train_probs = probs[0]
    test_probs = probs[1]
    print(len(test_probs))
    preds = [x.idxmax(1) for x in test_probs]
    pred = np.zeros(len(preds[0]),dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        pred[i]= max(set(votes),key=votes.count)
        print(pred[i])
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y,pred))

    """
Example 31
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


if __name__ == "__main__":
    pd.set_option('display.width', 300)

    data = pd.read_csv('../dataset/tel.csv', skipinitialspace=True, thousands=',')    # thousands: str, default None; thousands separator, e.g. "," or "."
    print('Raw data:\n', data.head(10))

    # print('data.columns() = \n', data.columns)

    # Label-encode every column by category, e.g. the two values Married and Unmarried are replaced by 0 and 1
    le = LabelEncoder()     # encodes labels as integers 0..n-1, e.g. 5 classes become 0/1/2/3/4
    for col in data.columns:
        data[col] = le.fit_transform(data[col])    # returns the encoded labels for the column

    print('Data after step 1:\n', data.head(10))

    # Age binning
    # Tag the 'age' column by the given half-open bins, e.g. ages in [-1, 6) get label 0, [6, 12) get 1, [12, 18) get 2; the labels can be chosen freely but must match the number of bins minus one
    bins = [-1, 6, 12, 18, 24, 35, 50, 70]
    data['age'] = pd.cut(data['age'], bins=bins, labels=np.arange(len(bins)-1))    # pd.cut returns, for each value, the half-open bin it falls in, tagged with the given labels
    # print('Data after step 2:\n', data['age'])

    # Take the logarithm
    columns_log = ['income', 'tollten', 'longmon', 'tollmon', 'equipmon', 'cardmon',
                   'wiremon', 'longten', 'tollten', 'equipten', 'cardten', 'wireten', ]
    mms = MinMaxScaler()    # scales each feature individually so that it lies in the given range on the training set, i.e. between 0 and 1
Example 32
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# df.
dfProcess = df[['MAKE', 'year', 'mileage', 'engine cc', 'selling_price']]
dfProcess['selling_price'] = pd.to_numeric(dfProcess['selling_price'])
data = dfProcess.values

# 1
X = data[:, 0:-1]
Y = data[:, -1]
Y.ravel()

# 2
encoder = LabelEncoder()
for i in range(X.shape[1]):
    X[:, i] = encoder.fit_transform(X[:, i])

# 3
minmax = MinMaxScaler()
X = minmax.fit_transform(X)

# 4
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.9,
                                                    random_state=42)

# 5
model = KNeighborsRegressor(n_neighbors=5)
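# 6 (assumed continuation; the original snippet is truncated at this point)
model.fit(X_train, Y_train)
print(model.score(X_test, Y_test))   # R^2 on the held-out 90%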
Example 33
ds.loc[ds['SibSp'] == 1, ['Parch']] = 1
dssub.loc[dssub['SibSp'] == 1, ['Parch']] = 1
pid = sub.PassengerId.values
#handling missing values
#for Age
ds.Age = ds.Age.fillna(ds.Age.median())
dssub.Age = dssub.Age.fillna(dssub.Age.median())
#for Embarked
ds.Embarked = ds.Embarked.fillna('S')
dssub.Embarked = dssub.Embarked.fillna('S')

X_all = np.concatenate((X, X_sub), axis=0)
y = dataset.loc[:, 'Survived'].values
#Handling categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X_all[:, 1] = labelencoder_X.fit_transform(X_all[:, 1])
X_all[:, 5] = labelencoder_X.fit_transform(X_all[:, 5])
onehotencoder = OneHotEncoder(categorical_features=[0, 5])
X_all = onehotencoder.fit_transform(X_all).toarray()
X = X_all[:891, :]
X_sub = X_all[891:, :]
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.33,
                                                  random_state=1)

from keras.models import Sequential
from keras.layers import Dense
model = Sequential()
Example 34
import pandas as pd
titanic=pd.read_csv('titanic.csv',encoding="shift-jis")
titanic=titanic.drop(['name','row.names'],axis=1)
mean=round(titanic['age'].mean(),2)
titanic['age'].fillna(mean,inplace=True)
titanic.fillna("",inplace=True)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in titanic.columns.values.tolist():
    if i != 'age':
        titanic[i] = le.fit_transform(titanic[i])

from sklearn.model_selection import train_test_split
titanic_target = titanic['survived']
titanic_data=titanic.drop(['survived'],axis=1)
yX=titanic_target
yX=pd.concat([yX,titanic_data],axis=1)
yX.to_csv('temp.csv',encoding='utf-8')
X_train,X_test,y_train,y_test=train_test_split(titanic_data,titanic_target,test_size=0.2,random_state=54,shuffle=True)
from sklearn.ensemble import ExtraTreesClassifier
clf=ExtraTreesClassifier(n_estimators=382, max_depth=None,min_samples_split=2,random_state=8)
clf.fit(X_train,y_train)
print(clf.score(X_test,y_test))
dic=dict(zip(titanic_data.columns,clf.feature_importances_))
for item in sorted(dic.items(), key=lambda x: x[1], reverse=True):
    print(item[0],round(item[1],4))
Example 35
from matplotlib import pyplot as plt

%matplotlib inline

cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df.columns = [c.replace(' ', '_') for c in df.columns]
df['target'] = cancer.target
df['target'] = df.target.replace({0: 'malignant', 1: 'benign'})

# separation

target = 'target'
X = df.drop(target, axis=1)
y = df[target]
le = LabelEncoder()
y = le.fit_transform(y)

# experiment

feature = 'worst_concave_points'

p = 50
threshold = np.percentile(X[feature], p)

feature_cuts = np.where(X[feature] > threshold, 'left', 'right')

decision = pd.DataFrame(zip(X[feature], feature_cuts, y), columns=['feature', 'cut', 'y'])
majority = decision.groupby('cut')['y'].mean()

# BUG: maybe could be the same, or maybe doesn't get rounded?
Example 36
import numpy as np
from seqlearn.evaluation import bio_f_score
from seqlearn.hmm import MultinomialHMM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_validate

from data import *
from epam_nlp import CustomHMM, get_bio_f1

DATA_PATH = Path('../data')
RAW_DATA_PATH = DATA_PATH / 'processed.tsv'
df = load_data(RAW_DATA_PATH, nrows=1000)
X, y, lengths = get_X_y_lengths(df, cols_to_keep={'token'})
le = LabelEncoder()
ohe = OneHotEncoder(handle_unknown='ignore')
clf = CustomHMM(y=y)
pipeline = Pipeline([('one_hot', ohe), ('hmm', clf)])
cv = get_cv(lengths=lengths)

res = cross_validate(pipeline,
                     X.reshape(-1, 1),
                     y,
                     cv=cv,
                     n_jobs=1,
                     scoring=get_bio_f1)
print(res)

# cv = get_cv(X, y, lengths)
# i = 1
# scores = []
Example 37
#Importing header files
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

#Code starts here

#Removing `,` from the column
data['Installs']=data['Installs'].str.replace(',','')

#Removing `+` from the column
data['Installs']=data['Installs'].str.replace('+','')

#Converting the column to `int` datatype
data['Installs'] = data['Installs'].astype(int)

#Creating a label encoder object
le=LabelEncoder()

#Label encoding the column to reduce the effect of a large range of values
data['Installs']=le.fit_transform(data['Installs'])

#Setting figure size
plt.figure(figsize = (10,10))

#Plotting Regression plot between Rating and Installs
sns.regplot(x="Installs", y="Rating", color = 'teal',data=data)

#Setting the title of the plot
plt.title('Rating vs Installs[RegPlot]',size = 20)

#Code ends here
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Replace Missing values

# missing data replace NaN with average
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer=imputer.fit(X[:,1:3])
X[:,1:3]=imputer.transform(X[:,1:3])

# categorical data encoding to integer variables

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X=LabelEncoder()
X[:,0]=labelencoder_X.fit_transform(X[:,0])
onehotEncoder= OneHotEncoder(categorical_features=[0])
X=onehotEncoder.fit_transform(X).toarray()
# Encoding the Dependent Variable if needed from Yes/No to 1 and 0

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
from sklearn.pipeline import Pipeline

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# load dataset
dataframe = read_csv('dataset/sonar.csv', header=None, delimiter=',')
dataframe = dataframe.values

# split into input (X) and output (Y) variables
X = dataframe[:, :-1].astype(float)
Y = dataframe[:, -1]

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)


# baseline model
def create_baseline():
    '''
    Larger
    60 inputs -> [60] -> 1 output

    Larger
    60 inputs -> [60 -> 30] -> 1 output

    Smaller
    60 inputs -> [30] -> 1 output
    '''
import pandas as pd

dataset = pd.read_csv('enter your dataset')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)
print(score)

from sklearn.model_selection import cross_val_score
score = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
data['DailyRate'].hist(bins=20)

sns.countplot(x= data['Attrition'], data = data, hue = data['Gender'])

sns.countplot(x = data['MaritalStatus'],hue=data['Attrition'] , data = data)

sns.barplot(y=data['JobRole'], x=data['JobSatisfaction'],estimator = np.mean, data = data)

"""Label Encoding"""

from sklearn.preprocessing import LabelEncoder

cat_object= ['Attrition', 'BusinessTravel', 'Department','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime']

le = LabelEncoder()

for obj in cat_object:
    data[obj] = le.fit_transform(data[obj])
data.dtypes

corr = data.corr()
f,ax = plt.subplots(figsize=(16,9))
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values, square=True)

data.drop(['Over18'], axis=1, inplace= True)
k=10
cols=corr.nlargest(k,'Attrition')['Attrition'].index
cm= np.corrcoef(data[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar = True ,annot = True,fmt ='.2f',annot_kws ={'size':10}, yticklabels=cols.values, xticklabels=cols.values )
Example 42
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4:5].values

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()

X[:, 3] = labelencoder_X.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features=[3])
X = onehotencoder.fit_transform(X).toarray()

# Spliting into a training and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=0)

from sklearn.linear_model import LinearRegression as LR

regressor = LR()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

regressor.score(X_test, y_test)

plt.plot(X_test, y_test, color='g')
Example 43
def extract_features_from_model(base_model, model_name, feature_shape):

    label_encoder = None

    # loop over the data splits
    for split in (config.TRAIN, config.TEST, config.VAL):
        # grab all image paths in the current split
        p = os.path.sep.join([config.MODEL_DATASET_PATH, split])
        imagePaths = list(paths.list_images(p))

        # randomly shuffle the image paths and then extract the class labels
        # from the file paths.  It is more efficient to shuffle the classes
        # now, instead of during the training;
        random.shuffle(imagePaths)
        # get the labels in the same order as the random image paths
        # path/dataset/training/nonfood/0_123.jpg
        # index of -2 references 'nonfood'
        labels = [imagePath.split(os.path.sep)[-2] for imagePath in imagePaths]

        # if the label encoder is None, create it
        if label_encoder is None:
            label_encoder = LabelEncoder()
            label_encoder.fit(labels)

        # open the output CSV file for writing
        Path(config.BASE_CSV_PATH.format(model_name)).mkdir(parents=True,
                                                            exist_ok=True)
        csvPath = os.path.sep.join(
            [config.BASE_CSV_PATH.format(model_name), f"{split}.csv"])
        csv = open(csvPath, "w")

        # loop over the images in batches matching the model's batch size;
        # each batch is fed through the model to get the resulting feature vectors
        for (b, i) in enumerate(range(0, len(imagePaths), config.BATCH_SIZE)):
            # extract the batch of images and labels, then initialize the
            # list of actual images that will be passed through the network
            # for feature extraction
            logger.info(
                f"Processing batch {b+1}/{int(np.ceil(len(imagePaths)/float(config.BATCH_SIZE)))}"
            )
            batchPaths = imagePaths[i:i + config.BATCH_SIZE]
            batchLabels = label_encoder.transform(labels[i:i +
                                                         config.BATCH_SIZE])
            batchImages = []

            for imagePath in batchPaths:
                # load the input image using the Keras helper utility
                # while ensuring the image is resized to 224x224 pixels
                image = load_img(imagePath, target_size=(224, 224))
                image = img_to_array(image)

                # preprocess the image by:
                # 1 - expanding the dimensions, because the model expects a batch of image arrays
                #     and what image currently is, is a single array
                image = np.expand_dims(image, axis=0)
                # 2 - subtracting the mean RGB pixel intensity of the ImageNet dataset
                image = imagenet_utils.preprocess_input(image)

                # add the image to the batch collection
                batchImages.append(image)

            # at this point we are ready to pass the image through the model network to extract the
            # features.  which in this case is an array/vector of size:  7*7*512

            # pass the images through the network and use the outputs as
            # our actual features, then reshape the features into a flattened volume
            batchImages = np.vstack(batchImages)
            # recall our base_model has the front FCN layer REMOVED so we are getting the output
            # of the convolutional network.
            features = base_model.predict(batchImages,
                                          batch_size=config.BATCH_SIZE)
            # reshape features into an array of array
            features = features.reshape((features.shape[0], feature_shape))

            # loop over the class labels and extracted features
            for (label, vec) in zip(batchLabels, features):
                # construct a row that consists of the class label and extracted features
                vec = ",".join([str(v) for v in vec])
                csv.write(f"{label},{vec}\n")

        # close file
        csv.close()

    Path(config.LE_PATH.format(model_name)).mkdir(parents=True, exist_ok=True)
    f = open(config.LE_FILE.format(model_name), "wb")
    f.write(pickle.dumps(label_encoder))
    f.close()
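# A hedged sketch of reading the serialized LabelEncoder back (mirrors the pickle.dumps call above;
# config.LE_FILE and model_name are whatever the surrounding project defines):
import pickle

with open(config.LE_FILE.format(model_name), "rb") as f:
    label_encoder = pickle.loads(f.read())
print(label_encoder.classes_)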
Example 44
    train_df[col + '_target_mean'] = train_df[col].map(temp_dict)
    test_df[col + '_target_mean'] = test_df[col].map(temp_dict)

# %% [code]
########################### Encode Str columns
for col in list(train_df):
    if train_df[col].dtype == 'O':
        print(col)
        train_df[col] = train_df[col].fillna('unseen_before_label')
        test_df[col] = test_df[col].fillna('unseen_before_label')

        train_df[col] = train_df[col].astype(str)
        test_df[col] = test_df[col].astype(str)

        le = LabelEncoder()
        le.fit(list(train_df[col]) + list(test_df[col]))
        train_df[col] = le.transform(train_df[col])
        test_df[col] = le.transform(test_df[col])

        train_df[col] = train_df[col].astype('category')
        test_df[col] = test_df[col].astype('category')

# %% [code]
########################### TransactionAmt

# Let's add some kind of client uID based on cardID and addr columns
# The value will be very specific for each client so we need to remove it
# from final feature. But we can use it for aggregations.
train_df['uid'] = train_df['card1'].astype(str) + '_' + train_df[
    'card2'].astype(str) + '_' + train_df['card3'].astype(
Example 45
        kind='bar'
    )  # the line is unreliable because some countries end up not showing
    plt.show()


def yearCrisis():
    pd.crosstab(x.country, y).plot(
        kind='bar'
    )  # the line is unreliable because some countries end up not showing
    plt.show()


'''In order to handle our data we shall replace the crisis/no_crisis values in banking crisis
    with 0's and 1's
'''
le = LabelEncoder()
x['country'] = le.fit_transform(
    x['country'])  # the countries are labelled from 0 to 13
y = le.fit_transform(y)  # same goes for the crisis/no_crisis values

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=1000,
                                                    random_state=0)
#print(y_test)

#LOGISISTIC REGRESSION

lr = LogisticRegression()

lr.fit(x_train, y_train)
Example 46
    def create_handler():

        train_x = []
        train_y = []
        train_idx = []
        test_x = []
        test_y = []
        test_idx = []

        print(csv_original)

        if (problem_type.active == 1):
            notifier.text = """ Making DB - Regression """
        else:
            notifier.text = """ Making DB - Classification """

        if (len(param_key.value) == 0):

            slide = round(slider_window.value)

            xs = csv_original[param_x.value].values
            ys = csv_original[param_y.value].values

            train_ratio = round(slider_train_ratio.value)

            if (train_ratio != 0):

                train_x_set = xs[:xs.shape[0] * train_ratio // 100]
                train_y_set = ys[:ys.shape[0] * train_ratio // 100]

                for start, stop in zip(range(0, train_x_set.shape[0] - slide),
                                       range(slide, train_x_set.shape[0])):
                    train_x.append(train_x_set[start:start + slide])
                    train_y.append(train_y_set[start:start + slide][-1])
                    train_idx.append('0')

                train_x = np.asarray(train_x)
                train_x = np.swapaxes(train_x, 1, 2)
                train_x = np.expand_dims(train_x, -1)
                train_idx = np.asarray(train_idx)

            if (train_ratio != 100):

                test_x_set = xs[xs.shape[0] * train_ratio // 100:]
                test_y_set = ys[ys.shape[0] * train_ratio // 100:]

                for start, stop in zip(range(0, test_x_set.shape[0] - slide),
                                       range(slide, test_x_set.shape[0])):
                    test_x.append(test_x_set[start:start + slide])
                    test_y.append(test_y_set[start:start + slide][-1])
                    test_idx.append('1')

                test_x = np.asarray(test_x)
                test_x = np.swapaxes(test_x, 1, 2)
                test_x = np.expand_dims(test_x, -1)
                test_idx = np.asarray(test_idx)

            train_x = np.asarray(train_x)
            test_x = np.asarray(test_x)

            all_y = train_y + test_y

            data_train = {}
            data_test = {}

            if (problem_type.active == 1):
                train_y = np.asarray(train_y)
                train_y = np.expand_dims(train_y, -1)

                test_y = np.asarray(test_y)
                test_y = np.expand_dims(test_y, -1)

            elif (problem_type.active == 0):
                encoder = LabelEncoder()
                encoder.fit(all_y)
                encoded_y = encoder.transform(all_y)

                category_y = np_utils.to_categorical(encoded_y)

                labels = []
                for y in all_y:
                    if (y not in labels):
                        labels.append(y)

                data_train['labels'] = np.asarray(labels)
                data_test['labels'] = np.asarray(labels)

                train_y = category_y[:train_x.shape[0]]
                test_y = category_y[train_x.shape[0]:]

            data_train['x'] = train_x
            data_train['y'] = train_y
            data_train['key1'] = train_idx
            data_train['slideing_window'] = slider_window.value

            data_test['x'] = test_x
            data_test['y'] = test_y
            data_test['key1'] = test_idx
            data_test['slideing_window'] = slider_window.value

            if (problem_type.active == 1):
                print("Regression")
                target_dir = 'Regression/'
            elif (problem_type.active == 0):
                print("Classification")
                target_dir = 'Classification/'

            time_window = '[' + str(round(slider_window.value)) + ']'

            if (train_ratio != 0):
                np.save(
                    "./np/" + target_dir + time_window + text_title.value +
                    "_train.npy", data_train)
            if (train_ratio != 100):
                np.save(
                    "./np/" + target_dir + time_window + text_title.value +
                    "_test.npy", data_test)

        elif (len(param_key.value) == 1):

            key1_list = list(csv_original[param_key.value[0]].unique())

            train_ratio = round(slider_train_ratio.value)
            if (train_ratio == 0):
                train_key = []
                test_key = key1_list[int(len(key1_list) * train_ratio / 100):]
            elif (train_ratio == 100):
                train_key = key1_list[:int(len(key1_list) * train_ratio / 100)]
                test_key = []
            else:
                train_key = key1_list[:int(len(key1_list) * train_ratio / 100)]
                test_key = key1_list[int(len(key1_list) * train_ratio / 100):]

            for key in train_key:

                num_elements = csv_original[csv_original[param_key.value[0]] ==
                                            key].shape[0]
                slide = round(slider_window.value)

                if (num_elements < slide):
                    continue

                xs = csv_original[csv_original[param_key.value[0]] == key][
                    param_x.value].values
                ys = csv_original[csv_original[param_key.value[0]] == key][
                    param_y.value].values

                for start, stop in zip(range(0, num_elements - slide),
                                       range(slide, num_elements)):
                    train_x.append(xs[start:start + slide])
                    train_y.append(ys[start:start + slide][-1])
                    train_idx.append(key)

            for key in test_key:
                num_elements = csv_original[csv_original[param_key.value[0]] ==
                                            key].shape[0]
                slide = round(slider_window.value)

                if (num_elements < slide):
                    continue

                xs = csv_original[csv_original[param_key.value[0]] == key][
                    param_x.value].values
                ys = csv_original[csv_original[param_key.value[0]] == key][
                    param_y.value].values

                for start, stop in zip(range(0, num_elements - slide),
                                       range(slide, num_elements)):
                    test_x.append(xs[start:start + slide])
                    test_y.append(ys[start:start + slide][-1])
                    test_idx.append(key)

            all_y = train_y + test_y

            train_x = np.asarray(train_x)
            if (train_ratio != 0):
                train_x = np.swapaxes(train_x, 1, 2)
                train_x = np.expand_dims(train_x, -1)
            train_idx = np.asarray(train_idx)

            test_x = np.asarray(test_x)
            if (train_ratio != 100):
                test_x = np.swapaxes(test_x, 1, 2)
                test_x = np.expand_dims(test_x, -1)
            test_idx = np.asarray(test_idx)

            data_train = {}
            data_test = {}

            if (problem_type.active == 1):
                train_y = np.asarray(train_y)
                train_y = np.expand_dims(train_y, -1)

                test_y = np.asarray(test_y)
                test_y = np.expand_dims(test_y, -1)

            elif (problem_type.active == 0):
                encoder = LabelEncoder()
                encoder.fit(all_y)
                encoded_y = encoder.transform(all_y)

                category_y = np_utils.to_categorical(encoded_y)

                # use the encoder's class order so the saved labels line up with the one-hot columns of category_y
                labels = encoder.classes_

                data_train['labels'] = np.asarray(labels)
                data_test['labels'] = np.asarray(labels)

                train_y = category_y[:train_x.shape[0]]
                test_y = category_y[train_x.shape[0]:]

            data_train['x'] = train_x
            data_train['y'] = train_y
            data_train['key1'] = train_idx
            data_train['slideing_window'] = slider_window.value

            data_test['x'] = test_x
            data_test['y'] = test_y
            data_test['key1'] = test_idx
            data_test['slideing_window'] = slider_window.value

            print(train_x.shape)
            print(train_y.shape)
            print(test_x.shape)
            print(test_y.shape)

            if (problem_type.active == 1):
                print("Regression")
                target_dir = 'Regression/'
            elif (problem_type.active == 0):
                print("Classification")
                target_dir = 'Classification/'

            print(train_x.shape)

            time_window = '[' + str(round(slider_window.value)) + ']'

            if (train_ratio == 0):
                np.save(
                    "./np/" + target_dir + time_window + text_title.value +
                    "_test.npy", data_test)
            elif (train_ratio == 100):
                np.save(
                    "./np/" + target_dir + time_window + text_title.value +
                    "_train.npy", data_train)
            else:
                np.save(
                    "./np/" + target_dir + time_window + text_title.value +
                    "_train.npy", data_train)
                np.save(
                    "./np/" + target_dir + time_window + text_title.value +
                    "_test.npy", data_test)

        elif (len(param_key.value) == 2):
            keys_list = csv_original[param_key.value].drop_duplicates()

            train_ratio = round(slider_train_ratio.value)
            if (train_ratio == 0):
                train_key = []
                test_key = keys_list.iloc[
                    int(len(keys_list) * slider_train_ratio.value / 100):]
            elif (train_ratio == 100):
                train_key = keys_list.iloc[:int(
                    len(keys_list) * slider_train_ratio.value / 100)]
                test_key = []
            else:
                train_key = keys_list.iloc[:int(
                    len(keys_list) * slider_train_ratio.value / 100)]
                test_key = keys_list.iloc[
                    int(len(keys_list) * slider_train_ratio.value / 100):]

            if (train_ratio != 0):
                for index, row in train_key.iterrows():

                    key1 = row[param_key.value[0]]
                    key2 = row[param_key.value[1]]

                    cond1 = csv_original[param_key.value[0]] == key1
                    cond2 = csv_original[param_key.value[1]] == key2

                    csv_target = csv_original[cond1 & cond2]

                    num_elements = csv_target.shape[0]
                    slide = round(slider_window.value)

                    if (num_elements < slide):
                        continue

                    xs = csv_target[param_x.value].values
                    ys = csv_target[param_y.value].values

                    for start, stop in zip(range(0, num_elements - slide),
                                           range(slide, num_elements)):
                        train_x.append(xs[start:start + slide])
                        train_y.append(ys[start:start + slide][-1])
                        train_idx.append(str(key1) + "_" + str(key2))

            if (train_ratio != 100):
                for index, row in test_key.iterrows():

                    key1 = row[param_key.value[0]]
                    key2 = row[param_key.value[1]]

                    cond1 = csv_original[param_key.value[0]] == key1
                    cond2 = csv_original[param_key.value[1]] == key2

                    csv_target = csv_original[cond1 & cond2]

                    num_elements = csv_target.shape[0]
                    slide = round(slider_window.value)

                    if (num_elements < slide):
                        continue

                    xs = csv_target[param_x.value].values
                    ys = csv_target[param_y.value].values

                    for start, stop in zip(range(0, num_elements - slide),
                                           range(slide, num_elements)):
                        test_x.append(xs[start:start + slide])
                        test_y.append(ys[start:start + slide][-1])
                        test_idx.append(str(key1) + "_" + str(key2))

            all_y = train_y + test_y

            train_x = np.asarray(train_x)
            if (train_ratio != 0):
                train_x = np.swapaxes(train_x, 1, 2)
                train_x = np.expand_dims(train_x, -1)
            train_idx = np.asarray(train_idx)

            test_x = np.asarray(test_x)
            if (train_ratio != 100):
                test_x = np.swapaxes(test_x, 1, 2)
                test_x = np.expand_dims(test_x, -1)
            test_idx = np.asarray(test_idx)

            data_train = {}
            data_test = {}

            if (problem_type.active == 1):
                train_y = np.asarray(train_y)
                train_y = np.expand_dims(train_y, -1)

                test_y = np.asarray(test_y)
                test_y = np.expand_dims(test_y, -1)

            elif (problem_type.active == 0):
                encoder = LabelEncoder()
                encoder.fit(all_y)
                encoded_y = encoder.transform(all_y)

                category_y = np_utils.to_categorical(encoded_y)

                # use the encoder's class order so the saved labels line up with the one-hot columns of category_y
                labels = encoder.classes_

                data_train['labels'] = np.asarray(labels)
                data_test['labels'] = np.asarray(labels)

                train_y = category_y[:train_x.shape[0]]
                test_y = category_y[train_x.shape[0]:]

            data_train['x'] = train_x
            data_train['y'] = train_y
            data_train['key1'] = train_idx
            data_train['slideing_window'] = slider_window.value

            data_test['x'] = test_x
            data_test['y'] = test_y
            data_test['key1'] = test_idx
            data_test['slideing_window'] = slider_window.value

            print(train_x.shape)
            print(train_y.shape)
            print(test_x.shape)
            print(test_y.shape)

            if (problem_type.active == 1):
                print("Regression")
                target_dir = 'Regression/'
            elif (problem_type.active == 0):
                print("Classification")
                target_dir = 'Classification/'

            print(train_x.shape)

            time_window = '[' + str(round(slider_window.value)) + ']'

            if (train_ratio == 0):
                np.save(
                    "./np/" + target_dir + time_window + text_title.value +
                    "_test.npy", data_test)
            elif (train_ratio == 100):
                np.save(
                    "./np/" + target_dir + time_window + text_title.value +
                    "_train.npy", data_train)
            else:
                np.save(
                    "./np/" + target_dir + time_window + text_title.value +
                    "_train.npy", data_train)
                np.save(
                    "./np/" + target_dir + time_window + text_title.value +
                    "_test.npy", data_test)

        notifier.text = """ DB creation complete """
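
# How the saved dictionaries can be read back (a minimal sketch; the path and title are
# placeholders, not files produced above). np.save() pickles the dict, so np.load() needs
# allow_pickle=True, and .item() unwraps the resulting 0-d object array:
#
#     data = np.load("./np/Classification/[10]example_train.npy", allow_pickle=True).item()
#     x, y, keys = data['x'], data['y'], data['key1']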
Example 47
def train_log_reg(filename, sentiment_words_file, seed=42):
    train_df = load_finphrase(filename)

    # Samples
    pd.set_option("display.max_colwidth", -1)
    logging.debug(train_df.sample(n=20, random_state=seed))


    # Encode the label
    le = LabelEncoder()
    le.fit(train_df["label"])
    train_df["label"] = le.transform(train_df["label"])
    logging.debug(list(le.classes_))
    logging.debug(train_df["label"])

    corpus = create_corpus(train_df)
    # visualize_frequent_words(corpus, stop_words)
    # generate_word_cloud(corpus, stop_words)

    # Load sentiment data
    sentiment_df = pd.read_csv(sentiment_words_file)

    # Make all words lower case
    sentiment_df["word"] = sentiment_df["word"].str.lower()
    sentiments = sentiment_df["sentiment"].unique()
    sentiment_df.groupby(by=["sentiment"]).count()

    sentiment_dict = {
        sentiment: sentiment_df.loc[sentiment_df["sentiment"] == sentiment][
            "word"
        ].values.tolist()
        for sentiment in sentiments
    }


    columns = [
        "tone_score",
        "word_count",
        "n_pos_words",
        "n_neg_words",
        "pos_words",
        "neg_words",
    ]

    # Analyze tone for original text dataframe
    print(train_df.shape)
    tone_lmdict = [
        tone_count_with_negation_check(sentiment_dict, x.lower())
        for x in tqdm(train_df["sentence"], total=train_df.shape[0])
    ]
    tone_lmdict_df = pd.DataFrame(tone_lmdict, columns=columns)
    train_tone_df = pd.concat([train_df, tone_lmdict_df.reindex(train_df.index)], axis=1)
    train_tone_df.head()

    # Show correlations between the label and the positive/negative word counts
    plt.figure(figsize=(10, 6))
    corr_columns = ["label", "n_pos_words", "n_neg_words"]
    sns.heatmap(
        train_tone_df[corr_columns].astype(float).corr(),
        cmap="coolwarm",
        annot=True,
        fmt=".2f",
        vmin=-1,
        vmax=1,
    )
    # plt.show()

    # X and Y data used
    Y_data = train_tone_df["label"]
    X_data = train_tone_df[["tone_score", "n_pos_words", "n_neg_words"]]

    # Train/test split (shuffle=False would reserve the most recent samples for testing; here the data is shuffled)
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X_data.values, Y_data.values, test_size=0.2, shuffle=True
    )

    # Tokenize
    tokenized, tokenized_text, bow, vocab, id2vocab, token_ids = tokenize_df(
        train_tone_df, col="sentence", lemma=True, stopwords=True, tokenizer="NLTK"
    )
    sns.distplot([len(x) for x in tokenized_text])

    # X and Y data used
    Y_data = train_tone_df["label"]
    X_data = tokenized_text

    # Train/test split (shuffle=False would reserve the most recent samples for testing; here the data is shuffled)
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X_data, Y_data.values, test_size=0.2, shuffle=True
    )


    pipeline = Pipeline(
        [("vec", TfidfVectorizer(analyzer="word")), ("clf", LogisticRegression())]
    )

    pipeline.fit(X_train, Y_train)

    pred_train = pipeline.predict(X_train)
    pred_test = pipeline.predict(X_test)

    # Define metrics
    # Here, use F1 Macro to evaluate the model.
    def metric(y_true, y_pred):
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average="macro")
        return acc, f1


    acc, f1 = metric(Y_train, pred_train)
    logging.info("Training - acc: %.8f, f1: %.8f" % (acc, f1))
    acc, f1 = metric(Y_test, pred_test)
    logging.info("Test - acc: %.8f, f1: %.8f" % (acc, f1))
    return pipeline
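
# A minimal usage sketch (assumption: the file names are placeholders for a Financial
# PhraseBank style dataset and a word/sentiment lexicon CSV as read above):
#
#     pipeline = train_log_reg("finphrase.csv", "sentiment_words.csv", seed=42)
#     print(pipeline.predict(["Operating profit rose clearly from the previous year ."]))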
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv'
dataframe = read_csv(url, header=None)
data = dataframe.values
# separate into input and output elements
X, y = data[:, :-1], data[:, -1]
# minimally prepare dataset
X = X.astype('float')
y = LabelEncoder().fit_transform(y.astype('str'))
# define the modeling pipeline
model = LogisticRegression(solver='liblinear')
# normalize the inputs to [0, 1] with min-max scaling
scaler = MinMaxScaler()
# a Pipeline ensures the scaler is fit only on the training portion of each cross-validation split, avoiding data leakage
pipeline = Pipeline([('s',scaler),('m',model)])
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model
m_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize the result
print('Accuracy: %.3f (%.3f)' % (mean(m_scores), std(m_scores)))
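# For contrast, a minimal sketch of the leaky variant the pipeline above avoids: fitting the
# scaler on the full dataset before cross-validation lets statistics from the held-out folds
# influence the transform (reuses the X, y, model and cv defined above).
X_leaky = MinMaxScaler().fit_transform(X)
leaky_scores = cross_val_score(model, X_leaky, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Leaky accuracy: %.3f (%.3f)' % (mean(leaky_scores), std(leaky_scores)))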
class DistanceClassifier(BaseEstimator):

    """Multifactor Dimensionality Reduction (DistanceClassifier) for feature construction in machine learning"""

    def __init__(self, d='mahalanobis'):
        """Sets up the DistanceClassifier algorithm

        Parameters
        ----------
        d: ('mahalanobis' or 'euclidean')
            Type of distance calculation to use

        Returns
        -------
        None

        """
        # Save params to be recalled later by get_params()
        self.params = locals()  # Must be placed before any local variable definitions
        self.params.pop('self')

        self.d = d
        self.mu = None
        self.Z = None
        self.le = LabelEncoder()


    def fit(self, features, classes):
        """Constructs the DistanceClassifier from the provided training data

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        classes: array-like {n_samples}
            List of class labels for prediction

        Returns
        -------
        None

        """
        # class labels
        classes = self.le.fit_transform(classes)

        # group the data by class label
        X = []
        self.mu = []
        self.Z = []
        for i in np.unique(classes):
            X.append(features[classes == i])
            self.mu.append(np.mean(X[i],axis=0))
            if self.d == 'mahalanobis':
                self.Z.append(np.cov(X[i].transpose()))

        return self

    def predict(self, features):
        """Predict class outputs for an unlabelled feature set"""

        # get distance of features to class clusters
        distances = [self._distance(x) for x in features]

        # assign class label belonging to smallest distance
        class_predict = [np.argmin(d) for d in distances]

        return self.le.inverse_transform(class_predict)

    def _distance(self,x):
        """returns distance measures for features"""
        distance = np.empty([len(self.mu)])

        for i in np.arange(len(self.mu)):
            if self.d == 'mahalanobis' and self.is_invertible(self.Z[i]):
                distance[i] = (x - self.mu[i]).dot(np.linalg.inv(self.Z[i])).dot((x - self.mu[i]).transpose())
            else:
                distance[i] = (x - self.mu[i]).dot((x - self.mu[i]).transpose())

        return distance
    def fit_predict(self, features, classes):
        """Convenience function that fits the provided data then predicts the class labels
        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        classes: array-like {n_samples}
            List of true class labels

        Returns
        ----------
        array-like: {n_samples}
            Predicted class labels for the provided feature matrix

        """
        self.fit(features, classes)
        return self.predict(features)

    def score(self, features, classes, scoring_function=accuracy_score, **scoring_function_kwargs):
        """Estimates the accuracy of the predictions from the constructed feature

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix to predict from
        classes: array-like {n_samples}
            List of true class labels

        Returns
        -------
        accuracy_score: float
            The estimated accuracy of the class predictions

        """
        if not self.mu:
            raise ValueError('The DistanceClassifier model must be fit before score() can be called')

        return scoring_function(classes, self.predict(features), **scoring_function_kwargs)

    def get_params(self, deep=None):
        """Get parameters for this estimator

        This function is necessary for DistanceClassifier to work as a drop-in estimator in,
        e.g., sklearn.cross_validation.cross_val_score

        Parameters
        ----------
        deep: unused
            Only implemented to maintain interface for sklearn

        Returns
        -------
        params: mapping of string to any
            Parameter names mapped to their values
        """
        return self.params

    def is_invertible(self,X):
        """checks if Z is invertible"""
        if len(X.shape) == 2:
            return X.shape[0] == X.shape[1] and np.linalg.matrix_rank(X) == X.shape[0]
        else:
            return False
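
# A minimal usage sketch for the class above (assumption: scikit-learn's iris data stands in
# for real training data; numpy and accuracy_score are already imported by this module).
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    iris = load_iris()
    X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target,
                                              test_size=0.3, random_state=0)

    clf = DistanceClassifier(d='mahalanobis')
    clf.fit(X_tr, y_tr)
    print("DistanceClassifier accuracy: %.3f" % clf.score(X_te, y_te))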
Example 50
cols = list(cat_df)
fig, axes = plt.subplots(nrows=2, ncols=2)

for i in range(0, 2):
    for j in range(0, 2):
        sns.countplot(x=X_train[cols[i * 2 + j]], hue=y_train, ax=axes[i, j])

# --------------
#Importing header files
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
# Code starts here

for i in list(cat_df):
    # fillna() returns a new Series, so assign it back; otherwise the missing values stay untouched
    X_train[i] = X_train[i].fillna('NA')
    X_test[i] = X_test[i].fillna('NA')

    # fit a single encoder on the combined categories so train and test share the same integer codes
    le = LabelEncoder()
    le.fit(X_train[i].tolist() + X_test[i].tolist())
    X_train[i] = le.transform(X_train[i])
    X_test[i] = le.transform(X_test[i])

#y_test = y_test.str.replace('No',0)
y_train.replace({'No': 0, 'Yes': 1}, inplace=True)
y_test.replace({'No': 0, 'Yes': 1}, inplace=True)
# Code ends here

from sklearn.metrics import accuracy_score
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
y_preds = model.predict(X_test)
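
# accuracy_score is imported above but never used; a minimal evaluation step:
accuracy = accuracy_score(y_test, y_preds)
print("Decision tree accuracy: %.3f" % accuracy)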
target = pickle.load(open("../generated/group.p", "rb"))
device_id = pickle.load(open("../generated/device_id.p", "rb"))

trainDevices = pd.read_csv("../../../data/gender_age_train.csv",
                           usecols=["device_id"])
indexes = pd.read_csv("../generated/raddarIndices.csv")
indexes = pd.merge(trainDevices,
                   indexes,
                   how="left",
                   on="device_id",
                   left_index=True).reset_index().drop(["index"], axis=1)
##################
#   Pre Processing
##################
targetEncoder = LabelEncoder()
target = targetEncoder.fit_transform(target)
skfTarget = target.copy()
target = np_utils.to_categorical(target)


##################
#  Build Model
##################
def modelBuilder():
    model = Sequential()
    model.add(
        Dense(200, input_dim=train.shape[1], init='normal', activation="tanh"))
    model.add(Dropout(0.4))
    model.add(Dense(70, input_dim=150, init='normal'))
    model.add(PReLU())
Example 52

df['Sentence'] = df['Sentence'].map(lambda x: clean_text(x))

# In[12]:

X, y = df['Sentence'], df['Emotions']

# ### Ex5: Transform y to one-hot-encoding

# In[15]:

onehot_y = None
# YOUR CODE HERE
from keras.utils import np_utils
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)
onehot_y = np_utils.to_categorical(encoded_y)

# In[16]:

vocab = set()
for s in X.values:
    for el in s.split(' '):
        vocab.add(el)
print("Total Unique words:", len(vocab))

# In[17]:

l = [len(s) for s in X.values]
counts = Counter(l)
plt.bar(counts.keys(), counts.values())
Example 53
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
import numpy as np

np.set_printoptions(precision=2)

# Load the sonar dataset.
sonar = pd.read_excel('../Datasets/sonar.xlsx', sheet_name=0)

X = sonar.iloc[:, 0:(sonar.shape[1] - 1)]

le = LabelEncoder()
y = le.fit_transform(sonar.iloc[:, (sonar.shape[1] - 1)])

class_names = le.classes_

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)

# Grid search
# Select the SVM hyperparameters to test
params = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
Example 54
# Load the dataset
dataset = pd.read_csv("../../Datasets/RAVDESS/speechActorDataset.csv")

# Split features and labels
datasetCopy = dataset
labels = datasetCopy['label']
features = datasetCopy.drop(columns='label')

# Train Test Split
trainIndex = int(len(features) * 0.7)
train_features = features[:trainIndex]
train_labels = labels[:trainIndex]
test_features = features[trainIndex:]
test_labels = labels[trainIndex:]

lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(train_labels))
y_test = np_utils.to_categorical(lb.transform(test_labels))

# Changing Dimension for CNN model
x_traincnn = np.expand_dims(train_features, axis=2)
x_testcnn = np.expand_dims(test_features, axis=2)

model = Sequential()

model.add(Conv1D(256, 5, padding='same', input_shape=(216, 1)))
model.add(Activation('relu'))
model.add(Conv1D(128, 5, padding='same'))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(MaxPooling1D(pool_size=(8)))
Example 55
class WeightAverageEnsembleClassifier(BaseEstimator, ClassifierMixin):
    """
    多数決 or 重み付け平均化でのアンサンブルモデルの識別器 classifier の自作クラス.
    scikit-learn ライブラリの推定器 estimator の基本クラス BaseEstimator, ClassifierMixin を継承している.
    """
    def __init__(self,
                 classifiers,
                 weights=[],
                 train_modes=[],
                 vote_method="probability_vote",
                 clone=False):
        """
        Args :
            classifiers : list <classifier オブジェクト>
                分類器のクラスのオブジェクトのリスト
            weights : list <float>
                各分類器の対する重みの値のリスト : __init()__ の引数と同名のオブジェクトの属性
            vote_method : str ( "majority_vote" or "probability_vote" )
                アンサンブルによる最終的な判断判断手法 : __init()__ の引数と同名のオブジェクトの属性
                "majority_vote"    : 弱識別器の多数決で決定する.多数決方式 (=クラスラベルの argmax() 結果)
                "probability_vote" : 弱識別器の確率値での重み付け結果で決定する.(=クラスの所属確率の argmax() 結果)
        """
        self.classifiers = classifiers
        self.fitted_classifiers = classifiers
        self.weights = weights
        self.n_classes = 0
        self.n_classifier = len(classifiers)
        self.train_modes = train_modes
        self.vote_method = vote_method
        self.clone = clone
        self.encoder = LabelEncoder()

        # Names of the estimator objects passed in classifiers
        if classifiers != None:
            self.named_classifiers = {
                key: value
                for key, value in _name_estimators(classifiers)
            }
        else:
            self.named_classifiers = {}

        for i, named_classifier in enumerate(self.named_classifiers):
            print("name {} : {}".format(
                i, self.named_classifiers[named_classifier]))

        # avoid mutating the shared default list; default every classifier to "train" mode
        if (self.train_modes == []):
            self.train_modes = ["train" for _ in self.classifiers]

        return

    def fit(self, X_train, y_train, X_valid=None, y_valid=None):
        # Use LabelEncoder so the class labels start at 0; this matters for the np.argmax() call inside self.predict().
        self.encoder.fit(y_train)
        y_train = self.encoder.transform(y_train)
        self.n_classes = self.encoder.classes_

        # Fit each classifier in self.classifiers (via clone(clf) when self.clone is set)
        self.fitted_classifiers = []
        for c, clf in enumerate(self.classifiers):
            if (self.train_modes[c] == "train"):
                if (self.clone):
                    # clone(): create a fresh estimator with the same parameters
                    fitted_clf = clone(clf).fit(X_train, y_train, X_valid,
                                                y_valid)
                else:
                    fitted_clf = clf.fit(X_train, y_train, X_valid, y_valid)
            else:
                fitted_clf = clf

            self.fitted_classifiers.append(fitted_clf)

        return self

    def predict(self, X_test):
        #------------------------------------------------------------------------------------------------------
        # vote_method == "probability_vote": weight the class probabilities of each base classifier
        #------------------------------------------------------------------------------------------------------
        if self.vote_method == "probability_vote":
            # np.argmax() : returns the index of the largest element of the given array
            # axis=1 : take the maximum across each row of the 2-D probability array (i.e. per sample)
            vote_results = np.argmax(self.predict_proba(X_test), axis=1)

        #------------------------------------------------------------------------------------------------------
        # vote_method == "majority_vote": take a (weighted) majority vote over the base classifiers
        #------------------------------------------------------------------------------------------------------
        else:
            # Collect the predict() result of each base classifier clf in predictions (list)
            predictions = [
                clf.predict(X_test) for clf in self.fitted_classifiers
            ]
            """            
            for i in range(len(predictions)):
                print( "predictions[{}].shape : {}".format(i, predictions[i].shape) )
                print( "predictions[{}][0:5] : {}".format(i, predictions[i][0:5]) )
            """

            # Transpose predictions so that rows correspond to samples and columns to classifiers
            predictions = np.asarray(predictions).T

            # For each sample, sum the classifiers' votes per class with the given weights (np.bincount) and return the index of the winning class
            vote_results = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1,
                arr=predictions)

        # Map vote_results back to the original class labels via the LabelEncoder
        vote_results = self.encoder.inverse_transform(vote_results)
        return vote_results

    def predict_proba(self, X_test):
        # Collect the predict_proba() result of each base classifier clf in predict_probas (list)
        predict_probas = []
        for clf in self.fitted_classifiers:
            predict_proba = clf.predict_proba(X_test)
            predict_probas.append(predict_proba)
            #print( "predict_proba.shape : ", predict_proba.shape )  # shape = [n_classifer, n_features]

        predict_probas = np.asarray(predict_probas)

        # Weighted average of the class probabilities across classifiers
        ave_probas = np.average(predict_probas, axis=0, weights=self.weights)
        #print( "EnsembleLearningClassifier.predict_proba() { ave_probas } : \n", ave_probas )

        return ave_probas
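
# A minimal usage sketch for the ensemble above. Assumptions: the module already has the
# imports the class itself relies on, the base classifiers must accept
# fit(X_train, y_train, X_valid, y_valid) so plain scikit-learn estimators are wrapped in a
# small adapter, and scikit-learn's iris data stands in for real training data.
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    class FitAdapter:
        """Adapter so a standard scikit-learn classifier matches the expected fit() signature."""
        def __init__(self, clf):
            self.clf = clf
        def fit(self, X_train, y_train, X_valid=None, y_valid=None):
            self.clf.fit(X_train, y_train)
            return self
        def predict(self, X):
            return self.clf.predict(X)
        def predict_proba(self, X):
            return self.clf.predict_proba(X)

    iris = load_iris()
    X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target,
                                              test_size=0.3, random_state=0)

    ensemble = WeightAverageEnsembleClassifier(
        classifiers=[FitAdapter(LogisticRegression(max_iter=1000)),
                     FitAdapter(DecisionTreeClassifier(random_state=0))],
        weights=[0.6, 0.4],
        vote_method="probability_vote")
    ensemble.fit(X_tr, y_tr)
    print("Ensemble predictions:", ensemble.predict(X_te)[:10])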
Example 56
Created on Sun Feb 17 13:33:36 2019

@author: sidha
"""
## Importing The Libraries
import pandas as pd
import matplotlib.pyplot as plt

## Importing a File
dataset = pd.read_csv("Iris.csv")
X = dataset.iloc[:, 1:5].values
Y = dataset.iloc[:, -1].values

## Encoding the Categorical Variable
from sklearn.preprocessing import LabelEncoder
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

## Visulaising
## Parallel Coordinates
from pandas.plotting import parallel_coordinates
plt.figure(figsize=(15, 10))
parallel_coordinates(dataset.drop("Id", axis=1), "Species")
plt.title("Parallel Coordinates Plot", fontsize=20, fontweight="bold")
plt.xlabel("Features", fontsize=15)
plt.ylabel("Features Values", fontsize=15)
plt.legend(loc=1,
           prop={"size": 15},
           frameon=True,
           shadow=True,
           facecolor="White",
X[0]


# In[6]:


Y[0]


# ## Preprocess the Data

# In[7]:


from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
le_Y = LabelEncoder()
Y = le_Y.fit_transform(Y)
Y = Y.reshape(len(Y), 1)
ohe = OneHotEncoder(categorical_features=[0])
Y = ohe.fit_transform(Y).toarray()
Y[0]


# In[8]:


sc_X = StandardScaler()
X = sc_X.fit_transform(X)


# In[9]:
Example 58
    def __encode_data(self, dataframe, label_to_encode):
        y = dataframe[label_to_encode]
        encoder = LabelEncoder()
        y = encoder.fit_transform(y)
        return y
Example 59
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
"""## Preprocess"""

dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
X

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Country
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
# Sex : male, female
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
# encoded numeric values -> categorical
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]
"""### dataset"""

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
Example 60
def main():
    train_0 = pd.read_csv('train.csv')
    test_0 = pd.read_csv('test.csv')
    #print(train_0.head(10))

    header = [
        'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
        'MaritalStatus', 'OverTime'
    ]

    # Drop columns that carry no useful information
    user_id = test_0['user_id']
    train_0 = train_0.drop(['user_id', 'EmployeeCount', 'Over18'], axis=1)
    test_0 = test_0.drop(['user_id', 'EmployeeCount', 'Over18'], axis=1)

    # Encode the categorical features
    for index in header:
        LE = LabelEncoder()
        train_0[index] = LE.fit_transform(train_0[index])
        test_0[index] = LE.transform(test_0[index])
    LE = LabelEncoder()
    label_0 = LE.fit_transform(train_0['Attrition'])
    train_0 = train_0.drop(['Attrition'], axis=1)
    # Note: with this argument order, train_x / label_x are the training features / labels
    # and train_y / label_y are the held-out validation features / labels
    train_x, train_y, label_x, label_y = train_test_split(train_0,
                                                          label_0,
                                                          test_size=0.3,
                                                          random_state=1)
    # Standardization

    # LightGBM hyperparameter tuning

    parameters = {
        'max_depth': [15, 20, 25],
        'learning_rate': [0.01, 0.05],
        'feature_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
        'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
        'bagging_freq': [2, 4, 5, 6, 8],
        'lambda_l1': [0.6, 0.7, 0.8],
        'lambda_l2': [0, 15, 35],
        'cat_smooth': [1, 10, 15]
    }

    LGB = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metric='auc',
        verbose=0,
        learning_rate=0.01,
        num_leaves=35,
        feature_fraction=0.8,
        bagging_fraction=0.7,
        bagging_freq=2,
        lambda_l1=0.8,
        lambda_l2=0,
        max_depth=15,
        #silent = False
        cat_smooth=1)
    # gsearch = GridSearchCV(LGB, param_grid=parameters, scoring='roc_auc', cv = 3)
    # gsearch.fit(train_0, label_0)
    #
    # print("Best score: %0.3f" % gsearch.best_score_)
    # print("Best parameters set:")
    # best_parameters = gsearch.best_estimator_.get_params()
    # for param_name in sorted(parameters.keys()):
    #     print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # LGB.fit(train_0, label_0)
    # predict = LGB.predict_proba(test_0)[:,1]
    #
    # test_0['Attrition'] = predict
    # test_0['user_id'] = user_id
    # test_0[['user_id','Attrition']].to_csv('submit_lgb.csv', index = False)

    LGB.fit(train_x, label_x)
    predict = LGB.predict_proba(train_y)[:, 1]
    print("LGB auc:%0.6lf" % metrics.roc_auc_score(label_y, predict))

    SVM = SVC(kernel='rbf', probability=True, C=0.2)
    SVM.fit(train_x, label_x)
    predict_svm = SVM.predict_proba(train_y)[:, 1]
    print("SVM auc:%0.6lf" % metrics.roc_auc_score(label_y, predict_svm))

    CAT = cat.CatBoostClassifier()
    CAT.fit(train_x, label_x)
    predict_svm = CAT.predict_proba(train_y)[:, 1]
    print("cat auc:%0.6lf" % metrics.roc_auc_score(label_y, predict_svm))

    NG = ng.NGBClassifier()
    NG.fit(train_x, label_x)
    predict_ng = NG.pred_dist(train_y)
    predict_ng = predict_ng.probs[1, :]
    print("NG auc:%0.6lf" % metrics.roc_auc_score(label_y, predict_ng))