Esempio n. 1
0
	def __init__(self,
				cache_size=500, tol=0.01, kernel="rbf",
				skewedness=0.0005, gamma=1/40,
				use_SGD=False, n_iter=25, alpha_g=0.001, alpha_all=0.005):
		"""Store the hyper-parameters and build the feature maps and classifiers.

		One classifier is created per group returned by
		``grouping.get_group2class()`` plus one global classifier (``SVM_all``).
		``n_iter``, ``alpha_g`` and ``alpha_all`` are only consumed when
		``use_SGD`` is true; ``cache_size``, ``tol`` and ``kernel`` are only
		consumed by the SVC branch.
		"""
		self.groups = list(grouping.get_group2class().keys())

		# Keep every constructor argument available on the instance.
		self.cache_size, self.tol, self.kernel = cache_size, tol, kernel
		self.use_SGD, self.n_iter = use_SGD, n_iter
		self.alpha_g, self.alpha_all = alpha_g, alpha_all
		self.skewedness, self.gamma = skewedness, gamma

		# Kernel approximation feature maps: one RBF sampler, three chi2 samplers.
		self.rbf_feature = RBFSampler(gamma=gamma, n_components=500)
		self.chi2_feature_0, self.chi2_feature_1, self.chi2_feature_2 = (
			SkewedChi2Sampler(skewedness=skewedness, n_components=300)
			for _ in range(3)
		)

		if not use_SGD:
			# Kernel SVMs: one per group (C=500) and a global one (C=1000).
			self.SVMs = [
				SVC(cache_size=cache_size, tol=tol, kernel=kernel, C=500)
				for _ in self.groups
			]
			self.SVM_all = SVC(cache_size=cache_size, tol=tol, kernel=kernel, C=1000)
			return

		# SGD variant: per-group models run for 4/5 of the global iteration count.
		self.SVMs = [
			SGD(alpha=alpha_g, epsilon=0.1, n_iter=n_iter*4//5, n_jobs=4)
			for _ in self.groups
		]
		self.SVM_all = SGD(alpha=alpha_all, epsilon=0.1, n_iter=n_iter, n_jobs=4)
Esempio n. 2
0
    def models(self) -> [RegressorMixin]:
        """Return the configured regressors as a list.

        When ``self._models`` is ``None`` it is first populated with a default
        set of regressors. A single (non-list) value stored in ``self._models``
        is returned wrapped in a one-element list.
        """
        if self._models is None:
            # Lazily build the default candidate set on first access.
            svm_family = [LinearSVR(), SVR(), SVR(C=100), SVR(kernel='poly')]
            linear_family = [LinearRegression(), Lasso(), ElasticNet()]
            sgd_family = [SGD(), SGD('epsilon_insensitive'), SGD('squared_epsilon_insensitive')]
            self._models = svm_family + linear_family + sgd_family + [Baseline(strategy='median')]

        configured = self._models
        if isinstance(configured, list):
            return configured
        return [configured]
Esempio n. 3
0
File: sgd.py Progetto: j6e/hyperband
def try_params(n_iterations, params, data):
    """Train and evaluate an SGD model for one hyper-parameter configuration.

    ``n_iterations`` is rounded to the nearest integer and passed to the
    estimator as ``n_iter``. ``params['scaler']`` (if truthy) names a scaler
    class that is instantiated and applied to the train/test features; all
    remaining entries of ``params`` are forwarded to ``SGD``. ``params`` and
    ``data`` themselves are not modified. Returns whatever
    ``train_and_eval_sklearn_classifier`` returns.
    """
    n_iterations = int(round(n_iterations))
    print("n_iterations:", n_iterations)
    pprint(params)

    scaler_name = params['scaler']
    if not scaler_name:
        local_data = data
    else:
        # NOTE(review): eval() executes the configured string as code; this is
        # only safe while the search space is trusted and developer-defined.
        scaler = eval("{}()".format(scaler_name))
        scaled_train = scaler.fit_transform(data['x_train'].astype(float))
        scaled_test = scaler.transform(data['x_test'].astype(float))
        local_data = {
            'x_train': scaled_train,
            'y_train': data['y_train'],
            'x_test': scaled_test,
            'y_test': data['y_test'],
        }

    # Work on a copy without the 'scaler' entry: the caller re-uses the best
    # params in the next round, so the original dict must stay intact.
    estimator_kwargs = {key: value for key, value in params.items() if key != 'scaler'}

    return train_and_eval_sklearn_classifier(
        SGD(n_iter=n_iterations, **estimator_kwargs), local_data)
Esempio n. 4
0
    def __init__(self, env):
        """Fit the state featurizer and create one SGD model per action.

        10000 observations are sampled from ``env.observation_space`` to fit a
        scaler and a union of four RBF feature maps. Each action model is then
        initialised with a single ``partial_fit`` call on the featurized
        result of ``env.reset()`` with target 0.
        """
        self.env = env

        # Sample the observation space; the samples drive both the scaler and
        # the RBF feature maps below.
        sampled_states = np.array(
            [self.env.observation_space.sample() for _ in range(10000)]
        )

        # Scaler fitted on the sampled observations.
        self.scaler = Scaler()
        self.scaler.fit(sampled_states)

        # RBF feature maps with different gamma values cover different parts
        # of the space; names are rbf1..rbf4 in order of decreasing gamma.
        rbf_gammas = (5.0, 2.0, 1.0, 0.5)
        self.featurizer = FeatureUnion([
            ("rbf{}".format(position), RBF(gamma=gamma, n_components=100))
            for position, gamma in enumerate(rbf_gammas, start=1)
        ])
        self.featurizer.fit(self.scaler.transform(sampled_states))

        # One SGD model per discrete action.
        self.action_models = []
        self.nA = self.env.action_space.n

        for _ in range(self.nA):
            action_model = SGD(learning_rate="constant")
            # A first partial_fit call is made right away on the reset state
            # with a dummy target of 0.
            initial_features = self.__featurize_state(self.env.reset())
            action_model.partial_fit([initial_features], [0])
            self.action_models.append(action_model)
Esempio n. 5
0
def createPipeline(df,tLabel,DROP_FIELDS):
    """Fit an SGD model on ``df`` and return the result of ``fit``.

    The feature matrix is ``df`` without the ``DROP_FIELDS`` columns and
    without the ``tLabel`` column; the target is the ``tLabel`` column.
    """
    sgd_options = dict(
        loss='squared_epsilon_insensitive',
        penalty='l2',
        alpha=0.001,
        n_iter=1500,
        epsilon=0.001,
        learning_rate='invscaling',
        warm_start=False,
        shuffle=False,
    )
    estimator = SGD(**sgd_options)

    remaining = df.drop(DROP_FIELDS, axis=1).copy()
    target = df[tLabel].copy()
    features = remaining.drop(tLabel, axis=1)

    return estimator.fit(features, target)
def get_sgd_model():
    """Return an unfitted grid search over ``alpha`` for a modified-huber SGD model.

    The search scores candidates with ROC AUC using ``cv=20``.
    """
    alpha_grid = {'alpha': [0.00006, 0.00007, 0.00008, 0.0001, 0.0005]}
    base_estimator = SGD(random_state=0, shuffle=True, loss='modified_huber')
    return GridSearchCV(base_estimator, alpha_grid, scoring='roc_auc', cv=20)
Esempio n. 7
0
    def SGD(self, alpha, penalty):
        """Fit an SGD model on the training split and predict the test split.

        Prints the accuracy of the predictions against ``self.y_test`` and
        returns the predictions.
        """
        # Inside the method body the name SGD resolves to the module-level
        # estimator, not to this method.
        classifier = SGD(alpha=alpha, penalty=penalty)
        classifier.fit(self.x_train, self.y_train)
        pred = classifier.predict(self.x_test)

        accuracy = metrics.accuracy_score(self.y_test, pred)
        print(" SGD : accurancy_is", accuracy)
        return pred
def pred():
    """Grid-search an SGD classifier and write test-set probabilities to CSV.

    Loads the pickled training features, the training labels and the test
    features from ``../picks/``, reads the raw test table from
    ``../data/testData.tsv`` (only its ``id`` column is used), tunes ``alpha``
    with a 20-fold ROC-AUC grid search, prints ``cv_results_`` and writes the
    predicted probability of the second class to
    ``../results/SGDPredictions.csv``.
    """
    # The pickles are opened through context managers so the file handles are
    # closed deterministically instead of being left to the garbage collector.
    # NOTE(review): pickle.load can execute arbitrary code; these files must
    # come from a trusted source.
    with open("../picks/fittedTrainData.pkl", "rb") as handle:
        trainAfterFit = pickle.load(handle)  # fitted training data
    with open("../picks/predCol", "rb") as handle:
        predCol = pickle.load(handle)  # prediction (label) column
    with open("../picks/fittedTestData.pkl", "rb") as handle:
        testAfterFit = pickle.load(handle)  # fitted test data

    # Raw test data; quoting=3 is csv.QUOTE_NONE.
    test = pd.read_csv('../data/testData.tsv',
                       header=0,
                       delimiter="\t",
                       quoting=3)

    # alpha is the constant that multiplies the regularization term.
    sgd_params = {'alpha': [0.00006, 0.00007, 0.00008, 0.0001, 0.0005]}

    # 'modified_huber' is a smooth loss that tolerates outliers and supports
    # probability estimates, which predict_proba below relies on.
    modelSGD = GridSearchCV(
        SGD(
            random_state=0,  # seed used when shuffling the data
            shuffle=True,  # shuffle the training data after each epoch
            loss='modified_huber'),
        sgd_params,
        scoring='roc_auc',
        cv=20  # an integer is the number of folds
    )
    # Fit the classifier according to the given training data.
    modelSGD.fit(trainAfterFit, predCol)

    # cv_results_ holds the cross-validation results for every parameter
    # setting in the grid.
    print(modelSGD.cv_results_)

    # Probability estimates on the fitted test data; column 1 is the second
    # class in the estimator's class ordering.
    SGDresult = modelSGD.predict_proba(testAfterFit)[:, 1]

    # Store the predictions next to the test ids and write them to CSV.
    SGDoutput = pd.DataFrame(data={"id": test["id"], "sentiment": SGDresult})
    SGDoutput.to_csv('../results/SGDPredictions.csv', index=False, quoting=3)


# if __name__ == '__main__':
#     main()
Esempio n. 9
0
    def sgd(self):
        """Grid-search ``alpha`` for a modified-huber SGD model and keep the best one.

        Fits on ``self.X_train`` / ``self.Y_Train`` with 20-fold ROC-AUC
        scoring, prints the best score and parameters, stores the best
        estimator in ``self.best_clf`` and returns it.
        """
        # Candidate values of the regularization strength.
        alpha_grid = {'alpha': [1e-1, 0.5, 1, 1.5]}

        base_estimator = SGD(max_iter=50, random_state=0, loss='modified_huber', n_jobs=4)
        search = GridSearchCV(base_estimator, alpha_grid, scoring='roc_auc', cv=20)
        search.fit(self.X_train, self.Y_Train)

        summary = "using SGD, Best: %f using %s" % (search.best_score_, search.best_params_)
        print(summary)

        self.best_clf = search.best_estimator_
        return self.best_clf
Esempio n. 10
0
def createPipeline(df, tLabel, DROP_FIELDS):
    """Fit an RBF-feature-map + SGD pipeline on ``df`` and return the fit result.

    Features are ``df`` minus the ``DROP_FIELDS`` columns and minus the
    ``tLabel`` column; the target is the ``tLabel`` column.
    """
    rbf_map = RBF(gamma=0.001, n_components=300, random_state=1)
    sgd_options = dict(
        loss='squared_epsilon_insensitive',
        penalty='l2',
        alpha=0.001,
        n_iter=1500,
        epsilon=0.001,
        learning_rate='invscaling',
        warm_start=False,
        shuffle=False,
    )
    regressor = SGD(**sgd_options)
    pipeline = Pipeline([('transformer', rbf_map), ('sgd', regressor)])

    remaining = df.drop(DROP_FIELDS, axis=1).copy()
    target = df[tLabel].copy()
    features = remaining.drop(tLabel, axis=1)

    return pipeline.fit(features, target)
Esempio n. 11
0
def new_sgd():
    """Return a new ``SGD`` estimator built with no explicit arguments."""
    return SGD()
Esempio n. 12
0
def new_sgd(k):
    """Return a new ``SGD`` estimator whose ``n_iter`` is set to ``k``."""
    return SGD(n_iter=k)
Esempio n. 13
0
# Parameter grids, one per classifier family. C_range and lda_parameters are
# not defined in this excerpt and must exist earlier in the module.
svc_parameters = {"C": C_range, "kernel": ("linear", "poly", "rbf", "sigmoid")}
lgr_parameters = {"penalty": ("l1", "l2"), "C": C_range}
sgd_parameters = {
    "loss": ("hinge", "log", "modified_huber", "squared_hinge", "perceptron",
             "squared_loss", "huber", "epsilon_insensitive",
             "squared_epsilon_insensitive"),
    "penalty": ("none", "l2", "l1", "elasticnet")
}
rfc_parameters = {"n_estimators": np.arange(50, 201, 10)}
efc_parameters = {}  # empty grid for the Extra Tree entry
# abc_parameters = {}
# gbc_parameters = {}

# Each entry is [estimator instance, display name, parameter grid].
classifiers = [[LDA(), "LDA", lda_parameters], [SVC(), "SVC", svc_parameters],
               [LGR(), "LogReg", lgr_parameters],
               [SGD(), "StochGradDesc", sgd_parameters],
               [RFC(), "Random Forest", rfc_parameters],
               [EFC(), "Extra Tree", efc_parameters]]

# [KNN(), "KNearestNeighbor", knn_parameters],
# ,
#     [ABC(), "AdaBoost", abc_parameters],
#     [GBC(), "Gradient Boosting Classifier", gbc_parameters]

count = 0
clf_count = len(classifiers)
# Second dimension of the training array; presumably the number of channels
# per sample -- TODO confirm against the data loader.
channels = data["X_train"].shape[1]

# T = Normalizer()

# n_splits and test_size are passed positionally and are not defined in this
# excerpt.
cv = ShuffleSplit(n_splits, test_size)
    def _ModelSetting(self, model_name, cv_train_p=None):
        """Instantiate the estimator selected by ``model_name``.

        The estimator is stored in ``self.clf``. For most models a
        '-'-joined string of the tunable parameters is stored in
        ``self.model_p``; the 'PAC', 'PCP' and 'NB' branches leave it as the
        empty string. An unrecognised ``model_name`` leaves ``self.clf`` as
        ``None`` and ``self.model_p`` as ''.

        Args:
            model_name: key selecting the estimator ('K-MEANS', 'K-MINI',
                'PAC', 'PCP', 'NB', 'SGD', 'LSVC', 'CSVC', 'NSVC', 'LR',
                'LinR', 'DT', 'RF', 'ADA', 'GBM').
            cv_train_p: only read by the 'K-MEANS' and 'K-MINI' branches,
                where it is used as ``n_clusters``.
        """
        self.model_p = ''
        self.clf = None

        if model_name == 'K-MEANS':
            # pars = [n_clusters, max_iter, tol]
            pars = [cv_train_p, 50000, 0.00001]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = KMEANS(n_clusters=pars[0],
                              init='k-means++',
                              n_init=10,
                              max_iter=pars[1],
                              tol=pars[2],
                              precompute_distances='auto',
                              verbose=0,
                              random_state=None,
                              copy_x=True,
                              n_jobs=4)
        if model_name == 'K-MINI':
            # pars = [n_clusters, max_iter, tol]
            pars = [cv_train_p, 10000, 0.0]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = KMINI(n_clusters=pars[0],
                             init='k-means++',
                             max_iter=pars[1],
                             batch_size=100,
                             verbose=0,
                             compute_labels=True,
                             random_state=None,
                             tol=pars[2],
                             max_no_improvement=10,
                             init_size=None,
                             n_init=3,
                             reassignment_ratio=0.01)

        if model_name == 'PAC':
            self.clf = PAC(C=1.0,
                           fit_intercept=True,
                           n_iter=5,
                           shuffle=True,
                           verbose=0,
                           loss='hinge',
                           n_jobs=1,
                           random_state=None,
                           warm_start=False,
                           class_weight='balanced')
        if model_name == 'PCP':
            self.clf = PCP(penalty=None,
                           alpha=0.0001,
                           fit_intercept=True,
                           n_iter=20,
                           shuffle=False,
                           verbose=0,
                           eta0=1.0,
                           n_jobs=6,
                           random_state=0,
                           class_weight=None,
                           warm_start=False)
        if model_name == 'NB':
            self.clf = NB()

        if model_name == 'SGD':
            # pars = [alpha, class_weight, loss, n_iter]
            pars = [1e-4, None, 'hinge', 200]
            # loss = 'modified_huber', 'hinge' n_iter = 5
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = SGD(loss=pars[2],
                           penalty='l2',
                           alpha=pars[0],
                           l1_ratio=0.15,
                           fit_intercept=True,
                           n_iter=pars[3],
                           shuffle=True,
                           verbose=0,
                           epsilon=0.1,
                           n_jobs=1,
                           random_state=None,
                           learning_rate='optimal',
                           eta0=0.0,
                           power_t=0.5,
                           class_weight=pars[1],
                           warm_start=False,
                           average=False)
        if model_name == 'LSVC':
            # pars = [tol, C, class_weight, max_iter]
            pars = [1e-5, 1e-2, 'balanced', 2000]
            # 'crammer_singer'
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = LSVC(penalty='l2',
                            loss='squared_hinge',
                            dual=False,
                            tol=pars[0],
                            C=pars[1],
                            multi_class='ovr',
                            fit_intercept=True,
                            intercept_scaling=1,
                            class_weight=pars[2],
                            verbose=0,
                            random_state=None,
                            max_iter=pars[3])
        if model_name == 'CSVC':
            # pars = [C, kernel, gamma, class_weight]
            # NOTE: the first assignment is immediately overwritten by the
            # second one, so only the linear-kernel setting takes effect.
            pars = [8, 'rbf', 0.00048828125, 'balanced']
            pars = [1e2, 'linear', 1e-3, 'auto']
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = CSVC(C=pars[0],
                            kernel=pars[1],
                            degree=3,
                            gamma=pars[2],
                            coef0=0.0,
                            shrinking=True,
                            probability=True,
                            tol=1e-3,
                            cache_size=5000,
                            class_weight=pars[3],
                            verbose=False,
                            max_iter=-1,
                            random_state=None)
        if model_name == 'NSVC':
            # pars = [nu, kernel, gamma, class_weight]
            #pars = [0.5, 'rbf', 0.00048828125, 'auto']
            pars = [0.5, 'rbf', 'auto', 'auto']
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = NSVC(nu=pars[0],
                            kernel=pars[1],
                            degree=3,
                            gamma=pars[2],
                            coef0=0.0,
                            shrinking=True,
                            probability=False,
                            tol=0.001,
                            cache_size=500,
                            class_weight=pars[3],
                            verbose=False,
                            max_iter=-1,
                            decision_function_shape=None,
                            random_state=None)
        if model_name == 'LR':
            # pars = [penalty, C, class_weight, max_iter]
            pars = ['l2', 1e+2, 'balanced', 3000]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = LR(penalty=pars[0],
                          dual=False,
                          tol=0.0001,
                          C=pars[1],
                          fit_intercept=True,
                          intercept_scaling=1,
                          class_weight=pars[2],
                          random_state=None,
                          solver='liblinear',
                          max_iter=pars[3],
                          multi_class='ovr',
                          verbose=0,
                          warm_start=False,
                          n_jobs=1)
        if model_name == 'LinR':
            # pars = [normalize]
            pars = [True]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = LinR(fit_intercept=True,
                            normalize=pars[0],
                            copy_X=True,
                            n_jobs=1)
        if model_name == 'DT':
            # pars = [max_depth, class_weight]
            pars = [8, 'balanced']
            self.model_p = '-'.join(str(p) for p in pars)
            # NOTE(review): min_samples_split=1 may be rejected by the
            # installed scikit-learn version -- TODO confirm.
            self.clf = DT(criterion='gini',
                          splitter='best',
                          max_depth=pars[0],
                          min_samples_split=1,
                          min_samples_leaf=1,
                          min_weight_fraction_leaf=0.0,
                          max_features=None,
                          random_state=None,
                          max_leaf_nodes=None,
                          class_weight=pars[1],
                          presort=False)
        if model_name == 'RF':
            # pars = [n_estimators, max_depth, class_weight]
            pars = [5, 7, 'balanced']
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = RF(n_estimators=pars[0],
                          criterion='gini',
                          max_depth=pars[1],
                          min_samples_split=2,
                          min_samples_leaf=1,
                          min_weight_fraction_leaf=0.0,
                          max_features='auto',
                          max_leaf_nodes=None,
                          bootstrap=True,
                          oob_score=False,
                          n_jobs=2,
                          random_state=None,
                          verbose=0,
                          warm_start=False,
                          class_weight=pars[2])
        if model_name == 'ADA':
            # Only pars[0] (max_depth of the base tree) is used below; the
            # other two values appear in model_p only.
            pars = [13, 18, 0.05]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = ADA(base_estimator=DT(max_depth=pars[0],
                                             class_weight='balanced'),
                           n_estimators=50,
                           learning_rate=1.0,
                           algorithm='SAMME.R',
                           random_state=None)
        if model_name == 'GBM':
            # pars = [n_estimators, learning_rate, max_depth]
            pars = [20, 0.03, 13]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = GBM(loss='deviance',
                           learning_rate=pars[1],
                           n_estimators=pars[0],
                           subsample=1.0,
                           min_samples_split=2,
                           min_samples_leaf=1,
                           min_weight_fraction_leaf=0.0,
                           max_depth=pars[2],
                           init=None,
                           random_state=None,
                           max_features=None,
                           verbose=0,
                           max_leaf_nodes=None,
                           warm_start=False,
                           presort='auto')
# Python 2 script fragment: evaluates several learners on every file found in
# the working directory. The inner loop continues past the visible excerpt.
path = '...'
os.chdir(path)

d = {}
test_size = 0.1

# Every entry of the working directory is treated as a dataset file.
datasets = []
for i in os.listdir(os.getcwd()):
    datasets.append(i)

# set the ids of the folds that lead to a different random_state of kFold, leading to len(folds) * 10-cross-validation processes
folds = [1, 2, 3, 4, 5, 7, 23, 66, 123, 2018]
# else, set just a seed into the folds list for one only 10-cross-validation procedure
# NOTE: this assignment overrides the list above, so only seed 23 is used.
folds = [23]

# Six SGD variants (two losses x three penalties) followed by MNB and BNB.
learners = [SGD(loss= 'log') ,  SGD(loss= 'modified_huber'), SGD(loss= 'log' , penalty = 'l1') , SGD(loss= 'log' , penalty = 'elasticnet') , SGD(loss= 'modified_huber' , penalty = 'l1') , SGD(loss= 'modified_huber' , penalty = 'elasticnet') , MNB(), BNB()]

for t in learners:
    
    l = []
    print '#### \t' , t, '\t ####' 
    for x in range(0, len(datasets)):

        # Work on a fresh copy of the learner for every dataset.
        lea = copy.deepcopy(t)
        acc = []
        stdev = []
        
        # The first row of each file is skipped; rows with missing values are dropped.
        dataframe = read_csv(datasets[x] , skiprows = 1 , header=None)
        dataframe = dataframe.dropna()
        dataset = dataframe.values
        print
Esempio n. 16
0
File: sgd.py Progetto: icrn/pythonML
# Rows of X_all from index lentrain onward form the test matrix.
X_test_chuli = X_all[lentrain:]
# Bare expression: only displays a value when run interactively (notebook/REPL).
X_train_chuli.shape

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import SGDClassifier as SGD

# Stratified 35-fold cross-validation of an SGD logistic model; the test-set
# prediction is the average of the per-fold probabilities.
#
# shuffle=False already makes the split deterministic, so random_state is not
# passed: it has no effect without shuffling and recent scikit-learn releases
# raise ValueError for that combination.
folds = StratifiedKFold(n_splits=35, shuffle=False)
oof = np.zeros(X_train_chuli.shape[0])  # out-of-fold probabilities
predictions = np.zeros(X_test_chuli.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli,
                                                       y_train)):
    print("Fold :{}".format(fold_ + 1))
    trn_data = X_train_chuli[trn_idx]
    trn_label = y_train[trn_idx]
    val_data = X_train_chuli[val_idx]
    val_label = y_train[val_idx]

    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
    # 'log' is kept for compatibility with the version this script targets.
    model_SGD = SGD(alpha=0.00001, random_state=2, shuffle=True, loss='log')
    model_SGD.fit(trn_data, trn_label)  # Fit the model.

    # Score the held-out fold once and keep the result as the out-of-fold
    # prediction (oof was previously allocated but never filled).
    val_proba = model_SGD.predict_proba(val_data)[:, 1]
    oof[val_idx] = val_proba
    print("auc score: {:<8.5f}".format(
        metrics.roc_auc_score(val_label, val_proba)))

    # Each fold contributes an equal share to the averaged test prediction.
    predictions += model_SGD.predict_proba(X_test_chuli)[:, 1] / folds.n_splits

print(len(predictions))
predictions[:4]
SGD_output = pd.DataFrame({"ID": df_test["ID"], "Pred": predictions})
SGD_output.to_csv('SGD_new.csv', index=False)
Esempio n. 17
0
# -*- coding: utf-8 -*-
# Fit a linear SVM (hinge-loss SGD) on two synthetic blobs and print the
# learned coefficients, the intercept and the data shapes.
from sklearn.linear_model import SGDClassifier as SGD
# make_blobs is imported from the public sklearn.datasets package; the private
# sklearn.datasets.samples_generator module no longer exists in current
# scikit-learn releases.
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np

X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.6)
clf = SGD(loss='hinge', alpha=0.01, max_iter=200, fit_intercept=True)
clf.fit(X, y)
print("回归系数:", clf.coef_)  # label reads "regression coefficients"
print("偏差", clf.intercept_)  # label reads "bias"
print("##################")
print(X.shape)
print(y.shape)
Esempio n. 18
0
# Fit one SGD model per job column and collect its coefficients as one column
# of `parameters` (shape: features x jobs).
user_job, user_skill, skillset_numeric, skillsetjob_numeric = prj.get_model()

jobs = user_job.shape[1]
features = user_skill.shape[1]
parameters = zeros((features, jobs))

for i in range(user_job.shape[1]):
    #print i
    # Boolean mask of the rows whose entry for job i equals 1.
    index = user_job[:, i] == 1
    #print index
    # NOTE(review): y is taken from the masked rows only, so every target
    # value equals 1 -- confirm this is the intended training target.
    y = user_job[:, i][index]
    X = user_skill[index]
    if True in index:
        #print y.shape
        #print X.shape
        clf = SGD().fit(X, y)
        coefs = clf.coef_
        #print coefs.shape
        parameters[:, i] = coefs
    else:
        # No row has this job: the column of `parameters` stays zero.
        pass
#print parameters

threshold = 0.2
userid = 100
# nans for user 1

# Mask of the NaN entries in the job row of the selected user.
user_nan = isnan(user_job[userid])
for j, i in enumerate(user_nan):
    #print i
    # j is job id
Esempio n. 19
0
    def SGD(self, alpha, penalty):
        """Fit an SGD model on the training split and return test predictions."""
        # Inside the method body the name SGD resolves to the module-level
        # estimator, not to this method.
        classifier = SGD(alpha=alpha, penalty=penalty)
        classifier.fit(self.x_train, self.y_train)
        return classifier.predict(self.x_test)
Esempio n. 20
0
# TF-IDF features for paired texts: both sides of each pair are vectorized
# with a shared vocabulary, concatenated per pair, and fed to an SGD
# logistic model whose precision is reported on the validation pairs.
vectorizer = TfidfVectorizer(vocabulary=word_index)
X = X1_train + X1_valid + X2_train + X2_valid
X = vectorizer.fit_transform(X)

# Split the stacked matrix back into its four parts; l1, l2 and l3 are the
# cumulative boundaries and are defined outside this excerpt.
X1_train = X[:l1].toarray()
X1_valid = X[l1:l2].toarray()
X2_train = X[l2:l3].toarray()
X2_valid = X[l3:].toarray()

# Concatenate the two sides of every training pair into one feature row.
X_train = []
for i in range(l1):
    X_train.append(list(X1_train[i]) + list(X2_train[i]))
X_train = sparse.csr_matrix(X_train)

# Same concatenation for the validation pairs.
X_valid = []
for i in range(len(X1_valid)):
    X_valid.append(list(X1_valid[i]) + list(X2_valid[i]))
X_valid = sparse.csr_matrix(X_valid)

# max_iter is an iteration count and must be an integer; the float literal
# 1e4 is rejected by parameter validation in recent scikit-learn releases.
model_SGD = SGD(alpha=0.0008,
                random_state=2,
                shuffle=True,
                loss='log',
                max_iter=10000)
model_SGD.fit(X_train, y_train)  # Fit the model.
print("precision score: {:<8.5f}".format(
    precision_score(y_valid, model_SGD.predict(X_valid))))
Esempio n. 21
0
# Cross-validated training loop: each fold fits a model, scores ROC AUC on
# the held-out part and adds its share to the averaged test prediction.
# NOTE(review): `auc` is not appended to `aucs` in the visible lines; the
# snippet may be truncated -- confirm.
aucs = []
for fold_, (train_index, test_index) in enumerate(folds.split(train_data, train_label)):
    print("Fold :{}".format(fold_ + 1))
    cv_train_data, cv_train_label= train_data[train_index], train_label[train_index]
    cv_test_data, cv_test_label = train_data[test_index], train_label[test_index]

    # Logistic Regression
    # model = LR(solver='lbfgs')
    # model.fit(cv_train_data, cv_train_label)
    # auc = metrics.roc_auc_score(cv_test_label, model.predict_proba(cv_test_data)[:, 1])
    # predictions += model.predict_proba(test_data)[:, 1] / folds.n_splits

    # SGD classifier
    # model = LogisticRegression(solver="lbfgs",max_iter=3000)
    # NOTE(review): tol=10000 is a very large stopping tolerance and likely
    # ends training almost immediately -- confirm it is intentional.
    model = SGD(alpha=0.00001, penalty='l2', tol=10000, shuffle=True, loss='log')
    # Naive Bayes
    # model = MultinomialNB()
    # k-nearest neighbours
    # model = KNeighborsClassifier()
    # model = svm.LinearSVC()
    # Random forest
    # model = RandomForestClassifier()

    # model = CalibratedClassifierCV(model, cv=5)
    model.fit(cv_train_data, cv_train_label)
    auc = metrics.roc_auc_score(cv_test_label, model.predict_proba(cv_test_data)[:, 1])
    # Each fold contributes an equal share to the averaged test prediction.
    predictions += model.predict_proba(test_data)[:, 1] / folds.n_splits

    # model = SVC(gamma='auto', probability=True)
    # model.fit(cv_train_data, cv_train_label)
Esempio n. 22
0
    print "MNB效果"
    print(
        "20 Fold CV Score for Multinomial Naive Bayes: ",
        np.mean(
            cross_val_score(model_NB, train_x, label, cv=20,
                            scoring='roc_auc')))

    # SGD
    from sklearn.linear_model import SGDClassifier as SGD

    sgd_params = {
        'alpha': [0.00006, 0.00007, 0.00008, 0.0001, 0.0005]
    }  # Regularization parameter

    model_SGD = GridSearchCV(
        SGD(random_state=0, shuffle=True, loss='modified_huber'),
        sgd_params,
        scoring='roc_auc',
        cv=20)  # Find out which regularization parameter works the best.

    model_SGD.fit(train_x, label)  # Fit the model.
    print "SGD效果"
    print(model_SGD.grid_scores_)

    # 分别输出 LR、MNB、SGD 的结果
    # LR_result = model_LR.predict_proba(test_x)[:,1]  # We only need the probabilities that the movie review was a 7 or greater.
    LR_result = model_LR.predict(
        test_x
    )  # We only need the probabilities that the movie review was a 7 or greater.
    LR_output = pd.DataFrame(data={
        "id": test["id"],
# In[17]:

# Features are every column of `result` except the three listed below;
# the target is the 'GoB' column.
columns = ['GoB' , 'text' , 'final_rule']
X = result.drop(columns,axis = 1)
y = result['GoB'].values


# # Build the models

# In[53]:

from sklearn import cross_validation, grid_search, linear_model, metrics
from sklearn.linear_model import SGDClassifier as SGD

classifier = SGD()
# Bare expression: only displays the estimator when run in a notebook cell.
classifier

# Single-point grid: every parameter has exactly one candidate value.
parameters_grid = {
    'loss' : ['log'],
    'penalty' : ['l1'],
    'n_iter' : [2000],
    'alpha' : [0.000775],
    'learning_rate' : ['optimal'],
    'eta0' : [0.0001],
}


# In[54]:

train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(X, y, 
 def SGD_regression(self):
     """Fit an adaptive-learning-rate SGD model and store its predictions.

     Predictions for the training and test features are stored in
     ``self.y_pre_train`` and ``self.y_pre_test`` respectively.
     """
     regressor = SGD(eta0=0.1, learning_rate='adaptive')
     regressor.fit(self.train_X, self.train_y)

     # Training predictions are stored before the test predictions are computed.
     train_predictions = regressor.predict(self.train_X)
     self.y_pre_train = train_predictions
     test_predictions = regressor.predict(self.test_X)
     self.y_pre_test = test_predictions
Esempio n. 25
0
def main():

    global count, path_csv, test_size
    path_csv = ''
    random_shuffle_id = 23

    for file_csv in l_csv:
        book = xlwt.Workbook(encoding="utf-8")
        start = datetime.now()
        folds = [1, 2]  #, 3, 4, 5, 7, 23, 66, 123, 2018]

        for fold in folds:
            message = "Sheet " + str(fold)
            sheet1 = book.add_sheet(message)

            SIZE = (1 - test_size) * split_train_test(
                test_size, 1, fold, 0, random_shuffle_id, file_csv, path_csv)
            count = -1

            for col in range(
                    1, 2
            ):  #we could increase the second argument of range, in case that more we would like to run the experiment again for the same fold with different shuffle e.g. 5x2 evaluation

                print '***********file*********** = ', file_csv
                print '***********col************ = ', col
                print '***********fold*********** = ', fold
                print 'SIZE of L + U              = ', int(SIZE)
                print

                myspace = np.linspace(int(0.05 * SIZE),
                                      int(0.25 * SIZE) + 1, 3)
                learners = [
                    SGD(loss='log'),
                    SGD(loss='modified_huber'),
                    SGD(loss='log', penalty='l1'),
                    SGD(loss='log', penalty='elasticnet'),
                    SGD(loss='modified_huber', penalty='l1'),
                    SGD(loss='modified_huber', penalty='elasticnet')
                ]

                for lea in learners:

                    counter_j = -1
                    counter_jj = -1
                    count = count + 1
                    my_clf = lea
                    print str(my_clf)[0:str(my_clf).find('(')] + '(' + str(
                        my_clf)[str(my_clf).find('loss'):str(my_clf).
                                find(',',
                                     str(my_clf).find('loss'))] + ' , ' + str(
                                         my_clf
                                     )[str(my_clf).find('penalty'):str(my_clf).
                                       find(',',
                                            str(my_clf).find('penalty'))] + ')'

                    for j in myspace:

                        j = int(round(j))
                        counter_j = counter_j + 1
                        n_labeled = j  # number of samples that are initially labeled
                        print '**** Labeled instances = ', j

                        metrics = ['lc', 'entropy', 'sm', 'random']

                        for jj in metrics:

                            trn_ds, tst_ds, y_train, fully_labeled_trn_ds, initial_instances = split_train_test(
                                test_size, n_labeled, fold, random_shuffle_id,
                                col, file_csv, path_csv)
                            trn_ds2 = copy.deepcopy(trn_ds)
                            lbr = IdealLabeler(fully_labeled_trn_ds)
                            train_data = int(initial_instances -
                                             initial_instances * test_size)
                            quota = len(
                                y_train
                            ) - n_labeled  # number of samples to query

                            # Comparing UncertaintySampling strategy with RandomSampling.
                            counter_jj = counter_jj + 1

                            if jj != 'random':

                                print '**** Metric of Uncertainty Sampling strategy = ', jj
                                qs1 = UncertaintySampling(
                                    trn_ds,
                                    kernel=jj,
                                    model=SklearnProbaAdapter(my_clf))
                                model = SklearnProbaAdapter(my_clf)
                                E_out_1, ttt, trn_ds_returned, aa, bb = run(
                                    trn_ds, tst_ds, lbr, model, qs1, quota, j)

                            else:

                                print '**** Baseline Sampling strategy = ', jj
                                qs1 = RandomSampling(
                                    trn_ds, model=SklearnProbaAdapter(my_clf))
                                model = SklearnProbaAdapter(my_clf)
                                E_out_1, ttt, trn_ds_returned, aa, bb = run(
                                    trn_ds, tst_ds, lbr, model, qs1, quota, j)

                            if count != 0:
                                down_cells = len(E_out_1) + 9
                            else:
                                down_cells = 0

                            i = 8 + down_cells * count

                            sheet1.write(i - 7, counter_jj + counter_j,
                                         jj)  # metric of incertaintly
                            sheet1.write(i - 6, counter_jj + counter_j,
                                         quota)  # amount of U
                            sheet1.write(i - 5, counter_jj + counter_j,
                                         aa)  # instanes inserted per iteration
                            sheet1.write(i - 4, counter_jj + counter_j,
                                         bb)  # amount of L
                            sheet1.write(
                                i - 3, counter_jj + counter_j,
                                trn_ds_returned.len_labeled()
                            )  # amount of training data after active learning procedure
                            sheet1.write(
                                i - 2, counter_jj + counter_j,
                                trn_ds_returned.len_unlabeled()
                            )  # amount of unlabeled instances after active learning procedure

                            sheet1.write(
                                i - 8, counter_jj + counter_j,
                                str(my_clf)[0:str(my_clf).find('(')] + '(' +
                                str(my_clf)[str(my_clf).find('loss'):str(
                                    my_clf).find(',',
                                                 str(my_clf).find('loss'))] +
                                ' , ' + str(my_clf)
                                [str(my_clf).find('penalty'):str(my_clf).
                                 find(',',
                                      str(my_clf).find('penalty'))] + ')')
                            for n in E_out_1:

                                sheet1.write(i, counter_jj + counter_j, n)
                                i = i + 1
                            #print 'error in last iteration: ', E_out_1[-1]
                            print
        print("> Compilation Time : %s",
              (datetime.now() - start).total_seconds())
        print("AIAIexperiment_" + file_csv[0:-4] + ".xls")
        book.save("AIAIexperimetn_" + file_csv[0:-4] + "_incremental_" +
                  str(fold) + ".xls")

        times_l.append((datetime.now() - start).total_seconds())
 # 			clf.fit(np.abs(fv_train), label_train)
 # 			prob_train = np.delete(clf.predict_proba(fv_train),1, axis=1).flatten()
 # 			prob_test  = np.delete(clf.predict_proba(fv_test) ,1, axis=1).flatten()
 # 			score_train = np.append(score_train, [prob_train], axis=0)
 # 			score_test  = np.append(score_test , [prob_test] , axis=0)
 # 		print 'execution time:{:.2f}s'.format(T.time()-tStart)
 #==============================================================================
 ########## SGD
 # Fit one SGD model per regularisation penalty on (fv_train, label_train)
 # and collect each model's decision_function output for the training and
 # test features. The per-model score vectors are stacked as rows and then
 # transposed after the loops.
 # NOTE(review): the row/column layout described here assumes
 # decision_function returns a 1-D vector per call (binary problem) --
 # TODO confirm.
 tStart = T.time()
 M = 'SGD'
 print 'FV_{}\tmix={}\tSGD'.format(FV, MIXTURE)
 for ALPHA in [0.0001]:
     for i, PENALTY in enumerate(['l2', 'l1', 'elasticnet']):
         for LR in ['optimal']:  #,'constant','invscaling']:
             # Register a name for this configuration: FV tag + 'SGD' + penalty.
             model.append(FV + '_' + M + str(PENALTY))
             clf = SGD(penalty=PENALTY, alpha=ALPHA,
                       learning_rate=LR)  #, class_weight='balanced')
             clf.fit(fv_train, label_train)
             prob_train = clf.decision_function(fv_train)
             prob_test = clf.decision_function(fv_test)
             # i is the penalty index, so the first penalty (re)starts the
             # score lists and later penalties are appended as extra rows.
             # With more than one LR value the lists would be reset on every
             # LR pass of the first penalty.
             if (i == 0):  # and (FV=='gen'):
                 score_train = [prob_train]
                 score_test = [prob_test]
             else:
                 score_train = np.append(score_train, [prob_train],
                                         axis=0)
                 score_test = np.append(score_test, [prob_test], axis=0)
 ###
 # Swap axes so that the stacked per-model rows become columns.
 score_train = np.transpose(score_train)
 score_test = np.transpose(score_test)
 joblib.dump(
     score_train, FV_pth + 'score/' + str(MIXTURE) + '_' + FV + '_' +
Esempio n. 27
0
# Fit the TF-IDF vectorizer on the training and test data together, then
# transform the combined collection in one pass.
allData = trainData + testData
lenTrain = len(trainData)
tfv.fit(allData)
allData = tfv.transform(allData)

# The first lenTrain rows are the training part; the remainder is the test part.
train, test = allData[:lenTrain], allData[lenTrain:]

# Candidate values for the regularization parameter.
sgdParams = {
    'alpha': [0.00006, 0.00007, 0.00008, 0.0001, 0.0005],
}

# Grid-search the regularization parameter, scored by ROC AUC with cv=20.
baseSGD = SGD(loss='modified_huber', shuffle=True, random_state=0)
modelSGD = GridSearchCV(
    estimator=baseSGD,
    param_grid=sgdParams,
    scoring='roc_auc',
    cv=20,
)

# Fit the search, then keep the second probability column for the test rows.
modelSGD.fit(train, trainSet['sentiment'])
SGDResult = modelSGD.predict_proba(test)[:, 1]

# Write id, review text and predicted score to an Excel sheet.
outputColumns = {
    "id": testSet['id'],
    "review": testSet['review'],
    "sentiment": SGDResult,
}
SGDOutput = pd.DataFrame(data=outputColumns)
SGDOutput.to_excel("Result.xlsx", sheet_name='Result', index=False)
print(modelSGD.best_score_)
Esempio n. 28
0
# Fit the MNB model and report its score on the training and validation splits.
NaiveBayes = MNB()
NaiveBayes.fit(X_train, Y_train)
nb_train_acc = NaiveBayes.score(X_train, Y_train)
nb_val_acc = NaiveBayes.score(X_val, Y_val)
print("Training acc:", nb_train_acc, "\nValidation acc:", nb_val_acc)
# Results recorded by the original author:
# Training acc: 0.99
# Validation acc: 0.9325


# SGD

# In[ ]:


# Fit an SGD model (modified Huber loss, 5 iterations, fixed seed) and report
# its score on the training and validation splits.
sgd = SGD(
    loss='modified_huber',
    max_iter=5,
    random_state=0,
    n_jobs=4,
)
sgd.fit(X_train, Y_train)
sgd_train_acc = sgd.score(X_train, Y_train)
sgd_val_acc = sgd.score(X_val, Y_val)
print("Training acc:", sgd_train_acc, "\nValidation acc:", sgd_val_acc)
# Results recorded by the original author:
# Training acc: 0.995
# Validation acc: 0.88

# In[ ]:


# Grid-search alpha for the sgd estimator, scored by ROC AUC with cv=20.
parameters = {
    'alpha': [0.1, 0.5, 1, 1.5],
}
sgd_search = GridSearchCV(
    estimator=sgd,
    param_grid=parameters,
    scoring='roc_auc',
    cv=20,
)
sgd_search.fit(X_train, Y_train)
print("The best parameters: " + str(sgd_search.best_params_))
# Result recorded by the original author:
# The best parameters: {'alpha': 0.1}
Esempio n. 29
0
from sklearn.linear_model import SGDClassifier as SGD
import matplotlib.pyplot as plt
import numpy as np

# Toy problem: four 2-D points, each carrying its own class label.
x = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 1, 2, 3]

# Every hyper-parameter is spelled out explicitly. The former `n_iter=None`
# argument is intentionally omitted: `n_iter` was deprecated in scikit-learn
# 0.19 and removed in 0.21, so passing it raises TypeError on current
# releases. `max_iter` is its replacement, and omitting `n_iter` is equivalent
# on the older releases where its default was already None.
clf = SGD(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
          eta0=0.0, fit_intercept=True, l1_ratio=0.15,
          learning_rate='optimal', loss='hinge', max_iter=5,
          n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
          shuffle=True, tol=None, verbose=0, warm_start=False)
clf.fit(x, y)

# Inspect the fitted model: a prediction for an unseen point, the learned
# weights and intercepts, and the raw decision scores for one training point.
print(clf.predict([[4, 4]]))
print(clf.coef_)
print(clf.intercept_)
print(clf.decision_function([[2, 2]]))
                            ngram_range=(1, 2))

    #  This is an example of the parameter grid that we would use for searching.
    #  We found that a really large parameter grid sometimes took too long to be
    #  useful (for example, 700+ fits would take more than 3 hours running on 8 cores).
    param_grid_2 = [{'vect__max_df': [.75, .5, .1, .075]}]

    #  This is our pipeline to train the model with
    #  We use a TfidfVectorizer and a Stochastic Gradient Descent Classifier
    #  More information can be found in the write-up and on the GitHub README
    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf',
                          SGD(loss='modified_huber',
                              alpha=0.00015,
                              n_iter=np.ceil(10**6 / len(df['review'])),
                              l1_ratio=0.05,
                              penalty='l2',
                              shuffle=False,
                              learning_rate='optimal'))])

    #  Beginning of the example showing how we used GridSearchCV to find parameters.
    #  This is only here to showcase how we found parameters - it would use param_grid_2 from above.
    # gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid_2,
    #                            scoring='accuracy',
    #                            cv=5,
    #                            verbose=1,
    #                            n_jobs=6)  # how many cores to run on - this gets all of them
    #
    #
    #
    # gs_lr_tfidf.fit(X_train, y_train)