def __init__(self, cache_size=500, tol=0.01, kernel="rbf", skewedness=0.0005, gamma=1/40, use_SGD=False, n_iter=25, alpha_g=0.001, alpha_all=0.005):
    """Set up kernel-approximation samplers and one classifier per group.

    One estimator is created for every entry of ``self.groups`` (stored in
    ``self.SVMs``) plus one extra estimator (``self.SVM_all``).

    ``cache_size``, ``tol`` and ``kernel`` are only passed to the ``SVC``
    estimators; ``n_iter``, ``alpha_g`` and ``alpha_all`` are only passed to
    the ``SGD`` estimators, which are selected with ``use_SGD=True``.
    """
    self.groups = list(grouping.get_group2class().keys())

    # Remember every hyper-parameter on the instance.
    self.cache_size = cache_size
    self.tol = tol
    self.kernel = kernel
    self.use_SGD = use_SGD
    self.n_iter = n_iter
    self.alpha_g = alpha_g
    self.alpha_all = alpha_all
    self.skewedness = skewedness
    self.gamma = gamma

    # Kernel approximation feature maps: one RBF sampler, three chi2 samplers.
    self.rbf_feature = RBFSampler(gamma=gamma, n_components=500)
    for index in range(3):
        chi2_sampler = SkewedChi2Sampler(skewedness=skewedness,
                                         n_components=300)
        setattr(self, "chi2_feature_{}".format(index), chi2_sampler)

    if use_SGD:
        # The per-group models run for 4/5 of the iterations of the global one.
        group_models = []
        for _ in self.groups:
            group_models.append(
                SGD(alpha=alpha_g, epsilon=0.1, n_iter=n_iter*4//5, n_jobs=4))
        self.SVMs = group_models
        self.SVM_all = SGD(alpha=alpha_all, epsilon=0.1, n_iter=n_iter,
                           n_jobs=4)
    else:
        shared = dict(cache_size=cache_size, tol=tol, kernel=kernel)
        group_models = []
        for _ in self.groups:
            group_models.append(SVC(C=500, **shared))
        self.SVMs = group_models
        self.SVM_all = SVC(C=1000, **shared)
def models(self) -> [RegressorMixin]:
    """Return the candidate regressors as a list.

    When ``self._models`` is ``None`` it is first filled with the default
    set of estimators. A single (non-list) value stored in ``self._models``
    is returned wrapped in a one-element list.
    """
    if self._models is None:
        defaults = [
            LinearSVR(),
            SVR(),
            SVR(C=100),
            SVR(kernel='poly'),
            LinearRegression(),
            Lasso(),
            ElasticNet(),
            SGD(),
            SGD('epsilon_insensitive'),
            SGD('squared_epsilon_insensitive'),
            Baseline(strategy='median'),
        ]
        self._models = defaults

    if isinstance(self._models, list):
        return self._models
    return [self._models]
def try_params(n_iterations, params, data):
    """Train and evaluate an ``SGD`` classifier for one parameter setting.

    ``n_iterations`` is rounded to an integer and passed as ``n_iter``.
    ``params`` must contain a ``'scaler'`` entry: when it is truthy, the
    named scaler is instantiated and applied to ``x_train``/``x_test``;
    every other entry is forwarded to ``SGD`` as a keyword argument.
    Returns whatever ``train_and_eval_sklearn_classifier`` returns.
    """
    n_iterations = int(round(n_iterations))
    print("n_iterations:", n_iterations)
    pprint(params)

    scaler_name = params['scaler']
    if not scaler_name:
        local_data = data
    else:
        # NOTE(review): eval() on a configuration string runs arbitrary code;
        # this is only safe while the search space is fully trusted.
        scaler = eval("{}()".format(scaler_name))
        scaled_train = scaler.fit_transform(data['x_train'].astype(float))
        scaled_test = scaler.transform(data['x_test'].astype(float))
        local_data = {
            'x_train': scaled_train,
            'y_train': data['y_train'],
            'x_test': scaled_test,
            'y_test': data['y_test'],
        }

    # Work on a copy: the caller re-uses the best params in the next round,
    # so the original dict must keep its 'scaler' entry.
    classifier_kwargs = dict(params)
    del classifier_kwargs['scaler']

    classifier = SGD(n_iter=n_iterations, **classifier_kwargs)
    return train_and_eval_sklearn_classifier(classifier, local_data)
def __init__(self, env):
    """Fit a scaler and RBF featurizer on sampled observations and create
    one ``SGD`` model per action.

    10000 observations are sampled from ``env.observation_space`` to fit the
    scaler and the featurizer. Each action model is then primed with a single
    ``partial_fit`` call on the featurized state returned by ``env.reset()``
    with target ``0``.
    """
    self.env = env

    # Sample the observation space; the samples drive both fitting steps.
    sampled = []
    for _ in range(10000):
        sampled.append(self.env.observation_space.sample())
    observation_examples = np.array(sampled)

    # Scaler fitted on the sampled observations.
    self.scaler = Scaler()
    self.scaler.fit(observation_examples)

    # Four RBF feature maps with different gammas, concatenated into one
    # featurized representation of a state.
    rbf_stages = [
        ("rbf1", RBF(gamma=5.0, n_components=100)),
        ("rbf2", RBF(gamma=2.0, n_components=100)),
        ("rbf3", RBF(gamma=1.0, n_components=100)),
        ("rbf4", RBF(gamma=0.5, n_components=100)),
    ]
    self.featurizer = FeatureUnion(rbf_stages)
    scaled_examples = self.scaler.transform(observation_examples)
    self.featurizer.fit(scaled_examples)

    # One model per discrete action.
    self.action_models = []
    self.nA = self.env.action_space.n
    action_index = 0
    while action_index < self.nA:
        action_model = SGD(learning_rate="constant")
        # A single partial_fit call on a reset state, with target 0.
        start_features = self.__featurize_state(self.env.reset())
        action_model.partial_fit([start_features], [0])
        self.action_models.append(action_model)
        action_index += 1
def createPipeline(df, tLabel, DROP_FIELDS):
    """Fit an ``SGD`` model on ``df`` and return the fitted model.

    The feature matrix is ``df`` without the ``DROP_FIELDS`` columns and
    without the ``tLabel`` column; the target is the ``tLabel`` column.
    """
    settings = {
        'loss': 'squared_epsilon_insensitive',
        'penalty': 'l2',
        'alpha': 0.001,
        'n_iter': 1500,
        'epsilon': 0.001,
        'learning_rate': 'invscaling',
        'warm_start': False,
        'shuffle': False,
    }
    estimator = SGD(**settings)

    features = df.drop(DROP_FIELDS, axis=1).copy()
    target = df[tLabel].copy()
    features = features.drop(tLabel, axis=1)

    return estimator.fit(features, target)
def get_sgd_model():
    """Return an unfitted grid search over ``alpha`` for an ``SGD`` estimator.

    The search scores candidates with ``roc_auc`` and uses ``cv=20``.
    """
    alpha_grid = {'alpha': [0.00006, 0.00007, 0.00008, 0.0001, 0.0005]}
    base_estimator = SGD(random_state=0, shuffle=True, loss='modified_huber')
    return GridSearchCV(base_estimator, alpha_grid, scoring='roc_auc', cv=20)
def SGD(self, alpha, penalty):
    """Fit an ``SGD`` model on the training split, print the test accuracy
    and return the test predictions.
    """
    hyper_params = {'alpha': alpha, 'penalty': penalty}
    estimator = SGD(**hyper_params)
    estimator.fit(self.x_train, self.y_train)

    predictions = estimator.predict(self.x_test)
    accuracy = metrics.accuracy_score(self.y_test, predictions)
    print(" SGD : accurancy_is", accuracy)
    return predictions
def pred():
    """Grid-search an ``SGD`` classifier and write test-set probabilities.

    Reads the pickled, already-vectorised train/test data and the target
    column from ``../picks/``, tunes ``alpha`` with a 20-fold grid search
    scored by ROC AUC, prints the search results and writes the predicted
    probability of the second class to ``../results/SGDPredictions.csv``.
    """

    def _load_pickle(path):
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # files produced by a trusted pipeline.
        # The context manager guarantees the file handle is closed.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    # Load fitted training data
    trainAfterFit = _load_pickle("../picks/fittedTrainData.pkl")
    # Load prediction column
    predCol = _load_pickle("../picks/predCol")
    # Load fitted test data
    testAfterFit = _load_pickle("../picks/fittedTestData.pkl")

    # Load the raw test data (only its "id" column is used below)
    test = pd.read_csv('../data/testData.tsv', header=0, delimiter="\t",
                       quoting=3)

    # Candidate values for the constant that multiplies the regularization term
    sgd_params = {'alpha': [0.00006, 0.00007, 0.00008, 0.0001, 0.0005]}

    # Initialize the grid search around the SGD classifier
    modelSGD = GridSearchCV(
        SGD(
            random_state=0,  # seed used when shuffling the data
            shuffle=True,  # shuffle the training data after each epoch
            loss='modified_huber'  # a loss that supports predict_proba
        ),
        sgd_params,
        scoring='roc_auc',
        cv=20  # number of folds
    )

    # Fit the classifier according to the given training data.
    modelSGD.fit(trainAfterFit, predCol)

    # cv_results_ holds the scores for every parameter setting in the grid.
    print(modelSGD.cv_results_)

    # Probability estimates for the fitted test data; column 1 is the second
    # class in label order.
    SGDresult = modelSGD.predict_proba(testAfterFit)[:, 1]

    # Create and store predictions in DataFrame and csv
    SGDoutput = pd.DataFrame(data={"id": test["id"], "sentiment": SGDresult})
    SGDoutput.to_csv('../results/SGDPredictions.csv', index=False, quoting=3)


# if __name__ == '__main__':
#     main()
def sgd(self):
    """Grid-search ``alpha`` for an ``SGD`` classifier and keep the winner.

    The best estimator is stored on ``self.best_clf`` and returned.
    """
    # Regularization parameter candidates.
    # An earlier grid: {'alpha': [ 0.18,0.17,0.19,0.185]}
    alpha_grid = {'alpha': [1e-1, 0.5, 1, 1.5]}

    base_estimator = SGD(max_iter=50, random_state=0, loss='modified_huber',
                         n_jobs=4)
    search = GridSearchCV(base_estimator, alpha_grid, scoring='roc_auc',
                          cv=20)

    # Find out which regularization parameter works the best.
    search.fit(self.X_train, self.Y_Train)
    summary = (search.best_score_, search.best_params_)
    print("using SGD, Best: %f using %s" % summary)

    self.best_clf = search.best_estimator_
    return self.best_clf
def createPipeline(df, tLabel, DROP_FIELDS):
    """Fit an ``RBF`` transformer followed by an ``SGD`` model on ``df``.

    The feature matrix is ``df`` without the ``DROP_FIELDS`` columns and
    without the ``tLabel`` column; the target is the ``tLabel`` column.
    Returns the fitted ``Pipeline``.
    """
    rbf_step = RBF(gamma=0.001, n_components=300, random_state=1)
    sgd_settings = {
        'loss': 'squared_epsilon_insensitive',
        'penalty': 'l2',
        'alpha': 0.001,
        'n_iter': 1500,
        'epsilon': 0.001,
        'learning_rate': 'invscaling',
        'warm_start': False,
        'shuffle': False,
    }
    sgd_step = SGD(**sgd_settings)
    pipeline = Pipeline([('transformer', rbf_step), ('sgd', sgd_step)])

    features = df.drop(DROP_FIELDS, axis=1).copy()
    target = df[tLabel].copy()
    features = features.drop(tLabel, axis=1)

    return pipeline.fit(features, target)
def new_sgd():
    """Return a new ``SGD`` estimator built without any keyword overrides."""
    overrides = dict()
    estimator = SGD(**overrides)
    return estimator
def new_sgd(k):
    """Return a new ``SGD`` estimator with ``n_iter`` set to ``k``."""
    overrides = dict(n_iter=k)
    estimator = SGD(**overrides)
    return estimator
# Hyper-parameter grids, one per classifier listed in `classifiers` below.
# `C_range` and `lda_parameters` are defined outside this snippet.
svc_parameters = {"C": C_range, "kernel": ("linear", "poly", "rbf", "sigmoid")}
lgr_parameters = {"penalty": ("l1", "l2"), "C": C_range}
sgd_parameters = {
    "loss": ("hinge", "log", "modified_huber", "squared_hinge", "perceptron", "squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"),
    "penalty": ("none", "l2", "l1", "elasticnet")
}
rfc_parameters = {"n_estimators": np.arange(50, 201, 10)}
efc_parameters = {}  # empty grid for the extra-trees classifier
# abc_parameters = {}
# gbc_parameters = {}

# Each entry is [estimator instance, display name, parameter grid].
classifiers = [[LDA(), "LDA", lda_parameters], [SVC(), "SVC", svc_parameters], [LGR(), "LogReg", lgr_parameters], [SGD(), "StochGradDesc", sgd_parameters], [RFC(), "Random Forest", rfc_parameters], [EFC(), "Extra Tree", efc_parameters]]
# Disabled candidates:
# [KNN(), "KNearestNeighbor", knn_parameters],
# ,
# [ABC(), "AdaBoost", abc_parameters],
# [GBC(), "Gradient Boosting Classifier", gbc_parameters]

count = 0
clf_count = len(classifiers)
# Size of the second axis of the training array.
channels = data["X_train"].shape[1]
# T = Normalizer()
# NOTE(review): both arguments are passed positionally — confirm they match
# the ShuffleSplit signature of the scikit-learn version in use.
cv = ShuffleSplit(n_splits, test_size)
def _ModelSetting(self, model_name, cv_train_p=None):
    """Instantiate the estimator selected by ``model_name``.

    Resets ``self.model_p`` to ``''`` and ``self.clf`` to ``None``, then
    builds the estimator whose key equals ``model_name`` and stores it in
    ``self.clf``. For most models the hard-coded hyper-parameters are kept
    in a local ``pars`` list and joined with ``'-'`` into ``self.model_p``;
    the 'PAC', 'PCP' and 'NB' branches leave ``self.model_p`` empty.

    ``cv_train_p`` is only used as ``n_clusters`` by the 'K-MEANS' and
    'K-MINI' branches. An unknown ``model_name`` leaves ``self.clf`` as
    ``None``. Nothing is returned.
    """
    self.model_p = ''
    self.clf = None
    if model_name == 'K-MEANS':
        # pars = [n_clusters, max_iter, tol]
        pars = [cv_train_p, 50000, 0.00001]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = KMEANS(n_clusters=pars[0], init='k-means++', n_init=10, max_iter=pars[1], tol=pars[2], precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=4)
    if model_name == 'K-MINI':
        # pars = [n_clusters, max_iter, tol]
        pars = [cv_train_p, 10000, 0.0]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = KMINI(n_clusters=pars[0], init='k-means++', max_iter=pars[1], batch_size=100, verbose=0, compute_labels=True, random_state=None, tol=pars[2], max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01)
    if model_name == 'PAC':
        self.clf = PAC(C=1.0, fit_intercept=True, n_iter=5, shuffle=True, verbose=0, loss='hinge', n_jobs=1, random_state=None, warm_start=False, class_weight='balanced')
    if model_name == 'PCP':
        self.clf = PCP(penalty=None, alpha=0.0001, fit_intercept=True, n_iter=20, shuffle=False, verbose=0, eta0=1.0, n_jobs=6, random_state=0, class_weight=None, warm_start=False)
    if model_name == 'NB':
        self.clf = NB()
    if model_name == 'SGD':
        # pars = [alpha, class_weight, loss, n_iter]
        pars = [1e-4, None, 'hinge', 200]  # loss = 'modified_huber', 'hinge' n_iter = 5
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = SGD(loss=pars[2], penalty='l2', alpha=pars[0], l1_ratio=0.15, fit_intercept=True, n_iter=pars[3], shuffle=True, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=pars[1], warm_start=False, average=False)
    if model_name == 'LSVC':
        # pars = [tol, C, class_weight, max_iter]
        pars = [1e-5, 1e-2, 'balanced', 2000]  # 'crammer_singer'
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = LSVC(penalty='l2', loss='squared_hinge', dual=False, tol=pars[0], C=pars[1], multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=pars[2], verbose=0, random_state=None, max_iter=pars[3])
    if model_name == 'CSVC':
        # pars = [C, kernel, gamma, class_weight]
        # NOTE(review): the first assignment is immediately overwritten by
        # the second one, so only the 'linear' setting takes effect.
        pars = [8, 'rbf', 0.00048828125, 'balanced']
        pars = [1e2, 'linear', 1e-3, 'auto']
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = CSVC(C=pars[0], kernel=pars[1], degree=3, gamma=pars[2], coef0=0.0, shrinking=True, probability=True, tol=1e-3, cache_size=5000, class_weight=pars[3], verbose=False, max_iter=-1, random_state=None)
    if model_name == 'NSVC':
        # pars = [nu, kernel, gamma, class_weight]
        #pars = [0.5, 'rbf', 0.00048828125, 'auto']
        pars = [0.5, 'rbf', 'auto', 'auto']
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = NSVC(nu=pars[0], kernel=pars[1], degree=3, gamma=pars[2], coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=500, class_weight=pars[3], verbose=False, max_iter=-1, decision_function_shape=None, random_state=None)
    if model_name == 'LR':
        # pars = [penalty, C, class_weight, max_iter]
        pars = ['l2', 1e+2, 'balanced', 3000]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = LR(penalty=pars[0], dual=False, tol=0.0001, C=pars[1], fit_intercept=True, intercept_scaling=1, class_weight=pars[2], random_state=None, solver='liblinear', max_iter=pars[3], multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
    if model_name == 'LinR':
        # pars = [normalize]
        pars = [True]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = LinR(fit_intercept=True, normalize=pars[0], copy_X=True, n_jobs=1)
    if model_name == 'DT':
        # pars = [max_depth, class_weight]
        pars = [8, 'balanced']
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = DT(criterion='gini', splitter='best', max_depth=pars[0], min_samples_split=1, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, class_weight=pars[1], presort=False)
    if model_name == 'RF':
        # pars = [n_estimators, max_depth, class_weight]
        pars = [5, 7, 'balanced']
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = RF(n_estimators=pars[0], criterion='gini', max_depth=pars[1], min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=2, random_state=None, verbose=0, warm_start=False, class_weight=pars[2])
    if model_name == 'ADA':
        # Only pars[0] (max_depth of the base tree) is used below; the other
        # two values appear only in self.model_p.
        pars = [13, 18, 0.05]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = ADA(base_estimator=DT(max_depth=pars[0], class_weight='balanced'), n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
    if model_name == 'GBM':
        # pars = [n_estimators, learning_rate, max_depth]
        pars = [20, 0.03, 13]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = GBM(loss='deviance', learning_rate=pars[1], n_estimators=pars[0], subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=pars[2], init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
# Experiment driver (Python 2 syntax): iterates over every learner and every
# dataset file found in `path`.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# source, and the snippet appears to stop inside the inner loop — confirm
# against the original file.
path = '...'
os.chdir(path)
d = {}
test_size = 0.1

# Every entry of the working directory is treated as a dataset file.
datasets = []
for i in os.listdir(os.getcwd()):
    datasets.append(i)

# Fold ids: each id leads to a different random_state of KFold, i.e.
# len(folds) separate 10-fold cross-validation runs.
folds = [1, 2, 3, 4, 5, 7, 23, 66, 123, 2018]
# Alternatively keep a single seed for one 10-fold cross-validation run.
# This second assignment overrides the list above.
folds = [23]

learners = [SGD(loss= 'log') , SGD(loss= 'modified_huber'), SGD(loss= 'log' , penalty = 'l1') , SGD(loss= 'log' , penalty = 'elasticnet') , SGD(loss= 'modified_huber' , penalty = 'l1') , SGD(loss= 'modified_huber' , penalty = 'elasticnet') , MNB(), BNB()]

for t in learners:
    l = []
    print '#### \t' , t, '\t ####'
    for x in range(0, len(datasets)):
        # Work on a fresh copy so that each dataset starts from an
        # untouched learner object.
        lea = copy.deepcopy(t)
        acc = []
        stdev = []
        # The first row of the file is skipped and rows with missing values
        # are dropped.
        dataframe = read_csv(datasets[x] , skiprows = 1 , header=None)
        dataframe = dataframe.dropna()
        dataset = dataframe.values
        print
# 35-fold stratified cross-validation of an SGD classifier; the test-set
# probabilities of all folds are averaged and written to 'SGD_new.csv'.
X_test_chuli = X_all[lentrain:]
X_train_chuli.shape  # notebook-style inspection; no effect in a script

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import SGDClassifier as SGD

# With shuffle=False the folds are deterministic and a random_state has no
# effect; scikit-learn >= 0.24 raises ValueError when one is passed anyway,
# so the seed is omitted here.
folds = StratifiedKFold(n_splits=35, shuffle=False)

# NOTE(review): `oof` is allocated but never filled in this snippet —
# confirm whether out-of-fold predictions are expected by later code.
oof = np.zeros(X_train_chuli.shape[0])
predictions = np.zeros(X_test_chuli.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli, y_train)):
    print("Fold :{}".format(fold_ + 1))
    trn_data = X_train_chuli[trn_idx]
    trn_label = y_train[trn_idx]
    val_data = X_train_chuli[val_idx]
    val_label = y_train[val_idx]

    # NOTE(review): loss='log' was renamed 'log_loss' in scikit-learn 1.1 —
    # adjust when upgrading.
    model_SGD = SGD(alpha=0.00001, random_state=2, shuffle=True, loss='log')
    model_SGD.fit(trn_data, trn_label)  # Fit the model.

    print("auc score: {:<8.5f}".format(
        metrics.roc_auc_score(val_label,
                              model_SGD.predict_proba(val_data)[:, 1])))

    # Each fold contributes 1/n_splits of the final test prediction.
    predictions += model_SGD.predict_proba(X_test_chuli)[:, 1] / folds.n_splits

print(len(predictions))
predictions[:4]  # notebook-style inspection; no effect in a script

SGD_output = pd.DataFrame({"ID": df_test["ID"], "Pred": predictions})
SGD_output.to_csv('SGD_new.csv', index=False)
# -*- coding: utf-8 -*-
# Fit a linear SVM (hinge loss) with SGD on two synthetic blobs and print
# the learned coefficients, the intercept and the data shapes.
from sklearn.linear_model import SGDClassifier as SGD
# make_blobs is imported from the public package: the private module
# sklearn.datasets.samples_generator was removed in scikit-learn 0.24.
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np

X, y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.6)

clf = SGD(loss='hinge', alpha=0.01, max_iter=200, fit_intercept=True)
clf.fit(X, y)

print("回归系数:", clf.coef_)
print("偏差", clf.intercept_)
print("##################")
print(X.shape)
print(y.shape)
user_job, user_skill, skillset_numeric, skillsetjob_numeric = prj.get_model() jobs = user_job.shape[1] features = user_skill.shape[1] parameters = zeros((features, jobs)) for i in range(user_job.shape[1]): #print i index = user_job[:, i] == 1 #print index y = user_job[:, i][index] X = user_skill[index] if True in index: #print y.shape #print X.shape clf = SGD().fit(X, y) coefs = clf.coef_ #print coefs.shape parameters[:, i] = coefs else: pass #print parameters threshold = 0.2 userid = 100 # nans for user 1 user_nan = isnan(user_job[userid]) for j, i in enumerate(user_nan): #print i # j is job id
def SGD(self, alpha, penalty):
    """Fit an ``SGD`` model on the training split and return its
    predictions for the test split.
    """
    estimator = SGD(penalty=penalty, alpha=alpha)
    estimator.fit(self.x_train, self.y_train)
    return estimator.predict(self.x_test)
# TF-IDF vectorise both text collections with a shared vocabulary, join the
# two feature vectors of each sample side by side, and fit an SGD classifier.
vectorizer = TfidfVectorizer(vocabulary=word_index)

# Vectorise everything in one call, then slice the four parts back out.
X = X1_train + X1_valid + X2_train + X2_valid
X = vectorizer.fit_transform(X)

X1_train = X[:l1].toarray()
X1_valid = X[l1:l2].toarray()
X2_train = X[l2:l3].toarray()
X2_valid = X[l3:].toarray()

# Row i of the training matrix is row i of part 1 followed by row i of part 2.
X_train = []
for i in range(l1):
    X_train.append(list(X1_train[i]) + list(X2_train[i]))
X_train = sparse.csr_matrix(X_train)

X_valid = []
for i in range(len(X1_valid)):
    X_valid.append(list(X1_valid[i]) + list(X2_valid[i]))
X_valid = sparse.csr_matrix(X_valid)

# max_iter is an iteration count and must be an integer; the float 1e4 is
# rejected by scikit-learn's parameter validation.
model_SGD = SGD(alpha=0.0008, random_state=2, shuffle=True, loss='log',
                max_iter=10000)
model_SGD.fit(X_train, y_train)  # Fit the model.

print("precision score: {:<8.5f}".format(
    precision_score(y_valid, model_SGD.predict(X_valid))))
# Cross-validation loop: fit one model per fold, score it on the held-out
# part with ROC AUC and accumulate the averaged test-set probabilities.
# NOTE(review): `aucs` is never appended to in the visible code — confirm
# whether `auc` should be collected after each fold.
aucs = []
for fold_, (train_index, test_index) in enumerate(folds.split(train_data, train_label)):
    print("Fold :{}".format(fold_ + 1))
    cv_train_data, cv_train_label= train_data[train_index], train_label[train_index]
    cv_test_data, cv_test_label = train_data[test_index], train_label[test_index]

    # Logistic Regression
    # model = LR(solver='lbfgs')
    # model.fit(cv_train_data, cv_train_label)
    # auc = metrics.roc_auc_score(cv_test_label, model.predict_proba(cv_test_data)[:, 1])
    # predictions += model.predict_proba(test_data)[:, 1] / folds.n_splits

    # SGD classifier
    # model = LogisticRegression(solver="lbfgs",max_iter=3000)
    # NOTE(review): tol=10000 is an unusually large stopping tolerance —
    # confirm it was not meant to be max_iter.
    model = SGD(alpha=0.00001, penalty='l2', tol=10000, shuffle=True, loss='log')

    # Naive Bayes
    # model = MultinomialNB()

    # k-nearest neighbours
    # model = KNeighborsClassifier()

    # model = svm.LinearSVC()

    # Random forest
    # model = RandomForestClassifier()

    # model = CalibratedClassifierCV(model, cv=5)

    model.fit(cv_train_data, cv_train_label)
    auc = metrics.roc_auc_score(cv_test_label, model.predict_proba(cv_test_data)[:, 1])
    # Each fold contributes 1/n_splits of the final test prediction.
    predictions += model.predict_proba(test_data)[:, 1] / folds.n_splits

    # model = SVC(gamma='auto', probability=True)
    # model.fit(cv_train_data, cv_train_label)
print "MNB效果" print( "20 Fold CV Score for Multinomial Naive Bayes: ", np.mean( cross_val_score(model_NB, train_x, label, cv=20, scoring='roc_auc'))) # SGD from sklearn.linear_model import SGDClassifier as SGD sgd_params = { 'alpha': [0.00006, 0.00007, 0.00008, 0.0001, 0.0005] } # Regularization parameter model_SGD = GridSearchCV( SGD(random_state=0, shuffle=True, loss='modified_huber'), sgd_params, scoring='roc_auc', cv=20) # Find out which regularization parameter works the best. model_SGD.fit(train_x, label) # Fit the model. print "SGD效果" print(model_SGD.grid_scores_) # 分别输出 LR、MNB、SGD 的结果 # LR_result = model_LR.predict_proba(test_x)[:,1] # We only need the probabilities that the movie review was a 7 or greater. LR_result = model_LR.predict( test_x ) # We only need the probabilities that the movie review was a 7 or greater. LR_output = pd.DataFrame(data={ "id": test["id"],
# In[17]: columns = ['GoB' , 'text' , 'final_rule'] X = result.drop(columns,axis = 1) y = result['GoB'].values # # Создаем модельки # In[53]: from sklearn import cross_validation, grid_search, linear_model, metrics from sklearn.linear_model import SGDClassifier as SGD classifier = SGD() classifier parameters_grid = { 'loss' : ['log'], 'penalty' : ['l1'], 'n_iter' : [2000], 'alpha' : [0.000775], 'learning_rate' : ['optimal'], 'eta0' : [0.0001], } # In[54]: train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(X, y,
def SGD_regression(self):
    """Fit an ``SGD`` model on the training data and store its predictions.

    Predictions for the training and test inputs are written to
    ``self.y_pre_train`` and ``self.y_pre_test``; nothing is returned.
    """
    settings = {'eta0': 0.1, 'learning_rate': 'adaptive'}
    regressor = SGD(**settings)
    regressor.fit(self.train_X, self.train_y)

    train_predictions = regressor.predict(self.train_X)
    self.y_pre_train = train_predictions
    test_predictions = regressor.predict(self.test_X)
    self.y_pre_test = test_predictions
def main():
    # Active-learning experiment driver (Python 2 syntax).
    #
    # For every CSV file in `l_csv`, every fold id, every learner, every
    # initial labeled-set size and every query strategy, one run is executed
    # through `run(...)` and its results are written into an xlwt sheet
    # (one sheet per fold).
    #
    # NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    # source; in particular the level of the final print / book.save /
    # times_l.append statements is an assumption — confirm against the
    # original file.
    global count, path_csv, test_size
    path_csv = ''
    random_shuffle_id = 23
    for file_csv in l_csv:
        book = xlwt.Workbook(encoding="utf-8")
        start = datetime.now()
        folds = [1, 2]  #, 3, 4, 5, 7, 23, 66, 123, 2018]
        for fold in folds:
            message = "Sheet " + str(fold)
            sheet1 = book.add_sheet(message)
            # NOTE(review): here the result of split_train_test is used as a
            # number, while further below it is unpacked into five values —
            # presumably the return shape depends on the arguments; verify.
            SIZE = (1 - test_size) * split_train_test(
                test_size, 1, fold, 0, random_shuffle_id, file_csv, path_csv)
            count = -1
            for col in range(
                    1, 2
            ):  # the upper bound of range can be raised to repeat the experiment for the same fold with a different shuffle, e.g. 5x2 evaluation
                print '***********file*********** = ', file_csv
                print '***********col************ = ', col
                print '***********fold*********** = ', fold
                print 'SIZE of L + U = ', int(SIZE)
                print
                # Three initial labeled-set sizes between 5% and 25% of SIZE.
                myspace = np.linspace(int(0.05 * SIZE), int(0.25 * SIZE) + 1, 3)
                learners = [
                    SGD(loss='log'),
                    SGD(loss='modified_huber'),
                    SGD(loss='log', penalty='l1'),
                    SGD(loss='log', penalty='elasticnet'),
                    SGD(loss='modified_huber', penalty='l1'),
                    SGD(loss='modified_huber', penalty='elasticnet')
                ]
                for lea in learners:
                    counter_j = -1
                    counter_jj = -1
                    count = count + 1
                    my_clf = lea
                    # Short label built from the estimator's repr:
                    # "<ClassName>(loss=... , penalty=...)".
                    print str(my_clf)[0:str(my_clf).find('(')] + '(' + str(
                        my_clf)[str(my_clf).find('loss'):str(my_clf).
                                find(',', str(my_clf).find('loss'))] + ' , ' + str(
                                    my_clf
                                )[str(my_clf).find('penalty'):str(my_clf).
                                  find(',', str(my_clf).find('penalty'))] + ')'
                    for j in myspace:
                        j = int(round(j))
                        counter_j = counter_j + 1
                        n_labeled = j  # number of samples that are initially labeled
                        print '**** Labeled instances = ', j
                        # Query strategies; 'random' is the baseline.
                        metrics = ['lc', 'entropy', 'sm', 'random']
                        for jj in metrics:
                            trn_ds, tst_ds, y_train, fully_labeled_trn_ds, initial_instances = split_train_test(
                                test_size, n_labeled, fold, random_shuffle_id,
                                col, file_csv, path_csv)
                            trn_ds2 = copy.deepcopy(trn_ds)
                            lbr = IdealLabeler(fully_labeled_trn_ds)
                            train_data = int(initial_instances -
                                             initial_instances * test_size)
                            quota = len(
                                y_train
                            ) - n_labeled  # number of samples to query

                            # Comparing UncertaintySampling strategy with RandomSampling.
                            counter_jj = counter_jj + 1
                            if jj != 'random':
                                print '**** Metric of Uncertainty Sampling strategy = ', jj
                                qs1 = UncertaintySampling(
                                    trn_ds,
                                    kernel=jj,
                                    model=SklearnProbaAdapter(my_clf))
                                model = SklearnProbaAdapter(my_clf)
                                E_out_1, ttt, trn_ds_returned, aa, bb = run(
                                    trn_ds, tst_ds, lbr, model, qs1, quota, j)
                            else:
                                print '**** Baseline Sampling strategy = ', jj
                                qs1 = RandomSampling(
                                    trn_ds, model=SklearnProbaAdapter(my_clf))
                                model = SklearnProbaAdapter(my_clf)
                                E_out_1, ttt, trn_ds_returned, aa, bb = run(
                                    trn_ds, tst_ds, lbr, model, qs1, quota, j)

                            # Row offset of this learner's block in the sheet:
                            # 9 header rows plus one row per E_out_1 entry,
                            # repeated `count` times.
                            if count != 0:
                                down_cells = len(E_out_1) + 9
                            else:
                                down_cells = 0
                            i = 8 + down_cells * count
                            sheet1.write(i - 7, counter_jj + counter_j,
                                         jj)  # uncertainty metric
                            sheet1.write(i - 6, counter_jj + counter_j,
                                         quota)  # amount of U
                            sheet1.write(i - 5, counter_jj + counter_j,
                                         aa)  # instances inserted per iteration
                            sheet1.write(i - 4, counter_jj + counter_j,
                                         bb)  # amount of L
                            sheet1.write(
                                i - 3, counter_jj + counter_j,
                                trn_ds_returned.len_labeled()
                            )  # amount of training data after active learning procedure
                            sheet1.write(
                                i - 2, counter_jj + counter_j,
                                trn_ds_returned.len_unlabeled()
                            )  # amount of unlabeled instances after active learning procedure
                            # Same short estimator label as printed above.
                            sheet1.write(
                                i - 8, counter_jj + counter_j,
                                str(my_clf)[0:str(my_clf).find('(')] + '(' +
                                str(my_clf)[str(my_clf).find('loss'):str(
                                    my_clf).find(',',
                                                 str(my_clf).find('loss'))] +
                                ' , ' + str(my_clf)
                                [str(my_clf).find('penalty'):str(my_clf).
                                 find(',', str(my_clf).find('penalty'))] + ')')
                            # One row per entry of E_out_1.
                            for n in E_out_1:
                                sheet1.write(i, counter_jj + counter_j, n)
                                i = i + 1
                            #print 'error in last iteration: ', E_out_1[-1]
                            print
            # NOTE(review): under Python 2 these print(...) calls with two
            # arguments print a tuple; the '%s' is not interpolated.
            print("> Compilation Time : %s",
                  (datetime.now() - start).total_seconds())
            # NOTE(review): the printed name ("AIAIexperiment_") and the saved
            # name ("AIAIexperimetn_") differ — confirm which is intended.
            print("AIAIexperiment_" + file_csv[0:-4] + ".xls")
            book.save("AIAIexperimetn_" + file_csv[0:-4] + "_incremental_" +
                      str(fold) + ".xls")
        times_l.append((datetime.now() - start).total_seconds())
# clf.fit(np.abs(fv_train), label_train) # prob_train = np.delete(clf.predict_proba(fv_train),1, axis=1).flatten() # prob_test = np.delete(clf.predict_proba(fv_test) ,1, axis=1).flatten() # score_train = np.append(score_train, [prob_train], axis=0) # score_test = np.append(score_test , [prob_test] , axis=0) # print 'execution time:{:.2f}s'.format(T.time()-tStart) #============================================================================== ########## SGD tStart = T.time() M = 'SGD' print 'FV_{}\tmix={}\tSGD'.format(FV, MIXTURE) for ALPHA in [0.0001]: for i, PENALTY in enumerate(['l2', 'l1', 'elasticnet']): for LR in ['optimal']: #,'constant','invscaling']: model.append(FV + '_' + M + str(PENALTY)) clf = SGD(penalty=PENALTY, alpha=ALPHA, learning_rate=LR) #, class_weight='balanced') clf.fit(fv_train, label_train) prob_train = clf.decision_function(fv_train) prob_test = clf.decision_function(fv_test) if (i == 0): # and (FV=='gen'): score_train = [prob_train] score_test = [prob_test] else: score_train = np.append(score_train, [prob_train], axis=0) score_test = np.append(score_test, [prob_test], axis=0) ### score_train = np.transpose(score_train) score_test = np.transpose(score_test) joblib.dump( score_train, FV_pth + 'score/' + str(MIXTURE) + '_' + FV + '_' +
#Combine both to fit the TFIDF vectorization. allData = trainData + testData lenTrain = len(trainData) tfv.fit(allData) allData = tfv.transform(allData) #Separate back into training and dev sets. train = allData[:lenTrain] test = allData[lenTrain:] #Regularization parameter sgdParams = {'alpha': [0.00006, 0.00007, 0.00008, 0.0001, 0.0005]} #Find out which regularization parameter works the best. modelSGD = GridSearchCV(SGD(random_state=0, shuffle=True, loss='modified_huber'), sgdParams, scoring='roc_auc', cv=20) #Fit the model. modelSGD.fit(train, trainSet['sentiment']) SGDResult = modelSGD.predict_proba(test)[:, 1] SGDOutput = pd.DataFrame(data={ "id": testSet['id'], "review": testSet['review'], "sentiment": SGDResult }) SGDOutput.to_excel("Result.xlsx", sheet_name='Result', index=False) print(modelSGD.best_score_)
# Notebook cells: baseline Naive Bayes, a single SGD fit, then a grid search
# over the SGD regularization strength.
NaiveBayes = MNB()
NaiveBayes.fit(X_train, Y_train)
print("Training acc:", NaiveBayes.score(X_train, Y_train), "\nValidation acc:", NaiveBayes.score(X_val, Y_val))
# Recorded results:
# Training acc: 0.99
# Validation acc: 0.9325

# SGD
# In[ ]:
sgd = SGD(max_iter=5, random_state=0,loss='modified_huber',n_jobs=4)
sgd.fit(X_train, Y_train)
print("Training acc:", sgd.score(X_train, Y_train), "\nValidation acc:", sgd.score(X_val, Y_val))
# Recorded results:
# Training acc: 0.995
# Validation acc: 0.88

# In[ ]:
# Search over alpha with 20-fold CV scored by ROC AUC; the `sgd` instance
# created above is passed as the base estimator.
parameters = {'alpha': [0.1, 0.5, 1, 1.5]}
sgd_search = GridSearchCV(sgd,parameters , scoring='roc_auc', cv=20)
sgd_search.fit(X_train, Y_train)
print("The best parameters: " + str(sgd_search.best_params_))
# Recorded result: The best parameters: {'alpha': 0.1}
# Toy example: fit an SGD classifier on four 2-D points with four distinct
# labels and inspect the prediction, the coefficients, the intercepts and
# the decision values.
from sklearn.linear_model import SGDClassifier as SGD
import matplotlib.pyplot as plt
import numpy as np

x = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 1, 2, 3]

# The `n_iter` argument is no longer passed: it was removed in scikit-learn
# 0.21, and the previous explicit value (None) was already its default, so
# `max_iter` alone controls the number of epochs.
clf = SGD(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
          eta0=0.0, fit_intercept=True, l1_ratio=0.15,
          learning_rate='optimal', loss='hinge', max_iter=5, n_jobs=1,
          penalty='l2', power_t=0.5, random_state=None, shuffle=True,
          tol=None, verbose=0, warm_start=False)
clf.fit(x, y)

print(clf.predict([[4, 4]]))
print(clf.coef_)
print(clf.intercept_)
print(clf.decision_function([[2, 2]]))
ngram_range=(1, 2)) # This is an example of our parameter grid that we would use for searching # It was found that sometimes having a really large parameter grid took too long # to be useful (example 700+ fits would take more than an 3 hours running on 8 cores) param_grid_2 = [{'vect__max_df': [.75, .5, .1, .075]}] # This is our pipeline to train the model with # We use a TfidfVectorizer and a Stochastic Gradient Descent Classifier # More information can be found in the write-up and on the GitHub README lr_tfidf = Pipeline([('vect', tfidf), ('clf', SGD(loss='modified_huber', alpha=0.00015, n_iter=np.ceil(10**6 / len(df['review'])), l1_ratio=0.05, penalty='l2', shuffle=False, learning_rate='optimal'))]) # Beginning of example for how we used GridSearchCV to find parameters # This is only here to showcase how we found parameters - it would use parameter_grid2 from above # gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid_2, # scoring='accuracy', # cv=5, # verbose=1, # n_jobs=6) # how many cores to run on - this gets all of them # # # # gs_lr_tfidf.fit(X_train, y_train)