def run_online_classifier():
    vect = HashingVectorizer(
        decode_error='ignore',
        n_features=2**21,
        preprocessor=None,
        tokenizer=tokenizer_streaming,
    )
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))
    clf = clf.partial_fit(X_test, y_test)
def predict_sgd(X_train, y_train, X_test, sample_weight):
    clf = SGDClassifier(loss='log', alpha=0.01, l1_ratio=0, n_jobs=2, n_iter=50)
    clf.fit(X_train, y_train, sample_weight=sample_weight)
    predictions = clf.predict_proba(X_test)
    return predictions
def add_sgd_class(self, word, example):
    self.clfColor = SGDClassifier(loss="log", penalty="l2")
    self.clfShape = SGDClassifier(loss="log", penalty="l2")
    X_Color = [example['Color']]
    y_Color = [word]
    X_Shape = [example['Shape']]
    y_Shape = [word]
    # collect positive examples from every previously learned (non-synonym) classifier
    for known_word in self.knownWords.keys():
        for classifier in self.knownWords[known_word]:
            if "Synonym" not in str(type(classifier)):
                for ex in classifier.positiveExamples:
                    if "Color" in classifier._type_:
                        X_Color.append(ex['Color'])
                        y_Color.append(known_word)
                    if "Shape" in classifier._type_:
                        X_Shape.append(ex['Shape'])
                        y_Shape.append(known_word)
    classes = np.unique(y_Color)
    self.clfColor.partial_fit(X_Color, y_Color, classes=classes)
    self.classColors = classes
    classes = np.unique(y_Shape)
    self.clfShape.partial_fit(X_Shape, y_Shape, classes=classes)
    self.classShapes = classes
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])  # expects a 2D array of samples
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
def sgd_classifier(V_train, y_train, V_val, y_val, V_test, y_test):
    t0 = time.time()
    print 'Building SGD model'
    clf = SGDClassifier(n_iter=50)
    #clf = grid_search.GridSearchCV(svm_clf, parameters)
    clf.fit(V_train, y_train)
    #print clf.best_params_
    t1 = time.time()
    print 'Building SGD model ... Done', str(int((t1 - t0) * 100) / 100.)
    print ''
    p_val = clf.predict(V_val)
    print 'Accuracy on validation set', accuracy_score(y_val, p_val)
    p_test = clf.predict(V_test)
    print 'Accuracy on testing set'
    print classification_report(y_test, p_test)
def test_underflow_or_overflow():
    with np.errstate(all="raise"):
        # Generate some weird data with hugely unscaled features
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 10

        X = rng.normal(size=(n_samples, n_features))
        X[:, :2] *= 1e300
        assert_true(np.isfinite(X).all())

        # Use MinMaxScaler to scale the data without introducing a numerical
        # instability (computing the standard deviation naively is not possible
        # on this data)
        X_scaled = MinMaxScaler().fit_transform(X)
        assert_true(np.isfinite(X_scaled).all())

        # Define a ground truth on the scaled data
        ground_truth = rng.normal(size=n_features)
        y = (np.dot(X_scaled, ground_truth) > 0.0).astype(np.int32)
        assert_array_equal(np.unique(y), [0, 1])

        model = SGDClassifier(alpha=0.1, loss="squared_hinge", n_iter=500)

        # smoke test: model is stable on scaled data
        model.fit(X_scaled, y)
        assert_true(np.isfinite(model.coef_).all())

        # model is numerically unstable on unscaled data
        msg_regxp = (r"Floating-point under-/overflow occurred at epoch #.*"
                     " Scaling input data with StandardScaler or MinMaxScaler"
                     " might help.")
        assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y)
def main():
    """ Generates features and fits classifier. """
    featureIndexes = processData(os.path.join(dataFolder, "avito_train.tsv"), itemsLimit=300000)
    trainFeatures, trainTargets, trainItemIds = processData(os.path.join(dataFolder, "avito_train.tsv"), featureIndexes, itemsLimit=300000)
    testFeatures, testItemIds = processData(os.path.join(dataFolder, "avito_test.tsv"), featureIndexes)
    joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds),
                os.path.join(dataFolder, "train_data.pkl"))
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(os.path.join(dataFolder, "train_data.pkl"))
    logging.info("Feature preparation done, fitting model...")

    clf = SGDClassifier(loss="log", penalty="l2", alpha=1e-4, class_weight="auto")
    clf.fit(trainFeatures, trainTargets)

    logging.info("Predicting...")
    predicted_scores = clf.predict_proba(testFeatures).T[1]

    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    f = open(os.path.join(dataFolder, output_file), "w")
    f.write("id\n")
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
        f.write("%d\n" % (item_id))
    f.close()
    logging.info("Done.")
def test_create_model(self):
    print("labeled sentence worked?")
    x_train = labelizeReviews(self.xTrain, 'TRAIN')
    x_test = labelizeReviews(self.xTest, 'TEST')
    model_dm = gensim.models.Doc2Vec(min_count=1, window=5, size=self.size,
                                     sample=1e-3, negative=5, workers=3)
    model_dbow = gensim.models.Doc2Vec(min_count=1, window=6, size=self.size,
                                       sample=1e-3, negative=5, dm=0, workers=3)
    sentences = x_train
    model_dm.build_vocab(sentences)
    model_dbow.build_vocab(sentences)
    for epoch in range(10):
        print("Starting epoch:", str(epoch))
        # shuffle the sentences on each pass
        model_dm.train(random.sample(sentences, len(sentences)))
        model_dbow.train(random.sample(sentences, len(sentences)))
    train_vecs = getVecs(model_dm, x_train, self.size)
    train_vecs_dbow = getVecs(model_dbow, x_train, self.size)
    train_vecs_total = np.hstack((train_vecs, train_vecs_dbow))

    sentences = x_test
    for epoch in range(10):
        print("Starting epoch:", str(epoch))
        model_dm.train(random.sample(sentences, len(sentences)))
        model_dbow.train(random.sample(sentences, len(sentences)))
    # build vectors from the test reviews
    test_vecs = getVecs(model_dm, x_test, self.size)
    test_vecs_dbow = getVecs(model_dbow, x_test, self.size)
    test_vecs_total = np.hstack((test_vecs, test_vecs_dbow))

    lr = SGDClassifier(loss='log', penalty='l1')
    lr.fit(train_vecs_total, self.labelsTrain[:self.samples])
    print('Test Accuracy: %.2f' % lr.score(test_vecs_total, self.labelsTest[:self.samples]))
def classify(dummy_train, dummy_test, feature_pkl, output_file):
    # Train classifier, iterating over subsets
    # Load Features
    print 'Loading features...'
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    trainTargets = np.array(trainTargets)
    testItemIds = np.array(testItemIds)
    predicted_ids = []
    predicted_scores = []
    # SGD Logistic Regression per sample
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
                        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                        learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
                        penalty='elasticnet', power_t=0.5, random_state=None,
                        shuffle=False, verbose=0, warm_start=False)
    for col in range(np.shape(dummy_train)[1]):
        # Get nonzero dummy indices as array
        idx_train = dummy_train[:, col].astype('bool').T.toarray()[0]
        print 'Training subset {} of {}...'.format(col, np.shape(dummy_train)[1])
        sub_train = normalize(trainFeatures.tocsr()[idx_train, :], norm='l2', axis=0)
        clf.fit(sub_train, trainTargets[idx_train])
        # Use probabilities instead of binary class prediction in order to generate a ranking
        idx_test = dummy_test[:, col].astype('bool').T.toarray()[0]
        sub_test = normalize(testFeatures.tocsr()[idx_test, :], norm='l2', axis=0)
        predicted_scores += clf.predict_proba(sub_test).T[1].tolist()
        predicted_ids += testItemIds[idx_test].tolist()
    with open(os.path.splitext(feature_pkl)[0] + '_' + output_file, 'w') as out_fid:
        out_fid.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, predicted_ids), reverse=True):
            # only writes item_id per output spec, but may want to look at predicted_scores
            out_fid.write("%d\n" % (item_id))
class LightModel:

    def __init__(self, learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        # Init scikit models
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate,
                                        n_iter=numEpochs, shuffle=mustShuffle)

    def train(self, gen, v=False):
        i = 0
        for x, y in gen:  # For each batch
            self.Classifier.partial_fit(x, y, [0, 1])
            i += len(x)
            if v:
                print(str(datetime.now())[:-7], "example:", i)

    def test(self, gen, v=False):
        # init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        # Get prediction for each batch
        i = 0
        for x, y in gen:
            p = self.Classifier.predict_proba(x)
            p = p.T[1].T  # Keep column corresponding to probability of class 1
            # Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y))
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v:
                print(str(datetime.now())[:-7], "example:", i)
        if v:
            print("Score:", self.score(ytot, ptot))
        return (ytot, ptot)

    def score(self, target, prediction):
        return llfun(target, prediction)
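# Usage sketch for LightModel above; the toy batch generator is an
# illustrative assumption, not part of the original code. Any generator
# yielding (features, labels) batches with labels in {0, 1} works, since
# train()/test() only iterate over it.
import numpy as np

def toy_batches(n_batches=5, batch_size=100, seed=0):
    rng = np.random.RandomState(seed)
    for _ in range(n_batches):
        x = rng.normal(size=(batch_size, 10))
        y = (x[:, 0] > 0).astype(int)  # label depends on the first feature
        yield x, y

model = LightModel(learningRate=1e-4, numEpochs=5)
model.train(toy_batches())
targets, preds = model.test(toy_batches(seed=1))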
def predict_domains_for_documents(test_domain=CORE_DOMAINS[0], avg=True):
    X, y, vectorizer = _get_study_level_X_y(test_domain=test_domain)
    score_f = lambda y_true, y_pred: metrics.precision_recall_fscore_support(
        y_true, y_pred, average=None)  # , average="macro")
    # score_f = sklearn.metrics.f1_score

    # note that asarray call below, which seems necessary for
    # reasons that escape me (see here
    # https://github.com/scikit-learn/scikit-learn/issues/2508)
    clf = SGDClassifier(loss="hinge", penalty="l2", alpha=0.01)
    # pdb.set_trace()
    cv_res = cross_validation.cross_val_score(
        clf, X, np.asarray(y),
        score_func=score_f,  # sklearn.metrics.precision_recall_fscore_support,
        cv=5)
    # pdb.set_trace()
    if avg:
        cv_res = sum(cv_res) / float(cv_res.shape[0])
    # metrics.precision_recall_fscore_support
    # if dump_output:
    #     np.savetxt(test_domain.replace(" ", "_") + ".csv", cv_res, delimiter=',', fmt='%2.2f')
    print cv_res

    ### train on all
    model = clf.fit(X, y)
    informative_features = show_most_informative_features(vectorizer, model, n=50)
    return (cv_res, informative_features, y)
def classify_reviews():
    import featurizer
    import gen_training_data
    import numpy as np
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier

    data = gen_training_data.gen_data()
    stemmed_data = featurizer.stem(data)
    tfidf = featurizer.tfidf(data)

    clf = MultinomialNB().fit(tfidf['train_tfidf'], data['training_labels'])
    predicted = clf.predict(tfidf['test_tfidf'])
    num_wrong = 0
    for expected, guessed in zip(data['testing_labels'], predicted):
        if expected - guessed != 0:
            num_wrong += 1
    print "num_wrong: %d" % num_wrong

    sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
    sgd_clf.fit(tfidf['train_tfidf'], data['training_labels'])
    sgd_pred = sgd_clf.predict(tfidf['test_tfidf'])
    print np.mean(sgd_pred == data['testing_labels'])

    stem_tfidf = featurizer.tfidf(stemmed_data)
    sgd_clf.fit(stem_tfidf['train_tfidf'], data['training_labels'])
    sgd_stem_prd = sgd_clf.predict(stem_tfidf['test_tfidf'])
    print np.mean(sgd_stem_prd == data['testing_labels'])
def buildModel(size):
    with open('Sentiment Analysis Dataset.csv', 'rb') as csvfile:
        pos_tweets = []
        neg_tweets = []
        spamreader = csv.reader(csvfile, delimiter=',')
        for row in spamreader:
            if row[1] == '1':
                if not (len(pos_tweets) > size):
                    pos_tweets.append(_cleanTweet(row[3]))
            else:
                if not (len(neg_tweets) > size):
                    neg_tweets.append(_cleanTweet(row[3]))
    y = np.concatenate((np.ones(len(pos_tweets[0:size])), np.zeros(len(neg_tweets[0:size]))))
    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos_tweets[0:size], neg_tweets[0:size])), y, test_size=0.2)
    x_train = _cleanText(x_train)
    x_test = _cleanText(x_test)
    n_dim = 100
    # Initialize model and build vocab
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)
    imdb_w2v.train(x_train)
    train_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_train])
    train_vecs = scale(train_vecs)
    # Train word2vec on test tweets
    imdb_w2v.train(x_test)
    # Build test tweet vectors then scale
    test_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_test])
    test_vecs = scale(test_vecs)
    lr = SGDClassifier(loss='log', penalty='l1')
    lr.fit(train_vecs, y_train)
    imdb_w2v.save("imdb_w2v")
    f = open("Accuracy.txt", "w")
    f.write(str(lr.score(test_vecs, y_test)) + " " + str(size * 2))
    f.close()
def do_classify():
    corpus = MyCorpus()
    # tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    # corpus_lsi = lsi_model[corpus_idf]
    num_terms = len(corpus.dictionary)
    # num_terms = 400
    corpus_sparse = matutils.corpus2csc(corpus_idf, num_terms).transpose(copy=False)
    # print corpus_sparse.shape
    # corpus_dense = matutils.corpus2dense(corpus_idf, len(corpus.dictionary))
    # print corpus_dense.shape
    penalty = "l2"
    clf = SGDClassifier(loss="hinge", penalty=penalty, alpha=0.0001,
                        n_iter=50, fit_intercept=True)
    # clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    y = np.array(corpus.cls_y)
    # print y.shape
    clf.fit(corpus_sparse, y)
    filename = os.path.join(HERE, "sgdc_clf.pkl")
    _ = joblib.dump(clf, filename, compress=9)
    print "training completed"

    X_test = []
    X_label = []
    for obj in SogouCorpus.objects.filter(id__in=corpus.test_y):
        X_test.append(obj.tokens)
        X_label.append(cls_ids[obj.classify])
        # result = classifier.predict(obj.tokens)
    test_corpus = [dictionary.doc2bow(s.split(",")) for s in X_test]
    test_corpus = tfidf_model[test_corpus]
    test_corpus = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    pred = clf.predict(test_corpus)
    score = metrics.f1_score(X_label, pred)
    print "f1-score: %0.3f" % score
def test_transformer(transformer, data_set, configuration):
    clf = SGDClassifier(alpha=0.005)
    samples = []
    labels = range(10)
    for epoch in range(configuration.hyper_parameters.epochs):
        for index, sample in enumerate(transformer.compute_outputs(data_set.trainset[0], data_set.trainset[1], 1)):
            samples.append(sample.reshape((1, sample.shape[0])))
            if index % 10 == 9:
                clf.partial_fit(samples, labels, labels)
                samples = []
        gc.collect()
    error = 0
    count = 0
    test_predictions = []
    for index, sample in enumerate(transformer.compute_outputs(data_set.testset[0], data_set.testset[1], 1)):
        prediction = clf.predict(sample)
        if not prediction == index % 10:
            error += 1
        count += 1
        test_predictions.append(prediction)
    OutputLog().write('test predictions weight: {0}'.format(test_predictions))
    # report the error as a percentage of all test samples
    OutputLog().write('\nerror: %f%%\n' % (100.0 * error / count))
def SGD(x, y):
    # Using Stochastic Gradient Descent from sklearn
    from sklearn.linear_model import SGDClassifier
    clf = SGDClassifier()
    clf.fit(x, y)
    return clf.predict(x)
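# Minimal usage sketch for SGD() above; the toy arrays are illustrative
# assumptions. Note the helper returns in-sample predictions, so this only
# checks that the model can fit its own training data.
import numpy as np

x = np.array([[0.0, 0.0], [0.2, 0.1], [1.0, 1.0], [1.1, 0.9]])
y = np.array([0, 0, 1, 1])
print(SGD(x, y))  # e.g. [0 0 1 1]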
class twoclass(SGDClassifier):
    # THE HACK IS NOW GETTING EVEN MORE EVIL

    def __init__(self):
        self.clazz = SGDClassifier(loss='log')

    def fit(self, X, y, crossval=False):
        if crossval:
            print "layers crossvalscore:", sklearn.model_selection.cross_val_score(
                SGDClassifier(loss='log'), X, y).mean()
        self.clazz.fit(X, y)
        self.intercept_ = self.clazz.intercept_
        self.classes_ = self.clazz.classes_
        return self

    # eden can't annotate two classes if the estimator is not an SGDRegressor
    # -> this hack is made!
    '''
    details: decision_function returns a one-d array. eden only accepts these
    if the estimator is an instance of SGDRegressor, so I make a two-d array
    from my 1-d array. if I hack something like this in the future, maybe the
    intercept array needs to be provided.. (see the annotator code)
    '''
    # default guy:
    # def decision_function(self, vector):
    #     answer = super(self.__class__, self).decision_function(vector)
    #     return np.vstack((answer, (answer - 1))).T

    def decision_function(self, vector):
        return self.clazz.predict_proba(vector)
class SGDRanker(BaseEstimator):
    """
    Ranking predictor using stochastic gradient descent

    TODO:
        - allow configurable parameters for classifier
        - seed random state
    """

    def __init__(self, seconds=10):
        self.clf = SGDClassifier(loss='hinge')
        self.clf.fit_intercept = False
        self.clf.classes_ = np.array([-1, 1])
        self.seconds = seconds

    def fit(self, X, y):
        rows = X.shape[0]
        start_time = time.time()
        for i in itertools.count():
            if time.time() - start_time > self.seconds:
                return self
            idx1 = random.randint(0, rows - 1)
            idx2 = random.randint(0, rows - 1)
            y1, y2 = y[idx1], y[idx2]
            if y1 == y2:
                continue
            # learn the pairwise preference as a single-sample update
            self.clf.partial_fit((X[idx1] - X[idx2]).reshape(1, -1),
                                 [np.sign(y1 - y2)])

    def predict(self, X):
        return np.dot(X, self.clf.coef_.T)
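# Quick exercise of SGDRanker, assuming dense feature rows and real-valued
# relevance scores; the toy data below is an illustrative assumption. The
# pairwise updates should recover a weight vector whose scores correlate
# with the true relevance.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(200, 5))
y_demo = X_demo.dot(np.array([1.0, 0.5, 0.0, -0.5, -1.0]))  # relevance scores

ranker = SGDRanker(seconds=2).fit(X_demo, y_demo)
scores = ranker.predict(X_demo)
print(np.corrcoef(scores.ravel(), y_demo)[0, 1])  # should be strongly positive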
def main(date):
    """
    Runs linear regression (classification) between the herbicide resistance
    classes based on all wavelengths. The weights associated with each
    wavelength are then plotted, allowing the user to see the contribution to
    classification by each wavelength.

    :param date: (string) Data collection date YYYY_MMDD

    :return: (None)
    """
    # Load the training data from disk
    X, y = FileIO.loadTrainingData(date)
    X = np.nan_to_num(X)

    # Train the classifier on the loaded data
    clf = SGDClassifier()
    clf.fit(X, y)

    # Plot the feature weights to visualize feature contributions
    featureWeights = np.fabs(clf.coef_)
    for i in xrange(3):
        plt.plot(WAVELENGTHS, featureWeights[i])
        plt.title("Linear Classifier Weights for " + RESISTANCE_STRINGS[INDEX_TO_LABEL[i]] + " vs Others")
        plt.xlabel("Wavelength (nm)")
        plt.ylabel("Absolute Weight")
        plt.show()
def train_model(t):
    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
    x_train, y_train = split_x_y(t)
    return clf.fit(x_train, y_train)
def train_vectorized(feats, Y, model_path=None, grid=False):
    # Vectorize labels
    labels = [labels_map[y] for y in Y]
    Y = np.array(labels)

    # Vectorize feature dictionary
    vec = DictVectorizer()
    X = vec.fit_transform(feats)
    norm_mat(X, axis=0, copy=False)

    # Grid Search
    if grid:
        print 'Performing Grid Search'
        clf = do_grid_search(X, Y)
    else:
        #clf = LinearSVC(C=0.1, class_weight='auto')
        #clf = LogisticRegression(C=0.1, class_weight='auto')
        clf = SGDClassifier(penalty='elasticnet', alpha=0.001, l1_ratio=0.85,
                            n_iter=1000, class_weight='auto')
        clf.fit(X, Y)

    # Save model
    if model_path:
        with open(model_path + '.dict', 'wb') as f:
            pickle.dump(vec, f)
        with open(model_path + '.model', 'wb') as f:
            pickle.dump(clf, f)

    # return model
    return vec, clf
def runSGDPipeline(entries, langs):
    t0 = time()
    sgd_pipeline = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1), max_features=n_features)),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001,
                              n_iter=5, random_state=42))])
    # fit the pipeline as well, so the returned object is usable
    sgd_pipeline.fit(entries, langs)

    # the same steps, applied manually
    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)

    clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001,
                        n_iter=5, random_state=42)
    clf.fit(X_train_tfidf, langs)

    # note: this evaluates on the training entries themselves
    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return sgd_pipeline
def train(docs, labels, regu=1, bg_weight=.1):
    '''
    :param docs: iterator of (title, body) pairs
    :param labels: integer labels for docs (0 is weakly-negative)
    :return: model
    '''
    num_topics = 50
    feas = map(extract_words, docs)
    labels = np.array(list(labels), dtype=int)
    idf = train_idf(feas)
    X, vocab = extract_feas(feas, idf)
    #lda = train_lda(X, vocab, num_topics)
    #X = transform_lda(X, lda)
    # set up sample weights
    weights = balance_weights(labels, bg_weight)
    labels = labels.copy()
    labels[labels == 0] = 1
    model = SGDClassifier(loss='log', alpha=regu / len(labels),
                          fit_intercept=True, n_iter=100, shuffle=True)
    model.fit(X, labels, sample_weight=weights)
    #print accuracy(labels, model.predict(X))
    return dict(idf=idf, logreg=model, lda=None)
class kernelsvm():

    def __init__(self, theta0, alpha, loss_metric):
        self.theta0 = theta0
        self.alpha = alpha
        self.loss_metric = loss_metric

    def fit(self, X, y, idx_SR):
        n_SR = len(idx_SR)
        self.feature_map_nystroem = General_Nystroem(kernel='rbf', gamma=self.theta0,
                                                     n_components=n_SR)
        X_features = self.feature_map_nystroem.fit_transform(X, idx_SR)
        print("fitting SGD")
        self.clf = SGDClassifier(loss=self.loss_metric, alpha=self.alpha)
        self.clf.fit(X_features, y)
        print("fitting SGD finished")

    def predict(self, X):
        print("Predicting")
        X_transform = self.feature_map_nystroem.transform(X)
        return self.clf.predict(X_transform), X_transform

    def decision_function(self, X):
        # X should be the transformed input!
        return self.clf.decision_function(X)

    def err_rate(self, y_true, y_pred):
        acc = accuracy_score(y_true, y_pred)
        err_rate = 1.0 - acc
        return err_rate

    def get_params(self):
        return self.clf.get_params()
def validate():
    """ Runs a 10-fold cross validation on the classifier, reporting accuracy. """
    trainDf = pd.read_csv("../NewData/train.csv")
    X = np.matrix(pd.DataFrame(trainDf, index=None,
                               columns=["invited", "user_reco", "evt_p_reco", "evt_c_reco",
                                        "user_pop", "frnd_infl", "evt_pop"]))
    y = np.array(trainDf.interested)
    nrows = len(trainDf)
    kfold = KFold(nrows, 10)
    avgAccuracy = 0
    run = 0
    for train, test in kfold:
        Xtrain, Xtest, ytrain, ytest = X[train], X[test], y[train], y[test]
        clf = SGDClassifier(loss="log", penalty="l2")
        clf.fit(Xtrain, ytrain)
        accuracy = 0
        ntest = len(ytest)
        for i in range(0, ntest):
            yt = clf.predict(Xtest[i, :])
            if yt == ytest[i]:
                accuracy += 1
        # float division, so the ratio is not truncated to 0 under Python 2
        accuracy = float(accuracy) / ntest
        print "accuracy (run %d): %f" % (run, accuracy)
        avgAccuracy += accuracy
        run += 1
    print "Average accuracy", (avgAccuracy / run)
def run_SGD(X, y, n_tr, n_te):
    X_tr, y_tr, X_te, y_te = X[:n_tr], y[:n_tr], X[-n_te:], y[-n_te:]
    # 'hinge' and 'log' are loss functions, not penalties
    losses = ['hinge', 'log']
    for loss in losses:
        model = SGDClassifier(loss=loss, penalty=None, n_iter=100).fit(X_tr, y_tr)
        print 'Training, validation accuracy is %6.4f and %6.4f for %s loss' % \
            (model.score(X_tr, y_tr), model.score(X_te, y_te), loss)
def plot_sgd_classifier(num_samples, clt_std):
    # generation of data
    X, y = make_blobs(n_samples=num_samples, centers=2, cluster_std=clt_std)

    # fitting of data using logistic regression
    clf = SGDClassifier(loss='log', alpha=0.01)
    clf.fit(X, y)

    # plotting of data
    x_ = np.linspace(min(X[:, 0]), max(X[:, 0]), 10)
    y_ = np.linspace(min(X[:, 1]), max(X[:, 1]), 10)

    X_, Y_ = np.meshgrid(x_, y_)
    Z = np.empty(X_.shape)

    for (i, j), val in np.ndenumerate(X_):
        x1 = val
        x2 = Y_[i, j]
        conf_score = clf.decision_function([[x1, x2]])  # expects a 2D array of samples
        Z[i, j] = conf_score[0]

    levels = [-1.0, 0, 1.0]
    colors = 'k'
    linestyles = ['dashed', 'solid', 'dashed']

    ax = plt.axes()
    plt.xlabel('X1')
    plt.ylabel('X2')

    ax.contour(X_, Y_, Z, colors=colors, levels=levels, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=y)
def train_stochasticGradientDescent(X, y, loss='hinge', penalty='l2', alpha=0.0001,
                                    l1_ratio=0.15, fit_intercept=True, n_iter=5,
                                    shuffle=True, verbose=0, epsilon=0.1, n_jobs=1,
                                    random_state=None, learning_rate='optimal',
                                    eta0=0.0, power_t=0.5, class_weight=None,
                                    warm_start=False, average=False):
    clf = SGDClassifier(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio,
                        fit_intercept=fit_intercept, n_iter=n_iter, shuffle=shuffle,
                        verbose=verbose, epsilon=epsilon, n_jobs=n_jobs,
                        random_state=random_state, learning_rate=learning_rate,
                        eta0=eta0, power_t=power_t, class_weight=class_weight,
                        warm_start=warm_start, average=average)
    clf = clf.fit(X, y)
    return clf
def stochasticGD(input_file, Output, test_size):
    lvltrace.lvltrace("LVL Entering stochasticGD split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print "Stochastic Gradient Descent"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output + "Stochastic_GD_metrics_test.txt"
    file = open(results, "w")
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n + 1)))
    file.close()
    title = "Stochastic Gradient Descent %f" % test_size
    save = Output + "Stochastic_GD_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
def crossvalidate(feas, labels, param):
    labels = np.array(list(labels), dtype=int)
    accs = []
    for train_ids, valid_ids in StratifiedKFold(labels, 10):
        idf = train_idf([feas[i] for i in train_ids])
        X, vocab = extract_feas(feas, idf)
        #lda = train_lda(X, vocab, num_topics)
        #X = transform_lda(X, lda)
        labels_train = labels[train_ids].copy()
        weights = balance_weights(labels_train, param['bg_weight'])
        labels_train[labels_train == 0] = 1
        model = SGDClassifier(loss='log', alpha=param['regu'] / len(labels_train),
                              fit_intercept=True, shuffle=True, n_iter=50)
        model.fit(X[train_ids], labels_train, sample_weight=weights)
        pp = model.predict_proba(X[valid_ids])
        pred_labels = np.argmax(pp, 1)
        pred_labels = model.classes_[pred_labels]
        #a = accuracy(labels[valid_ids], pred_labels, 1)
        # return all scores for the "good" class
        assert model.classes_[1] == 2
        pred_scores = pp[:, 1]
        a = avg_precision(labels[valid_ids], pred_scores)
        print '%.2f' % a,
        accs.append(a)
    return np.mean(accs)
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True, help="path to input dataset")
args = vars(ap.parse_args())

# Get list of image paths
imagePaths = list(paths.list_images(args['dataset']))

# initialize the image preprocessor, load the dataset from disk,
# and reshape the data matrix
sp = SimplePreprocessor(32, 32)
sdl = SimpleDatasetLoader(preprocessors=[sp])
(data, labels) = sdl.load(imagePaths, verbose=500)
data = data.reshape((data.shape[0], 3072))

le = LabelEncoder()
labels = le.fit_transform(labels)

(trainX, testX, trainY, testY) = train_test_split(data, labels,
                                                  test_size=0.25, random_state=5)

# loop over our set of regularizers
for r in (None, "l1", "l2"):
    print("[INFO] training model with '{}' penalty".format(r))
    model = SGDClassifier(loss="log", penalty=r, max_iter=10,
                          learning_rate="constant", eta0=0.01, random_state=42)
    model.fit(trainX, trainY)

    # evaluate the classifier
    acc = model.score(testX, testY)
    print("[INFO] '{}' penalty accuracy: {:.2f}%".format(r, acc * 100))
def main(input_file):
    batch_size = 128
    nb_classes = 62  # A-Z, a-z and 0-9
    nb_epoch = 2

    # Input image dimensions
    img_rows, img_cols = 32, 32

    # Path of data files
    path = input_file

    def convert_(Y):
        alpha = string.letters
        dig = string.digits
        alphaList = []
        for elem in (alpha + dig):
            alphaList.append(elem)
        list_ = []
        for elem in Y:
            for i in range(0, elem.shape[0]):
                if elem[i] == 1:
                    list_.append(i)
        list_ = np.asarray(list_)
        return list_

    # Load the preprocessed data and labels
    X_train_all = np.load(path + "/trainPreproc_" + str(img_rows) + "_" + str(img_cols) + ".npy")
    Y_train_all = np.load(path + "/labelsPreproc.npy")

    X_train, X_val, Y_train, Y_val = \
        train_test_split(X_train_all, Y_train_all, test_size=0.25,
                         stratify=np.argmax(Y_train_all, axis=1))
    print X_train.shape
    labels = convert_(Y_train)
    validation = convert_(Y_val)
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[2] * X_train.shape[3]))
    X_val = X_val.reshape((X_val.shape[0], X_val.shape[2] * X_val.shape[3]))

    print 'Training and Testing...'

    clf_rf = RandomForestClassifier()
    clf_rf.fit(X_train, labels)
    y_pred_rf = clf_rf.predict(X_val)
    acc_rf = accuracy_score(validation, y_pred_rf)
    print "random forest accuracy: ", acc_rf

    clf_sgd = SGDClassifier()
    clf_sgd.fit(X_train, labels)
    y_pred_sgd = clf_sgd.predict(X_val)
    acc_sgd = accuracy_score(validation, y_pred_sgd)
    print "stochastic gradient descent accuracy: ", acc_sgd

    clf_svm = LinearSVC()
    clf_svm.fit(X_train, labels)
    y_pred_svm = clf_svm.predict(X_val)
    acc_svm = accuracy_score(validation, y_pred_svm)
    print "Linear SVM accuracy: ", acc_svm

    clf_knn = KNeighborsClassifier()
    clf_knn.fit(X_train, labels)
    y_pred_knn = clf_knn.predict(X_val)
    acc_knn = accuracy_score(validation, y_pred_knn)
    print "nearest neighbors accuracy: ", acc_knn

    clf_nn = DBN([X_train.shape[1], 300, 62], learn_rates=0.0240,
                 learn_rate_decays=0.9, epochs=130)
    clf_nn.fit(X_train, labels)
    acc_nn = clf_nn.score(X_val, validation)
    print "neural network accuracy: ", acc_nn

    clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
    clf.fit(X_train, labels)
    acc_mnb = clf.score(X_val, validation)
    print "naive bayes: ", acc_mnb

    clf = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
    clf.fit(X_train, labels)
    acc_bnb = clf.score(X_val, validation)
    print "bernoulli naive bayes: ", acc_bnb
else:
    print 'No training data, please re-enter\n'
    sys.exit('Program exit')

if test is not None:
    ingredientMatrix, testIDs = createMatrix(test, allIngredients)
    sgdtest, sgdtestIDs = sgdreadTest(test)
else:
    print 'No test data, please re-enter\n'
    sys.exit('Program exit')

secondLayerInput = firstLayerReader(m1, m2, m3, m4)
labels = labelReader(tlabel)

# Train the grid search classifier
clf = SGDClassifier()
parameters = {
    # 'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    # 'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'alpha': (0.00001, 0.000001),
    'penalty': ('l2', 'elasticnet', 'l1'),
    'n_iter': (10, 50),
}
grid_search = GridSearchCV(clf, parameters, n_jobs=-1, verbose=1)
grid_search.fit(secondLayerInput, labels)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
def eval(final_emb, labels, splits, random_state=42, clf=['mlp', 'sgd', 'lr', 'svm']):
    scaler = StandardScaler()
    X_train = []
    y_train = []
    X_val = []
    y_val = []
    X_test = []
    y_test = []
    for node, emb in final_emb.items():
        if splits[node] == 1:
            X_train.append(emb)
            y_train.append(labels[node])
        elif splits[node] == 2:
            X_val.append(emb)
            y_val.append(labels[node])
        elif splits[node] == 3:
            X_test.append(emb)
            y_test.append(labels[node])
    X_train = np.stack(X_train)
    y_train = np.array(y_train)
    X_val = np.stack(X_val)
    y_val = np.array(y_val)
    X_test = np.stack(X_test)
    y_test = np.array(y_test)

    scaler.fit(np.vstack([X_train, X_val, X_test]))
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    if 'mlp' in clf:
        print("MLPClassifier")
        lr = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(64,), max_iter=5000)
        lr.fit(X_train, y_train)
        print(lr.score(X_train, y_train))
        print(lr.score(X_val, y_val))
        print(lr.score(X_test, y_test))
    if 'lr' in clf:
        print("LogisticRegression")
        lr = LogisticRegression(multi_class='multinomial', max_iter=5000)
        lr.fit(X_train, y_train)
        print(lr.score(X_train, y_train))
        print(lr.score(X_val, y_val))
        print(lr.score(X_test, y_test))
    if 'sgd' in clf:
        print("SGDClassifier")
        lr = SGDClassifier(max_iter=5000, tol=1e-3)
        lr.fit(X_train, y_train)
        print(lr.score(X_train, y_train))
        print(lr.score(X_val, y_val))
        print(lr.score(X_test, y_test))
    if 'svm' in clf:
        print("SVC")
        lr = SVC(gamma='auto', max_iter=5000)
        lr.fit(X_train, y_train)
        print(lr.score(X_train, y_train))
        print(lr.score(X_val, y_val))
        print(lr.score(X_test, y_test))
    if 'kmean' in clf:
        X = np.vstack([X_train, X_val, X_test])
        y = np.concatenate([y_train, y_val, y_test])
        kmean_eval(X, y)
100)

save_classifier = open("PICKLE FILES/pickled_algos_LogisticRegression_classifier5k.pickle", "wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:",
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

save_classifier = open("PICKLE FILES/pickled_algos_LinearSVC_classifier5k.pickle", "wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

##NuSVC_classifier = SklearnClassifier(NuSVC())
##NuSVC_classifier.train(training_set)
##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:",
      nltk.classify.accuracy(SGDC_classifier, testing_set) * 100)

save_classifier = open("PICKLE FILES/pickled_algos_SGDC_classifier5k.pickle", "wb")
pickle.dump(SGDC_classifier, save_classifier)
save_classifier.close()
# G. Richards 2016, based on sgd_separator.py by Jake Vanderplas

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.datasets.samples_generator import make_blobs

# we create 50 separable points
X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)

# fit the model
clf = SGDClassifier(loss="hinge", alpha=0.01, fit_intercept=True)
clf.fit(X, Y)

# plot the line, the points, and the nearest vectors to the plane
xx = np.linspace(-1, 5, 10)
yy = np.linspace(-1, 5, 10)

X1, X2 = np.meshgrid(xx, yy)
Z = np.empty(X1.shape)
for (i, j), val in np.ndenumerate(X1):
    x1 = val
    x2 = X2[i, j]
    #p = clf.decision_function([x1, x2])
    p = clf.decision_function(np.array([x1, x2]).reshape(1, -1))
    Z[i, j] = p[0]
levels = [-1.0, 0.0, 1.0]
linestyles = ['dashed', 'solid', 'dashed']
colors = 'k'
#ax = plt.axes()
            float(matchObj.group(4))])
        info_list = np.concatenate((info_list, new_value), axis=0)
        new_price = np.matrix([float(matchObj.group(5))])
        price_matrix = np.concatenate((price_matrix, new_price), axis=0)
        house_list.append(house)

X = info_list
y = price_matrix.transpose()
list_y = np.array(y)[0].tolist()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)  # Don't cheat - fit only on training data

clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X, list_y)
SGDClassifier(alpha=0.01, average=True, class_weight=None, epsilon=0.1,
              eta0=0.0, fit_intercept=True, l1_ratio=0.15,
              learning_rate='optimal', loss='hinge', n_iter=1000, n_jobs=1,
              penalty='l2', power_t=0.5, random_state=None,
pipe = Pipeline(steps=[('scaler', MinMaxScaler()),
                       ('clf', LogisticRegression(random_state=42))])

from sklearn.pipeline import make_pipeline
pipe = make_pipeline(MinMaxScaler(), LogisticRegression(random_state=42, max_iter=1000))
pipe.fit(X_train, y_train)
accuracy = pipe.score(X_test, y_test)
accuracy

pipe.get_params()

# Exercise
from sklearn.linear_model import SGDClassifier
pipe = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000))
pipe.fit(X_train_b, y_train_b)
y_pred = pipe.predict(X_test_b)
accuracy = balanced_accuracy_score(y_test_b, y_pred)
accuracy

# interactions and polynomials
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures

boston = load_boston()
X_train2, X_test2, y_train2, y_test2 = train_test_split(boston.data, boston.target,
                                                        random_state=0)  # 75/25% split
scaler = MinMaxScaler()
X_train2_scaled = scaler.fit_transform(X_train2)
X_test2_scaled = scaler.transform(X_test2)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

from fit_tune_function import fit_tune_store_sgdcv

clf_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    'vect__ngram_range': [
        (1, 1),
        (1, 2),
        (1, 3),
        (1, 4),
    ],
    'tfidf__use_idf': (True, False),
    'clf__random_state': (0, ),
    'clf__alpha': (
        1e-2,
        1e-3,
        1e-4,
        0.1,
        1e-6,
    ),
    'clf__max_iter': (2, 5, 10, 20, 100, 200),
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression

heldout = [0.95, 0.90, 0.75, 0.50, 0.01]
rounds = 20
digits = datasets.load_digits()
X, y = digits.data, digits.target

classifiers = [
    ("SGD", SGDClassifier(max_iter=100, tol=1e-3)),
    ("ASGD", SGDClassifier(average=True, max_iter=100, tol=1e-3)),
    ("Perceptron", Perceptron(tol=1e-3)),
    ("Passive-Aggressive I", PassiveAggressiveClassifier(loss='hinge', C=1.0, tol=1e-4)),
    ("Passive-Aggressive II", PassiveAggressiveClassifier(loss='squared_hinge', C=1.0, tol=1e-4)),
    ("SAG", LogisticRegression(solver='sag', tol=1e-1, C=1.e4 / X.shape[0])),
]

xx = 1. - np.array(heldout)

for name, clf in classifiers:
from sklearn.linear_model import SGDClassifier

X = [[0., 0.], [1., 1.]]
y = [0, 1]
clf = SGDClassifier(loss="log", penalty="l2")
classifier = clf.fit(X, y)

X1 = [[0.01, 0.02], [1.5, 1.5]]
res = classifier.predict(X1)
print res

X_p1 = [[0.01, 0.02], [1.5, 1.5]]
y_p1 = [1, 1]
classifier.partial_fit(X_p1, y_p1)
res2 = classifier.predict(X1)
print res2
import numpy as np
from time import time
import scipy.stats as stats

from sklearn.utils.fixes import loguniform
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier

# get some data
X, y = load_digits(return_X_y=True)

# build a classifier
clf = SGDClassifier(loss='hinge', penalty='elasticnet', fit_intercept=True)

# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# specify parameters and distributions to sample from
norm="l2", tokenizer=lambda x: x.split(), sublinear_tf=False, ngram_range=(1, 3)) x_train_multilabel = vectorizer.fit_transform(x_train['question']) x_test_multilabel = vectorizer.transform(x_test['question']) print("Time taken to run this cell :", datetime.now() - start) print("Dimensions of train data X:", x_train_multilabel.shape, "Y :", y_train.shape) print("Dimensions of test data X:", x_test_multilabel.shape, "Y:", y_test.shape) # This function is compute heavy and takes 6-7 hours to run. classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1', n_jobs=-1), n_jobs=-1) classifier.fit(x_train_multilabel, y_train) predictions = classifier.predict(x_test_multilabel) print("accuracy :", metrics.accuracy_score(y_test, predictions)) print("macro f1 score :", metrics.f1_score(y_test, predictions, average='macro')) print("micro f1 scoore :", metrics.f1_score(y_test, predictions, average='micro')) print("hamming loss :", metrics.hamming_loss(y_test, predictions)) print("Precision recall report :\n", metrics.classification_report(y_test, predictions))
# SVM
clf_svm = SVC(gamma='scale', random_state=random_state)

# GB
clf_gb = GradientBoostingClassifier(random_state=random_state, loss='deviance',
                                    learning_rate=0.025, n_estimators=200)

# KNN
clf_knn = KNeighborsClassifier(n_neighbors=number_of_neighbors)

# SGD
clf_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=1000,
                        random_state=random_state, tol=None)

def classify(X, y, clf):
    y = y.astype('int')
    model = ExtraTreesClassifier()
    model.fit(X, y)
    print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_, index=X.columns)
    feat_importances.nlargest(10).plot(kind='barh')
    plt.show()

    # get correlations of each feature in the dataset
    corrmat = X.corr()
def run_models(features_name, model_list, best_model,
               X_train, X_test, y_train, y_test, random_state=42):
    # Set random state
    random_state = random_state

    # Convenience translation dictionary for printing
    model_dict = {
        'lr': 'Logistic Regression',
        'sgd': 'Stochastic Gradient Descent',
        'rf': 'Random Forest',
        'dnn': 'Dense Neural Network'
    }

    # Dictionary of pre-determined hyperparameters for models
    hyperparams_dict = {
        'tfidf': {
            'lr': {'C': 30.0, 'class_weight': None, 'solver': 'newton-cg'},
            'sgd': {'tol': 1e-3, 'max_iter': 1000, 'penalty': 'l1'},
            'rf': {'bootstrap': False, 'n_estimators': 200, 'max_depth': 35,
                   'max_features': 'sqrt', 'min_samples_leaf': 1,
                   'min_samples_split': 10}
        },
        'doc2vec': {
            'lr': {'C': 0.01, 'class_weight': 'balanced', 'solver': 'sag'},
            'sgd': {'tol': 1e-3, 'max_iter': 1000, 'penalty': 'l1'},
            'rf': {'bootstrap': True, 'n_estimators': 230, 'max_depth': 35,
                   'max_features': 'auto', 'min_samples_leaf': 4,
                   'min_samples_split': 10}
        }
    }

    # Iterate over model_list
    for model_type in model_list:
        hyperparams = hyperparams_dict[features_name].get(model_type, {})
        # Logistic Regression fit model
        if model_type == 'lr':
            clf = LogisticRegression(C=hyperparams['C'],
                                     class_weight=hyperparams['class_weight'],
                                     solver=hyperparams['solver'],
                                     n_jobs=-1, random_state=random_state)
        elif model_type == 'sgd':
            clf = SGDClassifier(tol=hyperparams['tol'],
                                max_iter=hyperparams['max_iter'],
                                penalty=hyperparams['penalty'],
                                n_jobs=-1, random_state=random_state)
        elif model_type == 'rf':
            clf = RandomForestClassifier(
                max_features=hyperparams['max_features'],
                min_samples_leaf=hyperparams['min_samples_leaf'],
                n_estimators=hyperparams['n_estimators'],
                bootstrap=hyperparams['bootstrap'],
                min_samples_split=hyperparams['min_samples_split'],
                max_depth=hyperparams['max_depth'],
                n_jobs=-1, random_state=random_state)
        else:
            raise ValueError("No model type provided")

        # Fit classifier
        print('{} - {}'.format(features_name, model_dict[model_type]))
        clf.fit(X_train, y_train)

        # predictions and evaluations
        predicted = clf.predict(X_test)
        accuracy = evaluate_model(predicted, y_test)

        # Update best performing model if necessary
        if accuracy > best_model['accuracy']:
            best_model['accuracy'] = accuracy
            best_model['model'] = clf
            best_model['type'] = model_type
            best_model['predictions'] = predicted
            best_model['features'] = features_name

    # Return best model and type
    return best_model
def train_SGD(training_set):
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    return SGDClassifier_classifier.train(training_set)
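# Usage sketch for train_SGD, assuming nltk-style (feature-dict, label) pairs;
# the tiny training set here is a made-up illustration.
# SklearnClassifier.train returns the wrapped classifier itself.
training_set = [({'contains_free': True, 'length': 12}, 'spam'),
                ({'contains_free': False, 'length': 40}, 'ham'),
                ({'contains_free': True, 'length': 8}, 'spam'),
                ({'contains_free': False, 'length': 25}, 'ham')]
classifier = train_SGD(training_set)
print(classifier.classify({'contains_free': True, 'length': 10}))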
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline(steps=[('select', VarianceThreshold()),
                           ('standardize', StandardScaler()),
                           ('classify', SGDClassifier())])

param_grid = {
    'classify__random_state': [0],
    'classify__class_weight': ['balanced'],
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': 10.0**np.linspace(-3, 1, 10),
    'classify__l1_ratio': [0.15],
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                           n_jobs=-1, scoring='roc_auc')
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer_)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print("Accuracy: {:.3f}".format(clf.score(X_test, y_test)))
brand_labels_test = labels[1]
model_labels_train = labels[2]
model_labels_test = labels[3]
del labels

brand_labels_train = brands_onehot.transform(brand_labels_train).toarray()
brand_labels_test = brands_onehot.transform(brand_labels_test).toarray()
model_labels_train = models_onehot.transform(model_labels_train).toarray()
model_labels_test = models_onehot.transform(model_labels_test).toarray()

# train and tune 1st order model parameters via grid search
classifier_brands = SGDClassifier(loss='log', penalty='elasticnet',
                                  random_state=0, verbose=1, n_jobs=3)
classifier_models = SGDClassifier(loss='log', penalty='elasticnet',
                                  random_state=0, verbose=1, n_jobs=3)
regressor_brands = SGDRegressor(loss='squared_loss', penalty='elasticnet',
                                random_state=0, verbose=1)
regressor_models = SGDRegressor(loss='squared_loss',
            spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL, skipinitialspace=True)
            spamwriter.writerow([str(c), str3, '0'])
            file.close()
            c += 1
            break

train = pd.read_csv("imdb_tr.csv", header=0, delimiter=",", encoding='utf-8')

vectorizer1 = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                              stop_words=None, ngram_range=(1, 1))
vectorizer2 = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                              stop_words=None, ngram_range=(1, 2))
vectorizer3 = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                              stop_words=None, ngram_range=(1, 1))
vectorizer4 = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                              stop_words=None, ngram_range=(1, 2))

train_data_features1 = vectorizer1.fit_transform(train["text"])
train_data_features2 = vectorizer2.fit_transform(train["text"])
train_data_features3 = vectorizer3.fit_transform(train["text"])
train_data_features4 = vectorizer4.fit_transform(train["text"])

clf1 = SGDClassifier(loss="hinge", penalty="l1")
clf1.fit(train_data_features1, train["polarity"])
clf2 = SGDClassifier(loss="hinge", penalty="l1")
clf2.fit(train_data_features2, train["polarity"])
clf3 = SGDClassifier(loss="hinge", penalty="l1")
clf3.fit(train_data_features3, train["polarity"])
clf4 = SGDClassifier(loss="hinge", penalty="l1")
clf4.fit(train_data_features4, train["polarity"])

test = pd.read_csv("../resource/asnlib/public/imdb_te.csv", encoding='latin-1', header=0)
clean_test = []
for i in range(len(test['text'])):
    arr1 = [w for w in re.split('\W', test['text'][i]) if w]
    str2 = " ".join(str(x) for x in arr1)
    str2 = str2.lower()
    str3 = ' '.join([word for word in str2.split() if word not in stopArr])
# sklearn's Pipeline provides a mature workflow for quickly assembling machine learning models
bayes_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())])
bayes_clf.fit(x_train, y_train)

""" Predict the test dataset using Naive Bayes"""
predicted = bayes_clf.predict(x_test)
print('Naive Bayes correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
# Print F1 score, precision, recall, and other metrics
print(metrics.classification_report(y_test, predicted, target_names=categories))

""" Support Vector Machine (SVM) classifier"""
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                          alpha=1e-3, max_iter=5, random_state=42)),
                    ])
svm_clf.fit(x_train, y_train)
predicted = svm_clf.predict(x_test)
print('SVM correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
print(metrics.classification_report(y_test, predicted, target_names=categories))
# Print the confusion matrix
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, predicted))
print('\n')

""" 10-fold cross-validation """
clf_b = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
clf_s = make_pipeline(CountVectorizer(), TfidfTransformer(),
                      SGDClassifier(loss='hinge', penalty='l2',
                                    alpha=1e-3, n_iter=5, random_state=42))
for cuisine in train_labels.values():
    if cuisine not in label_map:
        label_map[cuisine] = label_int
        label_map[label_int] = cuisine
        label_int += 1
# pprint(label_map)

for k, v in train_labels.items():
    train_labels[k] = label_map[v]

# train and predict using Tfidf counts and svm
if vectorizer == 'count':
    pipe_svm = Pipeline([('vectorizer', CountVectorizer()),
                         ('classifier', SGDClassifier(loss='log', penalty='l2',
                                                      alpha=1e-3, n_iter=5,
                                                      random_state=42))])
elif vectorizer == 'tfidf':
    pipe_svm = Pipeline([('vectorizer', TfidfVectorizer()),
                         ('classifier', SGDClassifier(loss='log', penalty='l2',
                                                      alpha=1e-3, n_iter=5,
                                                      random_state=42))])

training_data = []
training_labels = []
for data, label in zip(train_data.values(), train_labels.values()):
    training_data.append(data)
    time_new_whole = time.time() - start
    time_new = times_sgd[i - 1] + time_new
    times_sgd.append(time_new)
    accuracy = accuracy_test(X_test, y_test, weights)
    accuracies.append(accuracy)
    # print(weights.shape)
    i += 1

# SGDClassifier
sgd_best = SGDClassifier(loss='log',
                         penalty='none',
                         tol=0.0,
                         fit_intercept=False,
                         eta0=0.01,
                         learning_rate='constant')

param_range = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# param_range = range(1, 10)
times = []
train_scores = []
test_scores = []
for iteration in param_range:
    sgd_temp = SGDClassifier(loss='log',
                             penalty='none',
                             tol=0.000001,
                             eta0=0.01,
                             learning_rate='constant',
# Version 5: trying multi-label after all, an hour before the deadline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

y_multilabel = MultiLabelBinarizer().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_multilabel,
                                                    test_size=0.4, random_state=0)
print("Train:", len(y_train))
print("Test:", len(y_test))
print("Overall:", len(y_multilabel))

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

sgd = SGDClassifier(random_state=42)
# loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None)
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
# clf = BinaryRelevance(classifier=SVC(kernel='linear', C=minC, random_state=241), require_dense=[False, True])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test[1])
print(y_pred[0])
print("Model training time: ", (time.time() - start_time))

with open(f"{pre_path}finalmodel{model_version}.pkl", 'wb') as f:
def test_multi_output_classification_partial_fit_no_first_classes_exception():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    assert_raises_regex(ValueError, "classes must be passed on the first call "
                                    "to partial_fit.",
                        multi_target_linear.partial_fit, X, y)
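# For contrast, a sketch of the calling convention the test above enforces:
# classes must be supplied, one array per output, on the first partial_fit
# call (the toy X/y here are illustrative assumptions).
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

X_demo = np.array([[0., 0.], [1., 1.], [2., 2.], [3., 3.]])
y_demo = np.array([[0, 1], [1, 0], [1, 1], [0, 0]])  # two output targets

clf = MultiOutputClassifier(SGDClassifier(loss='log', random_state=1, max_iter=5))
clf.partial_fit(X_demo, y_demo, classes=[np.array([0, 1]), np.array([0, 1])])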
from mylib.plotdregion import plot_decision_region

if __name__ == '__main__':
    iris = datasets.load_iris()
    X = iris.data[:, [2, 3]]
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # ml = Perceptron(eta0=0.01, max_iter=40, tol=0, random_state=0)
    # ml = LogisticRegression(C=1000.0, random_state=0)
    # ml = SVC(kernel='linear', C=1.0, random_state=0)
    # ml = SGDClassifier(loss='perceptron')
    # ml = SGDClassifier(loss='log')
    ml = SGDClassifier(loss='hinge')
    ml.fit(X_train_std, y_train)

    y_pred = ml.predict(X_test_std)
    print('Total test samples: %d, errors: %d' % (len(y_test), (y_test != y_pred).sum()))
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

    X_combined_std = np.vstack((X_train_std, X_test_std))
    y_combined = np.hstack((y_train, y_test))
    plot_decision_region(X=X_combined_std, y=y_combined, classifier=ml,
                         test_idx=range(105, 150), title='scikit-learn SVM')
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "RF")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                       dual=False, tol=1e-3)))
    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty=penalty)))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
warnings.filterwarnings("ignore")

TDATA = pd.read_csv('Data.csv')

X_train, x_test, y_train, y_test = train_test_split(TDATA.COMBINED, TDATA.FLAIR,
                                                    test_size=0.3, random_state=7)

LSVM = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
                                       random_state=42, max_iter=150, tol=None))])
LSVM.fit(X_train, y_train)

REDDIT = praw.Reddit(user_agent='redditflairdetector',
                     client_id='PY-6WwMrA9O48Q',
                     client_secret='rwwa13TTlmWYSeD8D9_kW13r6UE')

SUBREDDIT = REDDIT.subreddit('india')

SAVED_MODEL = pickle.dumps(LSVM)
LOADED_MODEL = pickle.loads(SAVED_MODEL)
print(
    parser.add_argument('test', type=load_npz, help='Test features (npz)')
    parser.add_argument('output', help='Output label predictions (npz)')
    return parser

if __name__ == "__main__":
    args = opts().parse_args()

    print "Loading and preparing data"
    X = prepare_features(args.train)
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    Y = args.labels['labels']

    print "Training classifier"
    clfs = [SGDClassifier(loss='log') for y in Y.T]
    for clf, y in zip(clfs, Y.T):
        try:
            clf.fit(X, y)
        except:
            pass
    del X, Y

    print "Predicting"
    X = scaler.transform(prepare_features(args.test))
    p = []
    for clf in clfs:
        try:
            p.append(clf.predict_proba(X)[:, 0])
        except:
            p.append(np.zeros(len(X)))
def define_clfs_params():
    '''
    Defines all relevant parameters and classes for classifier objects.
    '''
    clfs = {
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        # the base estimator takes a single max_depth, not a list of candidates
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                 algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                         max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss='log', penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=3)
    }

    params = {
        'RF': {'n_estimators': [1, 10, 100, 1000],
               'max_depth': [10, 15, 20, 30, 40, 50, 60, 70, 100],
               'max_features': ['sqrt', 'log2'],
               'min_samples_split': [2, 5, 10],
               'random_state': [1]},
        'LR': {'penalty': ['l1', 'l2'],
               'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
               'random_state': [1]},
        'SGD': {'loss': ['log', 'perceptron'],
                'penalty': ['l2', 'l1', 'elasticnet'],
                'random_state': [1]},
        'ET': {'n_estimators': [1, 10, 100, 1000],
               'criterion': ['gini', 'entropy'],
               'max_depth': [1, 3, 5, 10, 15],
               'max_features': ['sqrt', 'log2'],
               'min_samples_split': [2, 5, 10],
               'random_state': [1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'],
               'n_estimators': [1, 10, 100, 1000],
               'random_state': [1]},
        'GB': {'n_estimators': [1, 10, 100, 1000],
               'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
               'subsample': [0.1, 0.5, 1.0],
               'max_depth': [1, 3, 5, 10, 20, 50, 100],
               'random_state': [1]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'],
               'max_depth': [15, 20, 30, 40, 50],
               'max_features': ['sqrt', 'log2'],
               'min_samples_split': [2, 5, 10],
               'random_state': [1]},
        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
                'kernel': ['linear'],
                'random_state': [1]},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']}
    }
    return clfs, params