Code Example #1
def run_online_classifier():
    vect = HashingVectorizer(
        decode_error='ignore',
        n_features=2**21,
        preprocessor=None,
        tokenizer=tokenizer_streaming,
    )
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        else:
            X_train = vect.transform(X_train)
            clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)
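Example #1 relies on three helpers that are not shown: `tokenizer_streaming`, `stream_docs`, and `get_minibatch`. A minimal sketch of what they typically look like in this streaming-sentiment recipe, assuming a CSV whose lines end with a quoted review text followed by a 0/1 label:

import re

def tokenizer_streaming(text):
    # strip HTML, pull out emoticons, lowercase, split on non-word characters
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text.split()

def stream_docs(path):
    # yield one (text, label) pair at a time without loading the file into memory
    with open(path, 'r', encoding='utf-8') as csv_file:
        next(csv_file)  # skip the header line
        for line in csv_file:
            text, label = line[:-3], int(line[-2])
            yield text, label

def get_minibatch(doc_stream, size):
    # pull the next `size` documents off the stream; (None, None) once exhausted
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

The tail of this same `get_minibatch` helper appears verbatim at the start of Code Example #48 below.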
Code Example #2
File: utils.py  Project: seylom/kaggle-hashtags
def predict_sgd(X_train, y_train, X_test, sample_weight):
    clf = SGDClassifier(loss='log', alpha=0.01, l1_ratio=0, n_jobs=2,
                        n_iter=50)
    clf.fit(X_train, y_train, sample_weight=sample_weight)

    predictions = clf.predict_proba(X_test)
    return predictions
Code Example #3
File: jmFramework.py  Project: iral-lab/jmlearning
def add_sgd_class(self, word, example):
    self.clfColor = SGDClassifier(loss="log", penalty="l2")
    self.clfShape = SGDClassifier(loss="log", penalty="l2")
    X_Color = [example['Color']]
    y_Color = [word]
    X_Shape = [example['Shape']]
    y_Shape = [word]
    for word in self.knownWords.keys():
       for classifier in self.knownWords[word]:
          if("Synonym" not in str(type(classifier))):
             examples = classifier.positiveExamples
             for ex in examples : 
                if("Color" in classifier._type_):
                   X_Color.append(ex['Color'])
                   y_Color.append(word)
                if("Shape" in classifier._type_):
                   X_Shape.append(ex['Shape'])
                   y_Shape.append(word)
    
    classes = np.unique(y_Color)
    self.clfColor.partial_fit(X_Color, y_Color,classes=classes)
    self.classColors = classes
    classes = np.unique(y_Shape)
    self.clfShape.partial_fit(X_Shape, y_Shape,classes=classes)
    self.classShapes = classes
Code Example #4
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        n_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function(np.array([x1, x2]).reshape(1, -1))  # newer scikit-learn requires 2-D input
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight')
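One caveat about the loop above: recent scikit-learn releases insist on 2-D input to `decision_function`, and the whole grid can be scored in a single call rather than point by point. A vectorized sketch of the same computation:

# score every grid point at once; np.c_ stacks the two grids into (n_points, 2)
grid = np.c_[X1.ravel(), X2.ravel()]
Z = clf.decision_function(grid).reshape(X1.shape)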
Code Example #5
File: MLlib.py  Project: HACP/RHETORICS
def sgd_classifier(V_train, y_train, V_val, y_val, V_test, y_test):

    t0 = time.time()

    print 'Building SGD model'

    clf = SGDClassifier(n_iter = 50)

    #clf = grid_search.GridSearchCV(svm_clf, parameters)                                                                                                                            

    clf.fit(V_train, y_train)

    #print clf.best_params_                                                                                                                                                         

    t1 = time.time()
    print 'Building SGD model ... Done', str(int((t1 - t0)*100)/100.)
    print ''

    p_val = clf.predict(V_val)

    print 'Accuracy on validation set', accuracy_score(y_val, p_val)

    p_test = clf.predict(V_test)

    print 'Accuracy on testing set'

    print classification_report(y_test, p_test)
Code Example #6
File: test_sgd.py  Project: richlewis42/scikit-learn
def test_underflow_or_overlow():
    with np.errstate(all="raise"):
        # Generate some weird data with hugely unscaled features
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 10

        X = rng.normal(size=(n_samples, n_features))
        X[:, :2] *= 1e300
        assert_true(np.isfinite(X).all())

        # Use MinMaxScaler to scale the data without introducing a numerical
        # instability (computing the standard deviation naively is not possible
        # on this data)
        X_scaled = MinMaxScaler().fit_transform(X)
        assert_true(np.isfinite(X_scaled).all())

        # Define a ground truth on the scaled data
        ground_truth = rng.normal(size=n_features)
        y = (np.dot(X_scaled, ground_truth) > 0.0).astype(np.int32)
        assert_array_equal(np.unique(y), [0, 1])

        model = SGDClassifier(alpha=0.1, loss="squared_hinge", n_iter=500)

        # smoke test: model is stable on scaled data
        model.fit(X_scaled, y)
        assert_true(np.isfinite(model.coef_).all())

        # model is numerically unstable on unscaled data
        msg_regxp = (
            r"Floating-point under-/overflow occurred at epoch #.*"
            " Scaling input data with StandardScaler or MinMaxScaler"
            " might help."
        )
        assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y)
Code Example #7
def main():
    """ Generates features and fits classifier. """
    
    featureIndexes = processData(os.path.join(dataFolder, "avito_train.tsv"), itemsLimit=300000)
    trainFeatures,trainTargets, trainItemIds=processData(os.path.join(dataFolder,"avito_train.tsv"), featureIndexes, itemsLimit=300000)
    testFeatures, testItemIds=processData(os.path.join(dataFolder,"avito_test.tsv"), featureIndexes)
    joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds), os.path.join(dataFolder,"train_data.pkl"))
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(os.path.join(dataFolder,"train_data.pkl"))
    logging.info("Feature preparation done, fitting model...")
    clf = SGDClassifier(loss="log",
                        penalty="l2",
                        alpha=1e-4,
                        class_weight="auto")
    clf.fit(trainFeatures,trainTargets)

    logging.info("Predicting...")
    
    predicted_scores = clf.predict_proba(testFeatures).T[1]

    
    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    f = open(os.path.join(dataFolder,output_file), "w")
    f.write("id\n")
    
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
        f.write("%d\n" % (item_id))
    f.close()
    logging.info("Done.")
Code Example #8
    def test_create_model(self):
        print("labeled sentence worked?")
        x_train = labelizeReviews(self.xTrain, 'TRAIN')
        x_test = labelizeReviews(self.xTest, 'TEST')
        model_dm = gensim.models.Doc2Vec(min_count=1, window=5, size=self.size, sample=1e-3, negative=5, workers=3)
        model_dbow = gensim.models.Doc2Vec(min_count=1, window=6, size=self.size, sample=1e-3, negative=5, dm=0, workers=3)
        sentences = x_train
        model_dm.build_vocab(sentences)
        model_dbow.build_vocab(sentences)
        # npArray = np.array(x_train)
        for epoch in range(10):
            print("Starting epoch:", str(epoch))
            # perm = np.random.permutation(npArray.shape[0])
            model_dm.train(random.sample(sentences, len(sentences)))
            model_dbow.train(random.sample(sentences, len(sentences)))
        # model_dm.train(x_train)
        train_vecs = getVecs(model_dm, x_train, self.size)
        train_vecs_dbow = getVecs(model_dbow, x_train, self.size)
        train_vecs_total = np.hstack((train_vecs, train_vecs_dbow))

        sentences = x_test
        for epoch in range(10):
            print("Starting epoch:", str(epoch))
            # perm = np.random.permutation(npArray.shape[0])
            model_dm.train(random.sample(sentences, len(sentences)))
            model_dbow.train(random.sample(sentences, len(sentences)))
        test_vecs = getVecs(model_dm, x_test, self.size)
        test_vecs_dbow = getVecs(model_dbow, x_test, self.size)
        test_vecs_total = np.hstack((test_vecs, test_vecs_dbow))
        lr = SGDClassifier(loss='log', penalty='l1')
        lr.fit(train_vecs_total, self.labelsTrain[:self.samples])

        print('Test Accuracy: %.2f'%lr.score(test_vecs_total, self.labelsTest[:self.samples]))
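`labelizeReviews` and `getVecs` are project helpers that the snippet omits. Assuming each review is wrapped as a tagged document whose first tag is its lookup key (the usual Doc2Vec tutorial convention), `getVecs` is plausibly:

import numpy as np

def getVecs(model, corpus, size):
    # look up the learned paragraph vector for each tagged review
    vecs = [np.array(model.docvecs[doc.tags[0]]).reshape((1, size))
            for doc in corpus]
    return np.concatenate(vecs)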
Code Example #9
File: categories.py  Project: phecy/cdips-kaggle
def classify(dummy_train,dummy_test,feature_pkl,output_file):
    # Train classifier, iterating over subsets
    # Load Features
    print 'Loading features...'
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    trainTargets = np.array(trainTargets)
    testItemIds = np.array(testItemIds)
    predicted_ids = []
    predicted_scores = []
    # SGD Logistic Regression per sample 
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
          eta0=0.0, fit_intercept=True, l1_ratio=0.15,
          learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
          penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False,
          verbose=0, warm_start=False)
    for col in range(np.shape(dummy_train)[1]):
        # Get nonzero dummy indices as array
        idx_train = dummy_train[:,col].astype('bool').T.toarray()[0]
        print 'Training subset {} of {}...'.format(col,np.shape(dummy_train)[1])
        sub_train = normalize(trainFeatures.tocsr()[idx_train,:], norm='l2', axis=0)
        clf.fit(sub_train,trainTargets[idx_train])
       # Use probabilities instead of binary class prediction in order to generate a ranking    
        idx_test = dummy_test[:,col].astype('bool').T.toarray()[0]
        sub_test = normalize(testFeatures.tocsr()[idx_test,:], norm='l2', axis=0)
        predicted_scores += clf.predict_proba(sub_test).T[1].tolist()
        predicted_ids += testItemIds[idx_test].tolist()
    
    with open(os.path.splitext(feature_pkl)[0]+'_'+output_file,'w') as out_fid:
        out_fid.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, predicted_ids), reverse = True):
           # only writes item_id per output spec, but may want to look at predicted_scores
            out_fid.write("%d\n" % (item_id))
Code Example #10
class LightModel:
    def __init__(self,learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        #Init scikit models
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate, n_iter = numEpochs, shuffle=mustShuffle)
    def train(self, gen,  v=False):
        i = 0
        for x, y in gen: #For each batch
            self.Classifier.partial_fit(x, y, [0,1])
            i += len(x)
            if v : print(str(datetime.now())[:-7] , "example:", i)
            
    def test(self, gen,  v=False):

        #init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        #Get prediction for each batch
        i = 0
        for x,y in gen:
            p = self.Classifier.predict_proba(x)
            p = p.T[1].T #Keep column corresponding to probability of class 1
            #Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y)) 
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v : print(str(datetime.now())[:-7] , "example:", i)
        if v: print("Score:", self.score(ytot, ptot))
        
        return (ytot, ptot)
    def score(self, target, prediction):
        return llfun(target, prediction)
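`llfun` is not defined in this snippet. Since `score` receives class-1 probabilities, it is presumably a binary logarithmic-loss helper along these lines (an assumption, not the project's actual code):

import numpy as np

def llfun(act, pred, eps=1e-15):
    # binary log loss; clip predictions so log() stays finite
    pred = np.clip(pred, eps, 1 - eps)
    return -np.mean(act * np.log(pred) + (1 - act) * np.log(1 - pred))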
Code Example #11
File: quality.py  Project: rossmounce/cochrane-nlp
def predict_domains_for_documents(test_domain=CORE_DOMAINS[0], avg=True):
    X, y, vectorizer = _get_study_level_X_y(test_domain=test_domain)
    score_f = lambda y_true, y_pred: metrics.precision_recall_fscore_support(
        y_true, y_pred, average=None
    )  # , average="macro")
    # score_f = sklearn.metrics.f1_score

    # note that asarray call below, which seems necessary for
    # reasons that escape me (see here
    # https://github.com/scikit-learn/scikit-learn/issues/2508)

    clf = SGDClassifier(loss="hinge", penalty="l2", alpha=0.01)
    # pdb.set_trace()
    cv_res = cross_validation.cross_val_score(
        clf,
        X,
        np.asarray(y),
        score_func=score_f,
        # sklearn.metrics.precision_recall_fscore_support,
        cv=5,
    )
    # pdb.set_trace()
    if avg:
        cv_res = sum(cv_res) / float(cv_res.shape[0])
    # metrics.precision_recall_fscore_support

    # if dump_output:
    #    np.savetxt(test_domain.replace(" ", "_") + ".csv", cv_res, delimiter=',', fmt='%2.2f')

    print cv_res

    ### train on all
    model = clf.fit(X, y)
    informative_features = show_most_informative_features(vectorizer, model, n=50)
    return (cv_res, informative_features, y)
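`show_most_informative_features` is not included here; a plausible sketch that ranks vectorizer features by the magnitude of the fitted weights (on older scikit-learn, `get_feature_names_out` is `get_feature_names`):

import numpy as np

def show_most_informative_features(vectorizer, clf, n=50):
    # pair each feature name with its weight and keep the n largest by |weight|
    names = np.asarray(vectorizer.get_feature_names_out())
    coefs = clf.coef_.ravel()
    top = np.argsort(np.abs(coefs))[::-1][:n]
    return list(zip(names[top], coefs[top]))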
Code Example #12
File: classifier.py  Project: JT17/445Project
def classify_reviews():
	import featurizer
	import gen_training_data
	import numpy as np
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.linear_model import SGDClassifier

	data = gen_training_data.gen_data();
	stemmed_data = featurizer.stem(data);
	tfidf= featurizer.tfidf(data);
	clf = MultinomialNB().fit(tfidf['train_tfidf'], data['training_labels']);
	predicted = clf.predict(tfidf['test_tfidf']);
	num_wrong = 0;
	tot = 0;
	for expected, guessed in zip(data['testing_labels'], predicted):
		if(expected-guessed != 0):	
			num_wrong += 1;

	print("num_wrong: %d",num_wrong)

	sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42);
	_ = sgd_clf.fit(tfidf['train_tfidf'], data['training_labels']);
	sgd_pred = sgd_clf.predict(tfidf['test_tfidf']);
	print np.mean(sgd_pred == data['testing_labels']);

	stem_tfidf = featurizer.tfidf(stemmed_data);
	_ = sgd_clf.fit(stem_tfidf['train_tfidf'], data['training_labels']);
	sgd_stem_prd = sgd_clf.predict(stem_tfidf['test_tfidf']);
	print np.mean(sgd_stem_prd==data['testing_labels']);
Code Example #13
def buildModel(size):
	with open('Sentiment Analysis Dataset.csv', 'rb') as csvfile:
		pos_tweets =[]
		neg_tweets =[]
		spamreader = csv.reader(csvfile, delimiter=',')
		for row in spamreader:
			if row[1] == '1':
				if not (len(pos_tweets) > size):
					pos_tweets.append(_cleanTweet(row[3]))
			else:
				if not (len(neg_tweets) > size):
					neg_tweets.append(_cleanTweet(row[3]))
	y = np.concatenate((np.ones(len(pos_tweets[0:size])), np.zeros(len(neg_tweets[0:size]))))
	x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets[0:size], neg_tweets[0:size])), y, test_size=0.2)
	x_train = _cleanText(x_train)
	x_test = _cleanText(x_test)
	n_dim = 100
	#Initialize model and build vocab
	imdb_w2v = Word2Vec(size=n_dim, min_count=10)
	imdb_w2v.build_vocab(x_train)
	imdb_w2v.train(x_train)
	train_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_train])
	train_vecs = scale(train_vecs)
	#Train word2vec on test tweets
	imdb_w2v.train(x_test)
	#Build test tweet vectors then scale
	test_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_test])
	test_vecs = scale(test_vecs)
	lr = SGDClassifier(loss='log', penalty='l1')
	lr.fit(train_vecs, y_train)
	imdb_w2v.save("imdb_w2v")
	f = open("Accuracy.txt","w")
	f.write(str(lr.score(test_vecs, y_test))+" "+str(size*2))
	f.close()
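`buildWordVector` is the usual averaging helper in this word2vec tutorial lineage; a sketch, assuming a tokenized tweet and a trained gensim model:

import numpy as np

def buildWordVector(tokens, size, w2v_model):
    # average the vectors of all in-vocabulary words in one tweet
    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            vec += w2v_model[word].reshape((1, size))  # w2v_model.wv[word] on newer gensim
            count += 1
        except KeyError:
            continue
    return vec / count if count else vec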
Code Example #14
File: summ.py  Project: jannson/Similar
def do_classify():
    corpus = MyCorpus()
    # tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    # corpus_lsi = lsi_model[corpus_idf]
    num_terms = len(corpus.dictionary)
    # num_terms = 400
    corpus_sparse = matutils.corpus2csc(corpus_idf, num_terms).transpose(copy=False)
    # print corpus_sparse.shape
    # corpus_dense = matutils.corpus2dense(corpus_idf, len(corpus.dictionary))
    # print corpus_dense.shape
    penalty = "l2"
    clf = SGDClassifier(loss="hinge", penalty=penalty, alpha=0.0001, n_iter=50, fit_intercept=True)
    # clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    y = np.array(corpus.cls_y)
    # print y.shape
    clf.fit(corpus_sparse, y)
    filename = os.path.join(HERE, "sgdc_clf.pkl")
    _ = joblib.dump(clf, filename, compress=9)
    print "train completely"

    X_test = []
    X_label = []
    for obj in SogouCorpus.objects.filter(id__in=corpus.test_y):
        X_test.append(obj.tokens)
        X_label.append(cls_ids[obj.classify])
        # result = classifier.predict(obj.tokens)
    test_corpus = [dictionary.doc2bow(s.split(",")) for s in X_test]
    test_corpus = tfidf_model[test_corpus]
    test_corpus = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    pred = clf.predict(test_corpus)
    score = metrics.f1_score(X_label, pred)
    print ("f1-score:   %0.3f" % score)
Code Example #15
File: classifier.py  Project: aviveise/double_encoder
def test_transformer(transformer, data_set, configuration):

    clf = SGDClassifier(alpha=0.005)
    samples = []
    labels = range(10)
    for epoch in range(configuration.hyper_parameters.epochs):
        for index, sample in enumerate(transformer.compute_outputs(data_set.trainset[0], data_set.trainset[1], 1)):

            samples.append(sample.reshape((1, sample.shape[0])))
            if index % 10 == 9:
                # stack the accumulated 1xd rows into a 2-D batch
                clf.partial_fit(np.vstack(samples), labels, classes=labels)
                samples = []
                gc.collect()

    error = 0
    count = 0
    test_predictions = []
    for index, sample in enumerate(transformer.compute_outputs(data_set.testset[0], data_set.testset[1], 1)):
        prediction = clf.predict(sample.reshape(1, -1))
        if not prediction == index % 10:
            error += 1

        count += 1
        test_predictions.append(prediction)

    OutputLog().write('test predictions weight: {0}'.format(test_predictions))

    OutputLog().write('\nerror: %f%%\n' % error)
Code Example #16
def SGD(x, y):
	# Stochastic Gradient Descent using scikit-learn
	from sklearn.linear_model import SGDClassifier
	clf = SGDClassifier()
	clf.fit(x, y)

	return clf.predict(x)
Code Example #17
File: annotate.py  Project: fabriziocosta/GraphLearn
class twoclass(SGDClassifier):
    # THE HACK IS NOW GETTING EVEN MORE EVIL
    def __init__(self):
        self.clazz= SGDClassifier(loss='log')

    def fit(self,X,y, crossval=False):

        if crossval:
            print "layers crossvalscore:",sklearn.model_selection.cross_val_score(SGDClassifier(loss='log'),X, y).mean()

        self.clazz.fit(X,y)
        self.intercept_= self.clazz.intercept_
        self.classes_= self.clazz.classes_
        return self

    # eden cant annotate two classes if the esti is not a sgdregressor
    #  -> this hack is made!
    '''
    details: decision_function returns a 1-d array.
    eden only accepts these if the estimator is an instance of SGDRegressor,
    so i make a 2-d array from my 1-d array.
    if i hack something like this in the future maybe the intercept array needs to be provided..
    (see the annotator code)
    '''

    # default guy:
    #def decision_function(self, vector):
    #    answer =  super(self.__class__,self).decision_function(vector)
    #    return np.vstack((answer, (answer-1))).T

    def decision_function(self,vector):
        return self.clazz.predict_proba(vector)

    '''
Code Example #18
File: ranking.py  Project: diogo149/BooMLet
class SGDRanker(BaseEstimator):

    """ Ranking predictor using stochastic gradient descent

    TODO:
    -allow configurable parameters for classifier
    -seed random state
    """

    def __init__(self, seconds=10):
        self.clf = SGDClassifier(loss='hinge')
        self.clf.fit_intercept = False
        self.clf.classes_ = np.array([-1, 1])
        self.seconds = seconds

    def fit(self, X, y):
        rows = X.shape[0]
        start_time = time.time()
        for i in itertools.count():
            if time.time() - start_time > self.seconds:
                return self
            idx1 = random.randint(0, rows - 1)
            idx2 = random.randint(0, rows - 1)
            y1, y2 = y[idx1], y[idx2]
            if y1 == y2:
                continue
            # partial_fit expects a 2-D X and an array-like y
            self.clf.partial_fit((X[idx1] - X[idx2]).reshape(1, -1),
                                 [np.sign(y1 - y2)])

    def predict(self, X):
        return np.dot(X, self.clf.coef_.T)
Code Example #19
def main(date):
    """
    Runs linear regression (classification) between the herbicide 
    resistance classes based on all wavelengths. The weights
    associated with each wavelength are then plotted, allowing
    the user to see the contribution to classification by each
    wavelength.

    :param date: (string) Data collection date YYYY_MMDD

    :return: (None)
    """
    
    # Load the training data from disk   
    X, y = FileIO.loadTrainingData(date)
    X = np.nan_to_num(X)

    # Train the classifier on the loaded data
    clf = SGDClassifier()
    clf.fit(X, y)

    # Plot the feature weights to visualize feature contributions
    featureWeights = np.fabs(clf.coef_)

    for i in xrange(3):
        plt.plot(WAVELENGTHS, featureWeights[i])
        plt.title("Linear Classifier Weights for " + RESISTANCE_STRINGS[INDEX_TO_LABEL[i]] + " vs Others")
        plt.xlabel("Wavelength (nm)")
        plt.ylabel("Absolute Weight")
        plt.show()
Code Example #20
def train_model(t):
    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)

    x_train, y_train = split_x_y(t)

    return clf.fit(x_train, y_train)
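`split_x_y` is not shown; a plausible reading, assuming `t` is a 2-D array whose last column holds the label:

def split_x_y(t):
    # features are every column but the last; the last column is the target
    return t[:, :-1], t[:, -1]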
Code Example #21
def train_vectorized(feats, Y, model_path=None, grid=False):

    # Vectorize labels
    labels = [ labels_map[y] for y in Y ]
    Y = np.array( labels )

    # Vectorize feature dictionary
    vec = DictVectorizer()
    X = vec.fit_transform(feats)
    norm_mat( X , axis=0 , copy=False)

    # Grid Search
    if grid:
        print 'Performing Grid Search'
        clf = do_grid_search(X, Y)
    else:
        #clf = LinearSVC(C=0.1, class_weight='auto')
        #clf = LogisticRegression(C=0.1, class_weight='auto')
        clf = SGDClassifier(penalty='elasticnet',alpha=0.001, l1_ratio=0.85, n_iter=1000,class_weight='auto')
        clf.fit(X, Y)


    # Save model
    if model_path:
        with open(model_path+'.dict' , 'wb') as f:
            pickle.dump(vec, f)

        with open(model_path+'.model', 'wb') as f:
            pickle.dump(clf, f)


    # return model
    return vec, clf
Code Example #22
def runSGDPipeline(entries, langs):
	t0 = time()
	sgd_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=n_features)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                                            alpha=0.001, n_iter=5, random_state=42))])

	vect = CountVectorizer(ngram_range=(1,1), max_features=n_features)
	X_train_counts = vect.fit_transform(entries)
	tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tfidf = tfidf.fit_transform(X_train_counts)

	clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001, n_iter=5, random_state=42)
	clf.fit(X_train_tfidf, langs)

	X_new_counts = vect.transform(entries)
	X_new_tfidf = tfidf.transform(X_new_counts)
	predicted = clf.predict(X_new_tfidf.toarray())

	print(np.mean(predicted == langs))
	print(metrics.classification_report(langs, predicted, target_names=langs))
	print(metrics.confusion_matrix(langs, predicted))
	print("Took %s seconds." % (time()-t0))
	print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
	return sgd_pipeline
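Worth flagging: the function above evaluates `clf` on the very documents it was trained on, and the `sgd_pipeline` it returns is never fitted. A usage sketch (`new_entries` is a hypothetical held-out set):

pipeline = runSGDPipeline(entries, langs)
pipeline.fit(entries, langs)               # the returned Pipeline must be fitted before use
predicted = pipeline.predict(new_entries)  # new_entries: unseen documents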
Code Example #23
File: model.py  Project: jseppanen/textpile
def train(docs, labels, regu=1, bg_weight=.1):
    '''
    :param docs: iterator of (title, body) pairs
    :param labels: integer labels for docs (0 is weakly-negative)
    :return: model
    '''
    num_topics=50
    feas = map(extract_words,  docs)
    labels = np.array(list(labels), dtype=int)
    idf=train_idf(feas)
    X,vocab=extract_feas(feas, idf)
    #lda=train_lda(X, vocab, num_topics)
    #X=transform_lda(X, lda)
    # set up sample weights
    weights = balance_weights(labels, bg_weight)
    labels=labels.copy()
    labels[labels == 0] = 1
    model=SGDClassifier(loss='log',
                        alpha=regu/len(labels),
                        fit_intercept=True,
                        n_iter=100,
                        shuffle=True)
    model.fit(X, labels, sample_weight=weights)
    #print accuracy(labels, model.predict(X))
    return dict(idf=idf, logreg=model, lda=None)
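`balance_weights` is not defined in the snippet. Given the docstring ("0 is weakly-negative") and the `bg_weight=.1` default, it presumably down-weights the background class, roughly:

import numpy as np

def balance_weights(labels, bg_weight):
    # give the weakly-negative background docs (label 0) a reduced sample weight
    weights = np.ones(len(labels), dtype=float)
    weights[labels == 0] = bg_weight
    return weights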
Code Example #24
File: svm.py  Project: Zheng-JIA/kernelsubsampling
class kernelsvm():
    def __init__(self, theta0, alpha, loss_metric):
        self.theta0 = theta0
        self.alpha = alpha
        self.loss_metric = loss_metric
    def fit(self, X, y, idx_SR):
        n_SR = len(idx_SR)
        self.feature_map_nystroem = General_Nystroem(kernel='rbf', gamma=self.theta0, n_components=n_SR)
        X_features = self.feature_map_nystroem.fit_transform(X,idx_SR)
        print("fitting SGD")
        self.clf = SGDClassifier(loss=self.loss_metric,alpha=self.alpha)
        self.clf.fit(X_features, y)
        print("fitting SGD finished")
    def predict(self, X):
        print("Predicting")
        X_transform = self.feature_map_nystroem.transform(X)
        return self.clf.predict(X_transform), X_transform
    def decision_function(self, X):
        # X should be the transformed input!
        return self.clf.decision_function(X)
    def err_rate(self, y_true, y_pred):
        acc = accuracy_score(y_true, y_pred)
        err_rate = 1.0-acc
        return err_rate
    def get_params(self):
        return self.clf.get_params()
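`General_Nystroem` looks like a project-local kernel-approximation transformer. The stock `sklearn.kernel_approximation.Nystroem` supports the same approximate-kernel-then-linear-SGD pattern, minus the explicit `idx_SR` subsampling; a sketch with placeholder data and hyperparameters:

from sklearn.datasets import make_classification
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=500, random_state=0)
# gamma and n_components are placeholder hyperparameters
feature_map = Nystroem(kernel='rbf', gamma=0.1, n_components=300, random_state=0)
X_features = feature_map.fit_transform(X)
clf = SGDClassifier(loss='hinge', alpha=1e-4).fit(X_features, y)
preds = clf.predict(feature_map.transform(X))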
Code Example #25
File: RecoWeights.py  Project: ChrisBg/mlia-examples
def validate():
  """
  Runs a 10-fold cross validation on the classifier, reporting
  accuracy.
  """
  trainDf = pd.read_csv("../NewData/train.csv")
  X = np.matrix(pd.DataFrame(trainDf, index=None,
    columns=["invited", "user_reco", "evt_p_reco", "evt_c_reco",
    "user_pop", "frnd_infl", "evt_pop"]))
  y = np.array(trainDf.interested)
  nrows = len(trainDf)
  kfold = KFold(nrows, 10)
  avgAccuracy = 0
  run = 0
  for train, test in kfold:
    Xtrain, Xtest, ytrain, ytest = X[train], X[test], y[train], y[test]
    clf = SGDClassifier(loss="log", penalty="l2")
    clf.fit(Xtrain, ytrain)
    accuracy = 0
    ntest = len(ytest)
    for i in range(0, ntest):
      yt = clf.predict(Xtest[i, :])
      if yt == ytest[i]:
        accuracy += 1
    accuracy = float(accuracy) / ntest  # float() guards against Python 2 integer division
    print "accuracy (run %d): %f" % (run, accuracy)
    avgAccuracy += accuracy
    run += 1
  print "Average accuracy", (avgAccuracy / run)
Code Example #26
def run_SGD(X, y, n_tr, n_te):
  X_tr, y_tr, X_te, y_te = X[:n_tr], y[:n_tr], X[-n_te:], y[-n_te:]
  losses = ['hinge', 'log']  # these are loss functions, not penalties
  for p in losses:
    model = SGDClassifier(loss=p, penalty=None, n_iter=100).fit(X_tr, y_tr)
    print 'Training, validation accuracy is %6.4f and %6.4f for %s loss' % \
        (model.score(X_tr, y_tr), model.score(X_te, y_te), p)
Code Example #27
def plot_sgd_classifier(num_samples, clt_std):
    #generation of data
    X, y = make_blobs(n_samples=num_samples, centers=2, cluster_std=clt_std)

    #fitting of data using logistic regression
    clf = SGDClassifier(loss='log', alpha=0.01)
    clf.fit(X, y)

    #plotting of data
    x_ = np.linspace(min(X[:, 0]), max(X[:, 0]), 10)
    y_ = np.linspace(min(X[:, 1]), max(X[:, 1]), 10)

    X_, Y_ = np.meshgrid(x_, y_)
    Z = np.empty(X_.shape)

    for (i, j), val in np.ndenumerate(X_):
        x1 = val
        x2 = Y_[i, j]
        conf_score = clf.decision_function(np.array([x1, x2]).reshape(1, -1))  # 2-D input for newer scikit-learn
        Z[i, j] = conf_score[0]

    levels = [-1.0, 0, 1.0]
    colors = 'k'
    linestyles = ['dashed', 'solid', 'dashed']

    ax = plt.axes()
    plt.xlabel('X1')
    plt.ylabel('X2')
    ax.contour(X_, Y_, Z, colors=colors,
               levels=levels, linestyles=linestyles, labels='Boundary')
    ax.scatter(X[:, 0], X[:, 1], c=y)
Code Example #28
def train_stochaticGradientDescent(X, y, loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                                   fit_intercept=True, n_iter=5, shuffle=True, verbose=0,
                                   epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal',
                                   eta0=0.0, power_t=0.5, class_weight=None, warm_start=False,
                                   average=False):
    clf = SGDClassifier(loss=loss,
                        penalty=penalty,
                        alpha=alpha,
                        l1_ratio=l1_ratio,
                        fit_intercept=fit_intercept,
                        n_iter=n_iter,
                        shuffle=shuffle,
                        verbose=verbose,
                        epsilon=epsilon,
                        n_jobs=n_jobs,
                        random_state=random_state,
                        learning_rate=learning_rate,
                        eta0=eta0,
                        power_t=power_t,
                        class_weight=class_weight,
                        warm_start=warm_start,
                        average=average
                        )
    clf = clf.fit(X,y)
    return clf
Code Example #29
def stochasticGD(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans stochasticGD split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Stochastic Gradient Descent "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"Stochastic_GD_metrics_test.txt"
    file = open(results, "w")
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Stochastic Gradient Descent %f"%test_size
    save = Output + "Stochastic_GD_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
Code Example #30
File: model.py  Project: jseppanen/textpile
def crossvalidate(feas, labels, param):
    labels = np.array(list(labels), dtype=int)
    accs = []
    for train_ids, valid_ids in StratifiedKFold(labels, 10):
        idf=train_idf([feas[i] for i in train_ids])
        X,vocab=extract_feas(feas, idf)
        #lda=train_lda(X, vocab, num_topics)
        #X=transform_lda(X, lda)
        labels_train = labels[train_ids].copy()
        weights = balance_weights(labels_train, param['bg_weight'])
        labels_train[labels_train == 0] = 1
        model=SGDClassifier(loss='log',
                            alpha=param['regu']/len(labels_train),
                            fit_intercept=True,
                            shuffle=True, n_iter=50)
        model.fit(X[train_ids], labels_train, sample_weight=weights)
        pp = model.predict_proba(X[valid_ids])
        pred_labels = np.argmax(pp, 1)
        pred_labels = model.classes_[pred_labels]
        #a=accuracy(labels[valid_ids], pred_labels, 1)
        # return all scores for "good" class
        assert model.classes_[1] == 2
        pred_scores = pp[:,1]
        a=avg_precision(labels[valid_ids], pred_scores)
        print '%.2f' % a,
        accs.append(a)
    return np.mean(accs)
Code Example #31
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True,
                help="path to input dataset")
args = vars(ap.parse_args())

# Get list of image paths
image_paths = list(paths.list_images(args['dataset']))

# initialize the image preprocessor, load the dataset from disk,
# and reshape the data matrix
sp = SimplePreprocessor(32, 32)
sdl = SimpleDatasetLoader(preprocessors=[sp])
(data, labels) = sdl.load(image_paths, verbose=500)
data = data.reshape((data.shape[0], 3072))

le = LabelEncoder()
labels = le.fit_transform(labels)

(trainX, testX, trainY, testY) = train_test_split(data, labels, test_size=0.25, random_state=5)

# loop over our set of regularizers
for r in (None, "l1", "l2"):
    print("[INFO] training model with '{}' penalty".format(r))
    model = SGDClassifier(loss="log", penalty=r, max_iter=10, learning_rate="constant", eta0=0.01, random_state=42)
    model.fit(trainX, trainY)

    # evaluate the classifier
    acc = model.score(testX, testY)
    print("[INFO] '{}' penalty accuracy: {:.2f}%".format(r, acc * 100))

Code Example #32
def main(input_file):
    batch_size = 128
    nb_classes = 62  # A-Z, a-z and 0-9
    nb_epoch = 2

    # Input image dimensions
    img_rows, img_cols = 32, 32

    # Path of data files
    path = input_file

    def convert_(Y):

        alpha = string.letters
        dig = string.digits
        alphaList = []
        for elem in (alpha + dig):
            alphaList.append(elem)

        list_ = []
        for elem in Y:
            for i in range(0, elem.shape[0]):
                if elem[i] == 1:
                    list_.append(i)
        list_ = np.asarray(list_)
        return list_

    # Load the preprocessed data and labels
    X_train_all = np.load(path + "/trainPreproc_" + str(img_rows) + "_" +
                          str(img_cols) + ".npy")
    Y_train_all = np.load(path + "/labelsPreproc.npy")

    X_train, X_val, Y_train, Y_val = \
        train_test_split(X_train_all, Y_train_all, test_size=0.25, stratify=np.argmax(Y_train_all, axis=1))

    print X_train.shape

    labels = convert_(Y_train)
    validation = convert_(Y_val)

    X_train = X_train.reshape(
        (X_train.shape[0], X_train.shape[2] * X_train.shape[3]))
    X_val = X_val.reshape((X_val.shape[0], X_val.shape[2] * X_val.shape[3]))

    print 'Training and Testing...'
    clf_rf = RandomForestClassifier()
    clf_rf.fit(X_train, labels)
    y_pred_rf = clf_rf.predict(X_val)
    acD_rf = accuracy_score(validation, y_pred_rf)
    print "random forest accuracy: ", acD_rf

    clf_sgd = SGDClassifier()
    clf_sgd.fit(X_train, labels)
    y_pred_sgd = clf_sgd.predict(X_val)
    acD_sgd = accuracy_score(validation, y_pred_sgd)
    print "stochastic gradient descent accuracy: ", acD_sgd

    clf_svm = LinearSVC()
    clf_svm.fit(X_train, labels)
    y_pred_svm = clf_svm.predict(X_val)
    acD_svm = accuracy_score(validation, y_pred_svm)
    print "Linear SVM accuracy: ", acD_svm

    clf_knn = KNeighborsClassifier()
    clf_knn.fit(X_train, labels)
    y_pred_knn = clf_knn.predict(X_val)
    acD_knn = accuracy_score(validation, y_pred_knn)
    print "nearest neighbors accuracy: ", acD_knn

    clf_nn = DBN([X_train.shape[1], 300, 62],
                 learn_rates=0.0240,
                 learn_rate_decays=0.9,
                 epochs=130)
    clf_nn.fit(X_train, labels)
    acD_nn = clf_nn.score(X_val, validation)
    print "neural network accuracy: ", acD_nn

    clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
    clf.fit(X_train, labels)
    acD_nn = clf.score(X_val, validation)
    print "naive bayes: ", acD_nn

    clf = BernoulliNB(alpha=1.0,
                      binarize=0.0,
                      class_prior=None,
                      fit_prior=True)
    clf.fit(X_train, labels)
    acD_nn = clf.score(X_val, validation)
    print "bernulli naive bayes: ", acD_nn
Code Example #33
File: stackingbysgd.py  Project: zzha293/dataMining
else:
    print 'No training data, please re-enter\n'
    sys.exit('Program exit')

if test is not None:
    ingredientMatrix, testIDs = createMatrix(test, allIngredients)
    sgdtest, sgdtestIDs = sgdreadTest(test)
else:
    print 'No test data, please re-enter\n'
    sys.exit('Program exit')

secondLayerInput = firstLayerReader(m1, m2, m3, m4)
labels = labelReader(tlabel)

#Train the grid search classifier
clf = SGDClassifier()
parameters = {
    # 'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    # 'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'alpha': (0.00001, 0.000001),
    'penalty': ('l2', 'elasticnet', 'l1'),
    'n_iter': (10, 50),
}
grid_search = GridSearchCV(clf, parameters, n_jobs=-1, verbose=1)
grid_search.fit(secondLayerInput, labels)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
Code Example #34
File: evaluate_100M.py  Project: hoangdzung/baseline
def eval(final_emb,
         labels,
         splits,
         random_state=42,
         clf=['mlp', 'sgd', 'lr', 'svm']):
    scaler = StandardScaler()
    X_train = []
    y_train = []
    X_val = []
    y_val = []
    X_test = []
    y_test = []
    for node, emb in final_emb.items():
        if splits[node] == 1:
            X_train.append(emb)
            y_train.append(labels[node])
        elif splits[node] == 2:
            X_val.append(emb)
            y_val.append(labels[node])
        elif splits[node] == 3:
            X_test.append(emb)
            y_test.append(labels[node])

    X_train = np.stack(X_train)
    y_train = np.array(y_train)
    X_val = np.stack(X_val)
    y_val = np.array(y_val)
    X_test = np.stack(X_test)
    y_test = np.array(y_test)

    scaler.fit(np.vstack([X_train, X_val, X_test]))
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    if 'mlp' in clf:
        print("MLPClassifier")
        lr = MLPClassifier(alpha=1e-5,
                           hidden_layer_sizes=(64, ),
                           max_iter=5000)
        lr.fit(X_train, y_train)
        print(lr.score(X_train, y_train))
        print(lr.score(X_val, y_val))
        print(lr.score(X_test, y_test))
    if 'lr' in clf:
        print("LogisticRegression")
        lr = LogisticRegression(multi_class='multinomial', max_iter=5000)
        lr.fit(X_train, y_train)
        print(lr.score(X_train, y_train))
        print(lr.score(X_val, y_val))
        print(lr.score(X_test, y_test))
    if 'sgd' in clf:
        print("SGDClassifier")
        lr = SGDClassifier(max_iter=5000, tol=1e-3)
        lr.fit(X_train, y_train)
        print(lr.score(X_train, y_train))
        print(lr.score(X_val, y_val))
        print(lr.score(X_test, y_test))
    if 'svm' in clf:
        print("SVC")
        lr = SVC(gamma='auto', max_iter=5000)
        lr.fit(X_train, y_train)
        print(lr.score(X_train, y_train))
        print(lr.score(X_val, y_val))
        print(lr.score(X_test, y_test))
    if 'kmean' in clf:
        X = np.vstack([X_train, X_val, X_test])
        y = np.concatenate([y_train, y_val, y_test])
        kmean_eval(X, y)
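`kmean_eval` is not shown; a plausible sketch that clusters the pooled embeddings and scores the clustering against the true labels with normalized mutual information:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

def kmean_eval(X, y):
    # one cluster per distinct label, then measure cluster/label agreement
    pred = KMeans(n_clusters=len(np.unique(y)), n_init=10, random_state=42).fit_predict(X)
    print("NMI:", normalized_mutual_info_score(y, pred))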
Code Example #35
      100)

save_classifier = open(
    "PICKLE FILES/pickled_algos_LogisticRegression_classifier5k.pickle", "wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:",
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

save_classifier = open(
    "PICKLE FILES/pickled_algos_LinearSVC_classifier5k.pickle", "wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

##NuSVC_classifier = SklearnClassifier(NuSVC())
##NuSVC_classifier.train(training_set)
##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:",
      nltk.classify.accuracy(SGDC_classifier, testing_set) * 100)

save_classifier = open("PICKLE FILES/pickled_algos_SGDC_classifier5k.pickle",
                       "wb")
pickle.dump(SGDC_classifier, save_classifier)
save_classifier.close()
Code Example #36
# G. Richards 2016, based on sgd_separator.py by Jake Vanderplas

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.datasets.samples_generator import make_blobs

# we create 50 separable points
X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)

# fit the model
clf = SGDClassifier(loss="hinge", alpha=0.01, fit_intercept=True)
clf.fit(X, Y)

# plot the line, the points, and the nearest vectors to the plane
xx = np.linspace(-1, 5, 10)
yy = np.linspace(-1, 5, 10)

X1, X2 = np.meshgrid(xx, yy)
Z = np.empty(X1.shape)
for (i, j), val in np.ndenumerate(X1):
    x1 = val
    x2 = X2[i, j]
    #p = clf.decision_function([x1, x2])
    p = clf.decision_function(np.array([x1, x2]).reshape(1, -1))
    Z[i, j] = p[0]
levels = [-1.0, 0.0, 1.0]
linestyles = ['dashed', 'solid', 'dashed']
colors = 'k'

#ax = plt.axes()
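The snippet stops at the commented-out axes call; a plausible completion, mirroring the plotting tail of Code Example #4:

ax = plt.axes()
ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
plt.show()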
Code Example #37
File: new.py  Project: JMMyles/NWHacks2016
                float(matchObj.group(4))
            ])
            info_list = np.concatenate((info_list, new_value), axis=0)
            new_price = np.matrix([float(matchObj.group(5))])
            price_matrix = np.concatenate((price_matrix, new_price), axis=0)
            house_list.append(house)

X = info_list
y = price_matrix.transpose()
list_y = np.array(y)[0].tolist()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)  # Don't cheat - fit only on training dataxs

clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X, list_y)
SGDClassifier(alpha=0.01,
              average=True,
              class_weight=None,
              epsilon=0.1,
              eta0=0.0,
              fit_intercept=True,
              l1_ratio=0.15,
              learning_rate='optimal',
              loss='hinge',
              n_iter=1000,
              n_jobs=1,
              penalty='l2',
              power_t=0.5,
              random_state=None,
Code Example #38
File: 8.py  Project: Dis-count/Python_practice
pipe = Pipeline(steps = [('scaler', MinMaxScaler()), ('clf', LogisticRegression(random_state = 42))])

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(MinMaxScaler(), LogisticRegression(random_state = 42, max_iter=1000))

pipe.fit(X_train, y_train)

accuracy = pipe.score(X_test, y_test)
accuracy

pipe.get_params()

# Exercise
from sklearn.linear_model import SGDClassifier
pipe = make_pipeline(StandardScaler(), SGDClassifier(max_iter = 1000))
pipe.fit(X_train_b, y_train_b)
y_pred = pipe.predict(X_test_b)
accuracy = balanced_accuracy_score(y_test_b, y_pred)
accuracy

# interactions and polynomials
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures

boston = load_boston()
X_train2, X_test2, y_train2, y_test2 = train_test_split(boston.data, boston.target, random_state =0)  # 75/25%

scaler = MinMaxScaler()
X_train2_scaled = scaler.fit_transform(X_train2)
X_test2_scaled = scaler.transform(X_test2)
Code Example #39
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

from fit_tune_function import fit_tune_store_sgdcv

clf_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    'vect__ngram_range': [
        (1, 1),
        (1, 2),
        (1, 3),
        (1, 4),
    ],
    'tfidf__use_idf': (True, False),
    'clf__random_state': (0, ),
    'clf__alpha': (
        1e-2,
        1e-3,
        1e-4,
        0.1,
        1e-6,
    ),
    'clf__max_iter': (2, 5, 10, 20, 100, 200),
Code Example #40
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression

heldout = [0.95, 0.90, 0.75, 0.50, 0.01]
rounds = 20
digits = datasets.load_digits()
X, y = digits.data, digits.target

classifiers = [("SGD", SGDClassifier(max_iter=100, tol=1e-3)),
               ("ASGD", SGDClassifier(average=True, max_iter=100, tol=1e-3)),
               ("Perceptron", Perceptron(tol=1e-3)),
               ("Passive-Aggressive I",
                PassiveAggressiveClassifier(loss='hinge', C=1.0, tol=1e-4)),
               ("Passive-Aggressive II",
                PassiveAggressiveClassifier(loss='squared_hinge',
                                            C=1.0,
                                            tol=1e-4)),
               ("SAG",
                LogisticRegression(solver='sag', tol=1e-1,
                                   C=1.e4 / X.shape[0]))]

xx = 1. - np.array(heldout)

for name, clf in classifiers:
Code Example #41
File: test.py  Project: edysuardiyana/olaf
from sklearn.linear_model import SGDClassifier
X = [[0., 0.], [1., 1.]]
y = [0, 1]
clf = SGDClassifier(loss="log", penalty="l2")

classifier = clf.fit(X, y)

X1 = [[0.01, 0.02], [1.5, 1.5]]
res = classifier.predict(X1)

print res

X_p1 = [[0.01, 0.02], [1.5, 1.5]]
y_p1 = [1, 1]

classifier.partial_fit(X_p1, y_p1)

res2 = classifier.predict(X1)
print res2
Code Example #42
import numpy as np

from time import time
import scipy.stats as stats
from sklearn.utils.fixes import loguniform

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier

# get some data
X, y = load_digits(return_X_y=True)

# build a classifier
clf = SGDClassifier(loss='hinge', penalty='elasticnet', fit_intercept=True)


# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
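The snippet ends at the comment. In the scikit-learn example this code follows, the continuation samples hyperparameter distributions and hands them to `RandomizedSearchCV` (reconstructed, so treat the exact budgets as assumptions):

param_dist = {
    'average': [True, False],
    'l1_ratio': stats.uniform(0, 1),
    'alpha': loguniform(1e-4, 1e0),
}

n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidate"
      " parameter settings." % (time() - start, n_iter_search))
report(random_search.cv_results_)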
Code Example #43
                             norm="l2",
                             tokenizer=lambda x: x.split(),
                             sublinear_tf=False,
                             ngram_range=(1, 3))
x_train_multilabel = vectorizer.fit_transform(x_train['question'])
x_test_multilabel = vectorizer.transform(x_test['question'])
print("Time taken to run this cell :", datetime.now() - start)

print("Dimensions of train data X:", x_train_multilabel.shape, "Y :",
      y_train.shape)
print("Dimensions of test data X:", x_test_multilabel.shape, "Y:",
      y_test.shape)

# This function is compute heavy and takes 6-7 hours to run.
classifier = OneVsRestClassifier(SGDClassifier(loss='log',
                                               alpha=0.00001,
                                               penalty='l1',
                                               n_jobs=-1),
                                 n_jobs=-1)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

print("accuracy :", metrics.accuracy_score(y_test, predictions))
print("macro f1 score :", metrics.f1_score(y_test,
                                           predictions,
                                           average='macro'))
print("micro f1 scoore :",
      metrics.f1_score(y_test, predictions, average='micro'))
print("hamming loss :", metrics.hamming_loss(y_test, predictions))
print("Precision recall report :\n",
      metrics.classification_report(y_test, predictions))
Code Example #44
# SVM
clf_svm = SVC(gamma='scale', random_state=random_state)

# GB
clf_gb = GradientBoostingClassifier(random_state=random_state,
                                    loss='deviance',
                                    learning_rate=0.025,
                                    n_estimators=200)

# KNN
clf_knn = KNeighborsClassifier(n_neighbors=number_of_neighbors)

# SGD
clf_sgd = SGDClassifier(loss='hinge',
                        penalty='l2',
                        max_iter=1000,
                        random_state=random_state,
                        tol=None)


def classify(X, y, clf):
    y = y.astype('int')
    model = ExtraTreesClassifier()
    model.fit(X, y)
    print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_, index=X.columns)
    feat_importances.nlargest(10).plot(kind='barh')
    plt.show()

    # get correlations of each features in dataset
    corrmat = X.corr()
Code Example #45
def run_models(features_name,
               model_list,
               best_model,
               X_train,
               X_test,
               y_train,
               y_test,
               random_state=42):
    # Set random state
    random_state = random_state

    # Convenience translation dictionary for printing
    model_dict = {
        'lr': 'Logistic Regression',
        'sgd': 'Stochastic Gradient Descent',
        'rf': 'Random Forest',
        'dnn': 'Dense Neural Network'
    }

    # Dictionary of pre-determined hyperparameters for models
    hyperparams_dict = {
        'tfidf': {
            'lr': {
                'C': 30.0,
                'class_weight': None,
                'solver': 'newton-cg'
            },
            'sgd': {
                'tol': 1e-3,
                'max_iter': 1000,
                'penalty': 'l1'
            },
            'rf': {
                'bootstrap': False,
                'n_estimators': 200,
                'max_depth': 35,
                'max_features': 'sqrt',
                'min_samples_leaf': 1,
                'min_samples_split': 10,
            }
        },
        'doc2vec': {
            'lr': {
                'C': 0.01,
                'class_weight': 'balanced',
                'solver': 'sag'
            },
            'sgd': {
                'tol': 1e-3,
                'max_iter': 1000,
                'penalty': 'l1'
            },
            'rf': {
                'bootstrap': True,
                'n_estimators': 230,
                'max_depth': 35,
                'max_features': 'auto',
                'min_samples_leaf': 4,
                'min_samples_split': 10,
            }
        }
    }

    # Iterate over model_list
    for model_type in model_list:

        # Logistic Regression fit model
        if model_type == 'lr':
            clf = LogisticRegression(
                C=hyperparams_dict[features_name][model_type]['C'],
                class_weight=hyperparams_dict[features_name][model_type]
                ['class_weight'],
                solver=hyperparams_dict[features_name][model_type]['solver'],
                n_jobs=-1,
                random_state=random_state)

        elif model_type == 'sgd':
            clf = SGDClassifier(
                tol=hyperparams_dict[features_name][model_type]['tol'],
                max_iter=hyperparams_dict[features_name][model_type]
                ['max_iter'],
                penalty=hyperparams_dict[features_name][model_type]['penalty'],
                n_jobs=-1,
                random_state=random_state)

        elif model_type == 'rf':
            clf = RandomForestClassifier(
                max_features=hyperparams_dict[features_name][model_type]
                ['max_features'],
                min_samples_leaf=hyperparams_dict[features_name][model_type]
                ['min_samples_leaf'],
                n_estimators=hyperparams_dict[features_name][model_type]
                ['n_estimators'],
                bootstrap=hyperparams_dict[features_name][model_type]
                ['bootstrap'],
                min_samples_split=hyperparams_dict[features_name][model_type]
                ['min_samples_split'],
                max_depth=hyperparams_dict[features_name][model_type]
                ['max_depth'],
                n_jobs=-1,
                random_state=random_state)
        else:
            raise ValueError("No model type provided")

        # Fit classifier
        print('{} - {}'.format(features_name, model_dict[model_type]))
        clf.fit(X_train, y_train)

        # predictions and evaluations
        predicted = clf.predict(X_test)
        accuracy = evaluate_model(predicted, y_test)

        # Update best performing model if necessary
        if accuracy > best_model['accuracy']:
            best_model['accuracy'] = accuracy
            best_model['model'] = clf
            best_model['type'] = model_type
            best_model['predictions'] = predicted
            best_model['features'] = features_name

    # Return best model and type
    return best_model
Code Example #46
File: analyze.py  Project: xyfJASON/Comment-Analysis
def train_SGD(training_set):
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    return SGDClassifier_classifier.train(training_set)
Code Example #47
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline(steps=[(
    'select',
    VarianceThreshold()), ('standardize',
                           StandardScaler()), ('classify', SGDClassifier())])

param_grid = {
    'classify__random_state': [0],
    'classify__class_weight': ['balanced'],
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': 10.0**np.linspace(-3, 1, 10),
    'classify__l1_ratio': [0.15],
}

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           n_jobs=-1,
                           scoring='roc_auc')
Code Example #48
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None

    return docs, y


from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer_)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print("Accuracy: {:.3f}".format(clf.score(X_test, y_test)))
Code Example #49
brand_labels_test = labels[1]
model_labels_train = labels[2]
model_labels_test = labels[3]
del labels

brand_labels_train = brands_onehot.transform(brand_labels_train).toarray()
brand_labels_test = brands_onehot.transform(brand_labels_test).toarray()
model_labels_train = models_onehot.transform(model_labels_train).toarray()
model_labels_test = models_onehot.transform(model_labels_test).toarray()



# train and tune 1st order model parameters via grid search
classifier_brands = SGDClassifier(
	loss = 'log', 
	penalty = 'elasticnet', 
	random_state = 0, 
	verbose = 1, 
	n_jobs = 3)
classifier_models = SGDClassifier(
	loss = 'log', 
	penalty = 'elasticnet', 
	random_state = 0, 
	verbose = 1, 
	n_jobs = 3)
regressor_brands = SGDRegressor(
	loss = 'squared_loss', 
	penalty = 'elasticnet', 
	random_state = 0, 
	verbose = 1)
regressor_models = SGDRegressor(
	loss = 'squared_loss', 
Code Example #50
File: sentiment.py  Project: karansaini282/ColumbiaAI
                spamwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                                        quoting=csv.QUOTE_MINIMAL,
                                        skipinitialspace=True)
                spamwriter.writerow([str(c), str3, '0'])
            file.close()
            c += 1
            break

train = pd.read_csv("imdb_tr.csv", header=0, delimiter=",", encoding='utf-8')
vectorizer1 = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                              stop_words=None, ngram_range=(1, 1))
vectorizer2 = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                              stop_words=None, ngram_range=(1, 2))
vectorizer3 = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                              stop_words=None, ngram_range=(1, 1))
vectorizer4 = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                              stop_words=None, ngram_range=(1, 2))
train_data_features1 = vectorizer1.fit_transform(train["text"])
train_data_features2 = vectorizer2.fit_transform(train["text"])
train_data_features3 = vectorizer3.fit_transform(train["text"])
train_data_features4 = vectorizer4.fit_transform(train["text"])
clf1 = SGDClassifier(loss="hinge", penalty="l1")
clf1.fit(train_data_features1, train["polarity"])
clf2 = SGDClassifier(loss="hinge", penalty="l1")
clf2.fit(train_data_features2, train["polarity"])
clf3 = SGDClassifier(loss="hinge", penalty="l1")
clf3.fit(train_data_features3, train["polarity"])
clf4 = SGDClassifier(loss="hinge", penalty="l1")
clf4.fit(train_data_features4, train["polarity"])

test = pd.read_csv("../resource/asnlib/public/imdb_te.csv", encoding='latin-1', header=0)
clean_test = []
for i in range(len(test['text'])):
    arr1 = [w for w in re.split(r'\W', test['text'][i]) if w]
    str2 = " ".join(str(x) for x in arr1)
    str2 = str2.lower()
    str3 = ' '.join([word for word in str2.split() if word not in stopArr])
Code example #51
File: or.py  Project: MingJerry/Guide
# scikit-learn ships a mature Pipeline workflow, a handy tool for assembling machine-learning models quickly
bayes_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())
                      ])
bayes_clf.fit(x_train, y_train)
""" Predict the test dataset using Naive Bayes"""
predicted = bayes_clf.predict(x_test)
print('Naive Bayes correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
# Print the F1 score, precision, recall, and related metrics
print(metrics.classification_report(y_test, predicted, target_names=categories))

""" Support Vector Machine (SVM) classifier"""
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)),
                    ])
svm_clf.fit(x_train, y_train)
predicted = svm_clf.predict(x_test)
print('SVM correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
print(metrics.classification_report(y_test, predicted, target_names=categories))
# Print the confusion matrix
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, predicted))
print('\n')

""" 10-折交叉验证 """
clf_b = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
clf_s = make_pipeline(CountVectorizer(), TfidfTransformer(),
                      SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))
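The docstring announces 10-fold cross-validation, but the snippet stops after building the pipelines; a plausible continuation (assumed):

from sklearn.model_selection import cross_val_score

for name, model in [('Naive Bayes', clf_b), ('SVM', clf_s)]:
    scores = cross_val_score(model, x_train, y_train, cv=10)
    print('%s 10-fold accuracy: %.4f (+/- %.4f)' % (name, scores.mean(), scores.std()))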
Code example #52
# Build a bidirectional mapping between cuisine names and integer ids
for cuisine in train_labels.values():
    if cuisine not in label_map:
        label_map[cuisine] = label_int
        label_map[label_int] = cuisine
        label_int += 1
# pprint(label_map)
for k, v in train_labels.items():
    train_labels[k] = label_map[v]

# train and predict with a linear SGD model on either raw counts or tf-idf features
if vectorizer == 'count':
    pipe_svm = Pipeline([('vectorizer', CountVectorizer()),
                         ('classifier',
                          SGDClassifier(loss='log',
                                        penalty='l2',
                                        alpha=1e-3,
                                        n_iter=5,
                                        random_state=42))])
elif vectorizer == 'tfidf':
    pipe_svm = Pipeline([('vectorizer', TfidfVectorizer()),
                         ('classifier',
                          SGDClassifier(loss='log',
                                        penalty='l2',
                                        alpha=1e-3,
                                        n_iter=5,
                                        random_state=42))])

training_data = []
training_labels = []
for data, label in zip(train_data.values(), train_labels.values()):
    training_data.append(data)
    training_labels.append(label)  # assumed: the original snippet is truncated mid-loop
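A plausible continuation, fitting the selected pipeline on the assembled lists (assumed):

pipe_svm.fit(training_data, training_labels)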
Code example #53
File: multiclass.py  Project: jx2181/Parallel-SGD
        time_new_whole = time.time() - start

        time_new = times_sgd[i - 1] + time_new
        times_sgd.append(time_new)

        accuracy = accuracy_test(X_test, y_test, weights)
        accuracies.append(accuracy)

        # print(weights.shape)

        i += 1

    # SGDClassifier baseline for comparison
    sgd_best = SGDClassifier(loss='log',
                             penalty='none',
                             tol=0.0,
                             fit_intercept=False,
                             eta0=0.01,
                             learning_rate='constant')

    param_range = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    # param_range = range(1, 10)

    times = []
    train_scores = []
    test_scores = []
    for iteration in param_range:
        sgd_temp = SGDClassifier(loss='log',
                                 penalty='none',
                                 tol=0.000001,
                                 eta0=0.01,
                                 learning_rate='constant',
                                 max_iter=iteration)  # assumed completion: the snippet is cut off here
Code example #54
    # Version 5: trying multi-label after all, an hour before the deadline
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MultiLabelBinarizer

    y_multilabel = MultiLabelBinarizer().fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y_multilabel, test_size=0.4, random_state=0)

    print("Train:", len(y_train))
    print("Test:", len(y_test))
    print("Overall:", len(y_multilabel))

    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OneVsRestClassifier
    
    sgd = SGDClassifier(random_state=42) #loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None)
    lr = LogisticRegression()
    clf = OneVsRestClassifier(lr)
    
    
    from skmultilearn.problem_transform import BinaryRelevance
    from sklearn.svm import SVC
    # clf = BinaryRelevance(classifier=SVC(kernel='linear', C=minC, random_state=241), require_dense=[False, True])

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test[1])
    print(y_pred[0])    

    print("Время обучения модели: ", (time.time() - start_time))

    with open(f"{pre_path}finalmodel{model_version}.pkl", 'wb') as f:
Code example #55
def test_multi_output_classification_partial_fit_no_first_classes_exception():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    assert_raises_regex(
        ValueError, "classes must be passed on the first call "
        "to partial_fit.", multi_target_linear.partial_fit, X, y)
Code example #56
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

from mylib.plotdregion import plot_decision_region

if __name__ == '__main__':
    iris = datasets.load_iris()
    X = iris.data[:, [2, 3]]
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0)

    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # ml = Perceptron(eta0=0.01, max_iter=40, tol=0,  random_state=0)
    # ml = LogisticRegression(C=1000.0, random_state=0)
    # ml = SVC(kernel='linear', C=1.0, random_state=0)
    # ml = SGDClassifier(loss = 'perceptron')
    # ml = SGDClassifier(loss='log')
    ml = SGDClassifier(loss='hinge')

    ml.fit(X_train_std, y_train)
    y_pred = ml.predict(X_test_std)
    print('Total test samples: %d, misclassified: %d' % (len(y_test), (y_test != y_pred).sum()))
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

    X_combined_std = np.vstack((X_train_std, X_test_std))
    y_combined = np.hstack((y_train, y_test))
    plot_decision_region(X=X_combined_std, y=y_combined, classifier=ml,
                         test_idx=range(105, 150), title='scikit-learn SVM')
Code example #57
                  (KNeighborsClassifier(n_neighbors=10),
                   "kNN"), (RandomForestClassifier(n_estimators=100), "RF")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(
        benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(
        benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(
    benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
Code example #58

warnings.filterwarnings("ignore")

TDATA = pd.read_csv('Data.csv')

X_train, x_test, y_train, y_test = train_test_split(TDATA.COMBINED,
                                                    TDATA.FLAIR,
                                                    test_size=0.3,
                                                    random_state=7)

LSVM = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                 ('clf',
                  SGDClassifier(loss='hinge',
                                penalty='l2',
                                alpha=0.0001,
                                random_state=42,
                                max_iter=150,
                                tol=None))])

LSVM.fit(X_train, y_train)

REDDIT = praw.Reddit(user_agent='redditflairdetector',
                     client_id='PY-6WwMrA9O48Q',
                     client_secret='rwwa13TTlmWYSeD8D9_kW13r6UE')

SUBREDDIT = REDDIT.subreddit('india')

SAVED_MODEL = pickle.dumps(LSVM)
LOADED_MODEL = pickle.loads(SAVED_MODEL)

print(
Code example #59
    parser.add_argument('test', type=load_npz, help='Test features (npz)')
    parser.add_argument('output', help='Output label predictions (npz)')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()

    print "Loading and preparing data"
    X = prepare_features(args.train)
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    Y = args.labels['labels']

    print "Training classifier"
    clfs = [SGDClassifier(loss='log') for _ in Y.T]
    for clf, y in zip(clfs, Y.T):
        try:
            clf.fit(X, y)
        except:
            # fit can fail, e.g. when a label column contains only one class
            pass
    del X, Y

    print "Predicting"
    X = scaler.transform(prepare_features(args.test))
    p = []
    for clf in clfs:
        try:
            p.append(clf.predict_proba(X)[:, 0])
        except:
            # classifiers that failed to fit contribute an all-zero column
            p.append(np.zeros(len(X)))
Code example #60
def define_clfs_params():
    '''
    Defines all relevant parameters and classes for classfier objects.
    '''
    clfs = {
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1,
                                   criterion='entropy'),
        # max_depth must be a scalar here; a grid of depths belongs in `params`
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                 algorithm="SAMME",
                                 n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05,
                                         subsample=0.5,
                                         max_depth=6,
                                         n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss='log', penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=3)
    }
    params = {
        'RF': {
            'n_estimators': [1, 10, 100, 1000],
            'max_depth': [10, 15, 20, 30, 40, 50, 60, 70, 100],
            'max_features': ['sqrt', 'log2'],
            'min_samples_split': [2, 5, 10],
            'random_state': [1]
        },
        'LR': {
            'penalty': ['l1', 'l2'],
            'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
            'random_state': [1]
        },
        'SGD': {
            'loss': ['log', 'perceptron'],
            'penalty': ['l2', 'l1', 'elasticnet'],
            'random_state': [1]
        },
        'ET': {
            'n_estimators': [1, 10, 100, 1000],
            'criterion': ['gini', 'entropy'],
            'max_depth': [1, 3, 5, 10, 15],
            'max_features': ['sqrt', 'log2'],
            'min_samples_split': [2, 5, 10],
            'random_state': [1]
        },
        'AB': {
            'algorithm': ['SAMME', 'SAMME.R'],
            'n_estimators': [1, 10, 100, 1000],
            'random_state': [1]
        },
        'GB': {
            'n_estimators': [1, 10, 100, 1000],
            'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
            'subsample': [0.1, 0.5, 1.0],
            'max_depth': [1, 3, 5, 10, 20, 50, 100],
            'random_state': [1]
        },
        'NB': {},
        'DT': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [15, 20, 30, 40, 50],
            'max_features': ['sqrt', 'log2'],
            'min_samples_split': [2, 5, 10],
            'random_state': [1]
        },
        'SVM': {
            'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
            'kernel': ['linear'],
            'random_state': [1]
        },
        'KNN': {
            'n_neighbors': [1, 5, 10, 25, 50, 100],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
    }

    return clfs, params
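These two dictionaries are designed to be consumed together by a grid search; a minimal sketch of that pairing (the data names are hypothetical):

from sklearn.model_selection import GridSearchCV

clfs, params = define_clfs_params()
for name in ('SGD', 'DT'):
    search = GridSearchCV(clfs[name], params[name], cv=3)
    search.fit(X_train, y_train)  # X_train / y_train are assumed to exist
    print(name, search.best_params_, search.best_score_)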