Code Example #1
# Assumed imports for this snippet; NumFeatureAdder is a custom transformer
# defined elsewhere in the source project.
import numpy as np
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from pyspark import SparkContext
from spark_sklearn import GridSearchCV


def grid_search_svm(X_train, y_train, X_test, ngrams, n_split, svm_choice='linear',
                    tfidf_choice=False, nums_train=None, nums_test=None):
    svm = None
    grid = None

    # Build the parameter grid for the chosen SVM variant.
    if svm_choice == 'linear':
        svm = LinearSVC()
        c_array = np.logspace(1., 4., num=4)
        if tfidf_choice:
            grid = {'vect__ngram_range': ngrams,
                    'tfidf__use_idf': (True, False),
                    'clf__C': c_array.tolist()}
        else:
            grid = {'vect__ngram_range': ngrams,
                    'clf__C': c_array.tolist()}

    elif svm_choice == 'svc':
        svm = SVC()
        c_array = np.logspace(-3., 6., num=10)
        g_array = np.logspace(-3., 3., num=7)
        if tfidf_choice:
            grid = {'vect__ngram_range': ngrams,
                    'tfidf__use_idf': (True, False),
                    'clf__kernel': ['rbf'],
                    'clf__C': c_array.tolist(),
                    'clf__gamma': g_array.tolist()}
        else:
            grid = {'vect__ngram_range': ngrams,
                    'clf__kernel': ['rbf'],
                    'clf__C': c_array.tolist(),
                    'clf__gamma': g_array.tolist()}

    # Assemble the pipeline; optionally append numeric features after vectorization.
    if isinstance(nums_train, np.ndarray) and isinstance(nums_test, np.ndarray):
        if tfidf_choice:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('tfidf', TfidfTransformer(smooth_idf=False)),
                                     ('numfeat', NumFeatureAdder(nums_train, nums_test)),
                                     ('clf', svm)])
        else:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('numfeat', NumFeatureAdder(nums_train, nums_test)),
                                     ('clf', svm)])
    else:
        if tfidf_choice:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('tfidf', TfidfTransformer(smooth_idf=False)),
                                     ('clf', svm)])
        else:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('clf', svm)])
    print(clf_pipeline.get_params().keys())

    # Distribute the grid search over Spark, with n_split CV folds.
    sc = SparkContext.getOrCreate()
    grid_search = GridSearchCV(sc, clf_pipeline, grid, n_jobs=-1, cv=n_split)
    grid_search.fit(X_train, y_train)
    grid_search_predicted = grid_search.predict(X_test)

    return grid_search_predicted
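
A minimal smoke test for the helper above, assuming the imports shown; the toy documents and labels are hypothetical stand-ins, and SparkContext.getOrCreate() inside the function will spin up a local context if none exists.

docs_train = ["good movie", "bad plot", "great acting", "terrible pacing"]
labels_train = [1, 0, 1, 0]
docs_test = ["good acting", "bad pacing"]

# Search unigrams vs. unigrams+bigrams with 2-fold CV on the toy corpus.
predicted = grid_search_svm(docs_train, labels_train, docs_test,
                            ngrams=[(1, 1), (1, 2)], n_split=2,
                            svm_choice='linear', tfidf_choice=True)
print(predicted)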
Code Example #2
# Assumed imports for this snippet.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


class Classifier:
    def __init__(self):
        self.model = None

    def preprocess(self, data):
        '''Optional preprocessing: bag-of-words features from raw text.'''
        return CountVectorizer().fit_transform(data)

    def train(self, X, y, method="rf"):
        # Pick the estimator and a matching grid; the SVM grid values
        # here are illustrative assumptions.
        if method == "svm":
            obj = SVC()
            param_grid = {"C": [1, 10], "kernel": ["linear", "rbf"]}
        else:
            obj = RandomForestClassifier()
            param_grid = {
                "max_depth": [6, None],
                "max_features": [5, 10, 20],
            }
        self.model = GridSearchCV(obj, param_grid=param_grid)
        self.model.fit(X, y)

    def predict(self, X):
        if self.model is None:
            return None
        return self.model.predict(X)
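
A brief smoke test for this Classifier on hypothetical toy data; GridSearchCV's default 5-fold CV needs at least five examples per class, and the vocabulary has to reach the max_features=20 grid value.

spam = ["win money fast today", "free offer click now", "cheap meds online sale",
        "claim your prize instantly", "urgent wire transfer request"]
ham = ["meeting at noon tomorrow", "lunch with the project team",
       "quarterly report attached here", "schedule review next week",
       "notes from the daily standup"]

clf = Classifier()
X = clf.preprocess(spam + ham)
y = [1] * len(spam) + [0] * len(ham)
clf.train(X, y, method="rf")
print(clf.predict(X))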
Code Example #3
# In[99]:

# Assumed imports: this notebook excerpt relies on an existing SparkSession
# (`spark`) and on X_timetrain / Y_timetrain_arr / X_timetest / Y_timetest_arr
# from earlier cells.
from sklearn import ensemble, metrics
from sklearn.neighbors import KNeighborsClassifier
from spark_sklearn import GridSearchCV

tuned_parameters = {
    "n_estimators": [100],
    "max_depth": [3],
    "learning_rate": [0.1],
}
gbc = ensemble.GradientBoostingClassifier()
clf = GridSearchCV(spark.sparkContext, gbc, tuned_parameters)
clf


# In[100]:

clf.fit(X_timetrain, Y_timetrain_arr)
clftest_pred = clf.predict(X_timetest)
print("Accuracy is ", metrics.accuracy_score(Y_timetest_arr, clftest_pred) * 100, "%")


# In[101]:

knn1 = KNeighborsClassifier()
knn_params = {
    "n_neighbors": [31]
}
clf2 = GridSearchCV(spark.sparkContext, knn1, knn_params, n_jobs=2)
clf2


Code Example #5
# (Snippet is truncated here; these archive paths are presumably being
# appended to sys.path so that the local pyspark install can be imported.)
    SPARK_HOME + 'python/lib/pyspark.zip',
    SPARK_HOME + 'python/lib/py4j-0.10.1-src.zip']
)

from pyspark import SparkContext
from pyspark import SparkConf


if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("local[3]")
    # To target a specific master machine, give its address and port:
    # conf.setMaster("spark://jdwang-HP:7077")
    conf.setAppName("spark_test")
    # Other properties can be set here as well:
    # conf.set("spark.executor.memory", "12g")
    sc = SparkContext(conf=conf)
    # Test: distribute a small scikit-learn grid search over Spark
    from sklearn import svm, datasets
    from spark_sklearn import GridSearchCV

    iris = datasets.load_iris()
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    svr = svm.SVC()
    clf = GridSearchCV(sc, svr, parameters)
    clf.fit(iris.data, iris.target)
    print(clf.best_params_)
    print(clf.predict(iris.data))

# `start_time` and the `time` import are defined earlier in the original file.
end_time = time.time()
print('running time is %ds' % (end_time - start_time))
Code Example #6
# Assumed imports for this snippet; `conf` is a SparkConf built earlier
# in the original file.
from time import time
from sklearn import svm, metrics
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import numpy as np
from pyspark import SparkContext
from spark_sklearn import GridSearchCV

sc = SparkContext(conf=conf)

digits = load_digits()
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
X_train, X_test, y_train, y_test = train_test_split(data, digits.target, test_size=0.3, random_state=0)

svc = svm.SVC()

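# 4 kernels x 10 gammas x 10 Cs x 10 tols = 4,000 parameter combinations
# (times the CV folds), which is why the fits are distributed over Spark.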
hyperparam_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': np.linspace(0.001, 0.01, num=10),
    'C': np.linspace(1, 10, num=10),
    'tol': np.linspace(0.01, 0.1, 10)
}

classifier = GridSearchCV(sc, svc, hyperparam_grid)

start = time()
classifier.fit(X_train, y_train)
elapsed = time() - start

print('elapsed: {} seconds'.format(elapsed))

print('Best Kernel:\t{}'.format(classifier.best_estimator_.kernel))
print('Best Gamma:\t{}'.format(classifier.best_estimator_.gamma))
print('Best C:\t\t{}'.format(classifier.best_estimator_.C))

y_pred = classifier.predict(X_test)
print('Accuracy:\t{:.1%}'.format(metrics.accuracy_score(y_test, y_pred)))
Code Example #7
File: predict_MLib.py  Project: yennanliu/analysis
# python 2.7 


#  import pyspark library 
from pyspark import SparkConf, SparkContext

# spark_sklearn provides the same API as sklearn but uses Spark MLLib 
# under the hood to perform the actual computations in a distributed way 
# (passed in via the SparkContext instance).
from spark_sklearn import GridSearchCV
# import ML library
from sklearn import svm, datasets


sc = SparkContext()


iris = datasets.load_iris()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svr = svm.SVC()
clf = GridSearchCV(sc, svr, parameters)
clf.fit(iris.data, iris.target)
print ("==================")
print (clf.predict(iris.data))
print ("==================")
Code Example #8
# Assumed imports; `sc`, `exec_config`, `iter_list`, and the train/test
# arrays come from earlier cells of the original notebook.
import numpy as np
from pyspark.sql import SparkSession
from sklearn.neural_network import MLPClassifier
from spark_sklearn import GridSearchCV

#### CONFIG 1 ########
# (the snippet's opening lines are cut off; this block mirrors CONFIG 2 below)
j = exec_config[0]
print('-----------------  Config = ', j, ' -------------------------')
conf = sc._conf.setAll([('spark.executor.memory', j[0]),
                        ('spark.executor.cores', j[1]),
                        ('spark.executor.instances', j[2])])
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(sc._conf.getAll())

for i in iter_list:
    print('--------------------Iterations = ', i, '-----------------------')
    param_grid = {
        "solver": ["sgd"],
        "max_iter": [i],
        "hidden_layer_sizes": [(100, 10)],
    }
    gs = GridSearchCV(sc, estimator=MLPClassifier(), param_grid=param_grid)
    print('Time info for iterations = ', i)
    get_ipython().run_line_magic('time', 'gs.fit(train, y_train)')

    preds = gs.predict(test)
    print('Accuracy=', np.sum(y_test == preds) * 100 / len(y_test), '%')

#### CONFIG 2 ########
j = exec_config[1]
print('-----------------  Config = ', j, ' -------------------------')
conf = sc._conf.setAll([('spark.executor.memory', j[0]),
                        ('spark.executor.cores', j[1]),
                        ('spark.executor.instances', j[2])])
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(sc._conf.getAll())

for i in iter_list:
    print('--------------------Iterations = ', i, '-----------------------')
    param_grid = {
        "solver": ["sgd"],