def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))
    clf = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='entropy',
                                 max_depth=29008, max_features=36, max_leaf_nodes=None,
                                 min_samples_leaf=5, min_samples_split=3,
                                 min_weight_fraction_leaf=0.0, n_estimators=4494, n_jobs=8,
                                 oob_score=False, random_state=979271, verbose=0,
                                 warm_start=False)
    clf.fit(train_x, train_y)
    ccv = CalibratedClassifierCV(base_estimator=clf, method="sigmoid", cv="prefit")
    ccv.fit(valid_x, valid_y)
    valid_predictions = ccv.predict_proba(valid_x)
    test_predictions = ccv.predict_proba(test_x)
    loss = test(valid_y, valid_predictions, True)
    if loss < 0.52:
        data.saveData(valid_predictions, "../valid_results/valid_" + str(model_id) + ".csv")
        data.saveData(test_predictions, "../results/results_" + str(model_id) + ".csv")
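Several of these snippets reuse the same pattern: fit a model, then wrap it in CalibratedClassifierCV with cv='prefit' and calibrate on held-out data. The sketch below is a minimal, self-contained illustration of that pattern on synthetic data; it is not part of the original pipeline and assumes the same scikit-learn generation as the snippets (the older base_estimator keyword).

# Stand-alone illustration of the prefit calibration pattern (synthetic data only;
# assumes an older scikit-learn where CalibratedClassifierCV takes `base_estimator`).
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=0)

rf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_train, y_train)
ccv = CalibratedClassifierCV(base_estimator=rf, method="sigmoid", cv="prefit")
ccv.fit(X_valid, y_valid)              # calibrate on data the forest never saw
calibrated_proba = ccv.predict_proba(X_valid)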
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    rf = RandomForestClassifier(n_jobs=8)
    param_dist = {
        "n_estimators": sp_randint(100, 300),
        "criterion": ["gini"],
        # "max_depth": sp_randint(3, 10000),
        # "min_samples_split": sp_randint(1, 300),
        # "min_samples_leaf": sp_randint(1, 300),
        "max_features": sp_randint(10, 26),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }
    clf = RandomizedSearchCV(rf, param_distributions=param_dist,
                             n_iter=50, cv=10, scoring='roc_auc')
    clf.fit(train_x, train_y)
    valid_predictions = clf.predict_proba(valid_x)[:, 1]
    test_predictions = clf.predict_proba(test_x)[:, 1]
    loss = roc_auc_score(valid_y, valid_predictions)
    print('loss:')
    print(loss)
    print(clf.best_estimator_)
    data.saveData(valid_id, valid_predictions, "./valid_results/valid_" + str(model_id) + ".csv")
    data.saveData(test_id, test_predictions, "./results/results_" + str(model_id) + ".csv")
def createSignalFromWav(file_wav_name, file_data_name):
    try:
        with wave.open(file_wav_name, mode='rb') as wav:
            sample_rate = wav.getframerate()
            n_frame = wav.getnframes()
            # Audio duration
            duration = n_frame / sample_rate
            # Raw data buffer
            dataBuffer = wav.readframes(n_frame)
            # Unpack the bytes into 2-byte (short) samples
            DataGenerator = struct.unpack("%ih" % (n_frame * wav.getnchannels()), dataBuffer)
            # Convert 16-bit wav samples to floats in [-1.0, 1.0]
            DataGenerator = [float(value) / pow(2, 15) for value in DataGenerator]
            # Build the signal object and save it to a file
            dataSignal = DataSignal(sample_rate, duration, len(DataGenerator), DataGenerator)
            data.saveData(file_data_name, dataSignal)
            return True
    except Exception:
        return False
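A hypothetical call site for createSignalFromWav; the file names below are placeholders, not paths from the original project.

# Hypothetical usage; "example.wav" and "example.signal" are placeholder names.
if createSignalFromWav("example.wav", "example.signal"):
    print("signal extracted and saved")
else:
    print("could not read the wav file")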
def saveSettings(self):
    settings.settings.setValue("data/MaximumURLLength", self.maximumURLLength.value())
    # settings.settings.setValue("data/MaximumCacheSize", self.maximumCacheSize.value())
    settings.settings.setValue("data/RememberHistory", self.rememberHistoryToggle.isChecked())
    settings.settings.setValue("network/GeolocationEnabled", self.geolocationToggle.isChecked())
    data.geolocation_whitelist = [self.geolocationWhitelist.item(authority).text()
                                  for authority in range(0, self.geolocationWhitelist.count())]
    data.geolocation_blacklist = [self.geolocationBlacklist.item(authority).text()
                                  for authority in range(0, self.geolocationBlacklist.count())]
    data.saveData()
def prepareQuit():
    try:
        os.remove(settings.crash_file)
    except OSError:
        pass
    common.downloadManager.saveSession()
    saveSession()
    settings.settings.hardSync()
    data.saveData()
    data.data.hardSync()
    filtering.adblock_filter_loader.quit()
    filtering.adblock_filter_loader.wait()
    server_thread.httpd.shutdown()
    server_thread.quit()
    server_thread.wait()
def trainrf(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))
    clf = RandomForestClassifier(n_estimators=random.randint(50, 5000),
                                 criterion='gini',
                                 max_depth=random.randint(10, 1000),
                                 min_samples_split=random.randint(2, 50),
                                 min_samples_leaf=random.randint(1, 10),
                                 min_weight_fraction_leaf=random.uniform(0.0, 0.5),
                                 max_features=random.uniform(0.1, 1.0),
                                 max_leaf_nodes=random.randint(2, 10),  # sklearn requires max_leaf_nodes > 1 (or None)
                                 bootstrap=False,
                                 oob_score=False,
                                 n_jobs=30,
                                 random_state=random_state,
                                 verbose=0,
                                 warm_start=True,
                                 class_weight=None)
    clf.fit(train_x, train_y)
    valid_predictions1 = clf.predict_proba(valid_x)
    test_predictions1 = clf.predict_proba(test_x)
    t1 = test(valid_y, valid_predictions1)
    ccv = CalibratedClassifierCV(base_estimator=clf, method="sigmoid", cv='prefit')
    ccv.fit(valid_x, valid_y)
    valid_predictions2 = ccv.predict_proba(valid_x)
    test_predictions2 = ccv.predict_proba(test_x)
    t2 = test(valid_y, valid_predictions2)
    if t2 < t1:
        valid_predictions = valid_predictions2
        test_predictions = test_predictions2
        t = t2
    else:
        valid_predictions = valid_predictions1
        test_predictions = test_predictions1
        t = t1
    if t < 0.450:
        data.saveData(valid_predictions, "../valid_results/valid_" + str(model_id) + ".csv")
        data.saveData(test_predictions, "../results/results_" + str(model_id) + ".csv")
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    # normalization
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    valid_x = scaler.transform(valid_x)
    test_x = scaler.transform(test_x)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))
    clf = LogisticRegression(penalty='l2',
                             dual=False,
                             tol=0.0001,
                             C=1.0,
                             fit_intercept=True,
                             intercept_scaling=1,
                             class_weight=None,
                             random_state=None,
                             solver='lbfgs',
                             max_iter=1000,
                             multi_class='ovr',
                             verbose=True)
    clf.fit(train_x, train_y)
    valid_predictions = clf.predict_proba(valid_x)
    test(valid_y, valid_predictions)
    ccv = CalibratedClassifierCV(base_estimator=clf, method="sigmoid", cv='prefit')
    ccv.fit(train_x, train_y)
    valid_predictions = ccv.predict_proba(valid_x)
    test(valid_y, valid_predictions)
    test_predictions = ccv.predict_proba(test_x)
    data.saveData(valid_predictions, "../valid_results/valid_" + str(model_id) + ".csv")
    data.saveData(test_predictions, "../results/results_" + str(model_id) + ".csv")
def signalGenerator(file_name, signalGen):
    try:
        # List holding the samples to be saved
        DataGenerator = []
        # FS -> SAMPLE FRAME
        # Needed to generate the signal's sampling:
        # every sample_rate samples correspond to one cycle of the generated signal.
        FS = 1.0 / signalGen.sample_rate
        # The formula used to generate the signal is
        #   magnitude * sin(frequency * 2 * PI * FS * x + phase)
        # where x is the sample index; as noted above, the signal repeats every sample_rate * FS.
        for g in range(0, signalGen.sample_rate * signalGen.duration):
            DataGenerator.append(0)
            for i in range(0, len(signalGen.frequencies)):
                # Magnitude is optional, so default to 1 when no value is given
                if i >= len(signalGen.magnetude):
                    signalGen.magnetude.append(1)
                # Phase is optional, so default to 0 degrees when no value is given
                if i >= len(signalGen.phases):
                    signalGen.phases.append(0)
                DataGenerator[g] += signalGen.magnetude[i] * math.sin(
                    signalGen.frequencies[i] * 2 * math.pi * g * FS
                    + (signalGen.phases[i] * math.pi / 180))
        # Build the signal object and save it to the data file
        dataSignal = DataSignal(signalGen.sample_rate, signalGen.duration,
                                len(DataGenerator), DataGenerator)
        data.saveData(file_name, dataSignal)
        return True
    except Exception:
        return False
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))
    # build a classifier
    clf = RandomForestClassifier(n_jobs=8)
    # specify parameters and distributions to sample from
    param_dist = {
        "n_estimators": sp_randint(20, 40),
        "criterion": ["gini", "entropy"],
        "max_depth": sp_randint(3, 10000),
        "min_samples_split": sp_randint(1, 30),
        "min_samples_leaf": sp_randint(1, 30),
        "max_features": sp_randint(1, 93),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }
    # run randomized search
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=2, cv=9, n_jobs=3)
    random_search.fit(train_x, train_y)
    valid_predictions = random_search.predict_proba(valid_x)
    test_predictions = random_search.predict_proba(test_x)
    loss = test(valid_y, valid_predictions, True)
    if loss < 10.438:
        output = [loss, random_search.best_estimator_]
        print("model[\"" + str(model_id) + "\"]=", end="")
        print(output)
        data.saveData(valid_predictions, "../valid_results/valid_" + str(model_id) + ".csv")
        data.saveData(test_predictions, "../results/results_" + str(model_id) + ".csv")
def trainxgb(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))
    xgb = XGBoostClassifier(base_estimator='gbtree',
                            objective='multi:softprob',
                            metric='mlogloss',
                            num_classes=9,
                            learning_rate=random.uniform(0.01, 0.05),
                            max_depth=random.randint(10, 20),
                            max_samples=random.uniform(0.0, 1.0),
                            max_features=random.uniform(0.0, 1.0),
                            max_delta_step=random.randint(1, 10),
                            min_child_weight=random.randint(1, 10),
                            min_loss_reduction=1,
                            l1_weight=0.0,
                            l2_weight=0.0,
                            l2_on_bias=False,
                            gamma=0.02,
                            inital_bias=random.uniform(0.0, 1.0),
                            random_state=random_state,
                            watchlist=[[valid_x, valid_y]],
                            n_jobs=30,
                            n_iter=3000)
    xgb.fit(train_x, train_y)
    valid_predictions = xgb.predict_proba(valid_x)
    if test(valid_y, valid_predictions) < 0.450:
        test_predictions = xgb.predict_proba(test_x)
        data.saveData(valid_predictions, "../valid_results/valid_" + str(model_id) + ".csv")
        data.saveData(test_predictions, "../results/results_" + str(model_id) + ".csv")
import cv2
from time import sleep

# for triggering the modules
import data as dm
import motor as mm
import webcam as wm
import joystick as jm

maxThrottle = 0.25
motor = mm.Motor(2, 3, 4, 17, 22, 27)
record = 0

while True:
    joyval = jm.getJS()
    steering = joyval['axis1']
    throttle = joyval['o'] * maxThrottle
    if joyval['share'] == 1:
        if record == 0:
            print('Recording Started ...')
        record += 1
        sleep(0.300)
    if record == 1:
        img = wm.getImg(True, size=[240, 120])
        dm.saveData(img, steering)
    elif record == 2:
        dm.saveLog()
        record = 0
    motor.move(throttle, -steering)
    cv2.waitKey(1)
from sklearn.decomposition import NMF
from data import loadData, saveData
import numpy as np

# The most important parameters are n_components, alpha, l1_ratio and solver
nmf = NMF(
    n_components=128,   # k value; by default all features are kept
    # Initialization of W and H: 'random' | 'nndsvd' (default) | 'nndsvda' | 'nndsvdar' | 'custom'
    init=None,
    solver='cd',        # 'cd' | 'mu'
    # {'frobenius', 'kullback-leibler', 'itakura-saito'}
    beta_loss='frobenius',
    tol=1e-10,          # tolerance of the stopping condition
    max_iter=200,       # maximum number of iterations
    random_state=None,
    alpha=0.,           # regularization strength
    l1_ratio=0.,        # L1/L2 regularization mix
    verbose=0,          # verbosity
    shuffle=False       # only used by the 'cd' solver
)

trius = loadData(filename='trius.npy')
X = np.abs(trius)
W = nmf.fit_transform(X)   # fit_transform both fits the model and returns W
H = nmf.components_

print('reconstruction_err_', nmf.reconstruction_err_)  # value of the loss function
print('n_iter_', nmf.n_iter_)                          # actual number of iterations

saveData(W, filename='nmf.npy')
saveData(H, filename='basis.npy')
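Continuing the snippet above, a quick sanity check on the factorization: with beta_loss='frobenius', nmf.reconstruction_err_ is the Frobenius norm of X - W @ H, so recomputing that norm should give roughly the same number.

# Optional sanity check on the factorization from the snippet above:
# for the Frobenius loss, reconstruction_err_ equals ||X - W H||_F.
residual_norm = np.linalg.norm(X - W @ H)
print('recomputed residual norm', residual_norm)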
        # (continuation of the autoencoder training loop; this snippet starts mid-statement,
        #  the slice below is reconstructed from the identical statement in the loop further down)
        batch_xs = x_train[i * batch_size:(i + 1) * batch_size] if i < total_batch else x_train[
            i * batch_size:len(x_train)]
        # Run
        _, c = sess.run([optimizer, loss], feed_dict={X: batch_xs})
    if epoch % display_step == 0:
        print("Iteration: %04d " % (epoch), "loss=", "{:.9f}".format(c))

print('Triang CLF')
for epoch in range(training_epoch):
    # Loop over all batches
    for i in range(total_batch):
        batch_xs = x_train[i * batch_size:(i + 1) * batch_size] if i < total_batch else x_train[
            i * batch_size:len(x_train)]
        batch_ys = y_train[i * batch_size:(i + 1) * batch_size] if i < total_batch else y_train[
            i * batch_size:len(y_train)]
        # Run
        _, c = sess.run([optimizer_clf, loss_clf], feed_dict={X: batch_xs, Y: batch_ys})
    if epoch % display_step == 0:
        a = sess.run(acc, feed_dict={X: x_test, Y: y_test})
        mse = sess.run(loss, feed_dict={X: x_test})
        print("Iteration: %04d " % (epoch),
              "loss=", "{:.9f} acc {:.9f} decode loss {:.9f}".format(c, a, mse))

code = sess.run(encoder_op, feed_dict={X: F})
saveData(code, filename='code.npy')
def main():
    path = "../Data/google_trace_timeseries/data_resource_usage_5Minutes_6176858948.csv"
    aspects = ["meanCPUUsage", "canonical memory usage"]
    predicted_aspect = "meanCPUUsage"
    num_epochs = 1000
    learning_rate = 0.005
    n_slidings_encoder = [26, 28]
    n_slidings_decoder = [2, 4, 6]
    batch_sizes = [16, 32]
    size_models = [[16], [32], [8, 4], [16, 8]]
    activations = ["tanh", "sigmoid"]
    input_keep_probs = [0.95, 0.9]
    output_keep_probs = [0.9]
    state_keep_probs = [0.95, 0.9]
    # n_slidings_encoder = [16]
    # n_slidings_decoder = [2]
    # batch_sizes = [16]
    # size_models = [[4, 2]]
    # activations = ["tanh"]
    # input_keep_probs = [0.95]
    # output_keep_probs = [0.9]
    # state_keep_probs = [0.95]
    rate = 5
    result_file_path = 'result_encoder_decoder.csv'
    loss_file_path = 'loss_encoder_decoder.csv'

    ## GET COMBINATIONS ##
    combinations = []
    for n_sliding_encoder in n_slidings_encoder:
        for n_sliding_decoder in n_slidings_decoder:
            for batch_size in batch_sizes:
                for size_model in size_models:
                    for activation in activations:
                        for input_keep_prob in input_keep_probs:
                            for output_keep_prob in output_keep_probs:
                                for state_keep_prob in state_keep_probs:
                                    combination_i = [n_sliding_encoder, n_sliding_decoder, batch_size,
                                                     size_model, activation, input_keep_prob,
                                                     output_keep_prob, state_keep_prob]
                                    combinations.append(combination_i)

    for combination in combinations:
        tf.reset_default_graph()
        n_sliding_encoder = combination[0]
        n_sliding_decoder = combination[1]
        batch_size = combination[2]
        size_model = combination[3]
        activation = combination[4]
        input_keep_prob = combination[5]
        output_keep_prob = combination[6]
        state_keep_prob = combination[7]

        ### GET DATA : TRAINING SET, TEST SET, VALIDATION SET ###
        nor_data, amax, amin = data.get_goodletrace_data(path, aspects)
        x_train_encoder, y_train, x_test_encoder, y_test = data.get_data_samples(nor_data, n_sliding_encoder,
                                                                                 predicted_aspect, rate)
        x_train_decoder, x_test_decoder = data.get_data_decoder(x_train_encoder, x_test_encoder, n_sliding_decoder)
        x_train_encoder, x_train_decoder, y_train, x_val_encoder, x_val_decoder, y_val = \
            data.getValidationSet(x_train_encoder, x_train_decoder, y_train, 5)
        # print(x_train_encoder.shape, x_train_decoder.shape, y_train.shape, x_val_encoder.shape,
        #       x_val_decoder.shape, y_val.shape)
        # return 0
        loss_train_value = []
        loss_valid_value = []
        n_train = y_train.shape[0]
        num_batches = int(x_train_encoder.shape[0] / batch_size)
        timestep_encoder = n_sliding_encoder
        timestep_decoder = n_sliding_decoder
        input_dim = len(aspects)

        X_encoder = tf.placeholder(tf.float32, [None, timestep_encoder, input_dim], name='X_encoder')
        X_decoder = tf.placeholder(tf.float32, [None, timestep_decoder, input_dim], name='X_decoder')
        y = tf.placeholder(tf.float32, [None, 1], name='output')

        output, outputs_encoder, outputs_decoder = encoder_decoder(X_encoder, X_decoder, size_model, activation,
                                                                   input_keep_prob, output_keep_prob,
                                                                   state_keep_prob)
        outputs_encoder = tf.identity(outputs_encoder, name='outputs_encoder')
        loss = tf.reduce_mean(tf.squared_difference(output, y))
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        init_op = tf.global_variables_initializer()

        with tf.Session() as sess:
            t = datetime.now().time()
            start_time = (t.hour * 60 + t.minute) * 60 + t.second
            sess.run(init_op)

            ## EARLY STOPPING ##
            pre_loss_valid = 100
            x = 0
            early_stopping_val = 5

            ### START TO TRAIN ###
            for i in range(num_epochs):
                num_epochs_i = i + 1
                for j in range(num_batches + 1):
                    a = batch_size * j
                    b = a + batch_size
                    if b > n_train:
                        b = n_train
                    x_batch_encoder = x_train_encoder[a:b, :, :]
                    x_batch_decoder = x_train_decoder[a:b, :, :]
                    y_batch = y_train[a:b, :]
                    # print(x_batch.shape, y_batch.shape)
                    loss_j, _ = sess.run([loss, optimizer],
                                         feed_dict={X_encoder: x_batch_encoder,
                                                    X_decoder: x_batch_decoder,
                                                    y: y_batch})
                loss_train_i = sess.run(loss, feed_dict={X_encoder: x_train_encoder,
                                                         X_decoder: x_train_decoder,
                                                         y: y_train})
                loss_valid_i = sess.run(loss, feed_dict={X_encoder: x_val_encoder,
                                                         X_decoder: x_val_decoder,
                                                         y: y_val})
                # print(num_epochs_i, loss_train_i, loss_valid_i)
                loss_train_value.append(loss_train_i)
                loss_valid_value.append(loss_valid_i)
                if loss_valid_i > pre_loss_valid:
                    x = x + 1
                    if x == early_stopping_val:
                        break
                else:
                    x = 0
                pre_loss_valid = loss_valid_i

            ### OUTPUT ###
            output_test = sess.run(output, feed_dict={X_encoder: x_test_encoder,
                                                      X_decoder: x_test_decoder,
                                                      y: y_test})
            output_test = output_test * (amax[0] - amin[0]) + amin[0]
            y_test_act = y_test * (amax[0] - amin[0]) + amin[0]
            loss_test_act = np.mean(np.abs(output_test - y_test_act))
            # print(loss_test_act)
            t = datetime.now().time()
            end_time = (t.hour * 60 + t.minute) * 60 + t.second
            training_encoder_time = (end_time - start_time)

            ### SAVE DATA ###
            name = data.saveData(combination, loss_test_act, num_epochs_i, result_file_path,
                                 training_encoder_time)
            # print(name)
            # outputs_encoder = sess.run(outputs_encoder, feed_dict={X_encoder: x_train_encoder,
            #                                                        X_decoder: x_train_decoder,
            #                                                        y: y_train})
            # print(outputs_encoder[:, -1, :].shape)
            # print(time)

            ### SAVE MODEL ###
            # print('\nSaving...')
            cwd = os.getcwd()
            saved_path = 'model/model'
            saved_path += str(combination)
            saved_path += '.ckpt'
            saved_path = os.path.join(cwd, saved_path)
            # print(saved_path)
            shutil.rmtree(saved_path, ignore_errors=True)
            saver = tf.train.Saver()
            saver.save(sess=sess, save_path=saved_path)
            # print("ok")
            sess.close()
def main():
    path = "../Data/google_trace_timeseries/data_resource_usage_5Minutes_6176858948.csv"
    aspects = ["meanCPUUsage", "canonical memory usage"]
    predicted_aspect = "meanCPUUsage"
    n_slidings = [3, 4, 5, 6]
    batch_sizes = [16, 32]
    learning_rate = 0.005
    num_epochs = 2
    rnn_cellsizes = [[4], [8], [16], [32], [4, 2], [8, 4], [16, 4], [16, 8], [32, 4], [32, 8], [32, 16]]
    activations = ["tanh", "sigmoid"]
    rate = 5
    result_file_path = 'result_multi.csv'

    combination = []
    for n_sliding in n_slidings:
        for batch_size in batch_sizes:
            for rnn_cellsize in rnn_cellsizes:
                for activation in activations:
                    combination_i = [n_sliding, batch_size, rnn_cellsize, activation]
                    combination.append(combination_i)

    for combination_i in combination:
        tf.reset_default_graph()
        n_sliding = combination_i[0]
        batch_size = combination_i[1]
        rnn_unit = combination_i[2]
        activation = combination_i[3]

        nor_data, amax, amin = data.get_goodletrace_data(path, aspects)
        x_train, y_train, x_test, y_test = data.get_data_samples(nor_data, n_sliding, predicted_aspect, rate)
        # x_train, y_train, x_valid, y_valid = data.getValidationSet(x_train, y_train, 5)
        loss_train_value = []
        loss_valid_value = []
        n_train = x_train.shape[0]
        num_batches = int(x_train.shape[0] / batch_size)
        timestep = n_sliding
        input_dim = len(aspects)

        X = tf.placeholder(tf.float32, [None, timestep, input_dim])
        y = tf.placeholder(tf.float32, [None, 1])
        output = model_rnn(X, rnn_unit, activation)
        loss = tf.reduce_mean(tf.squared_difference(output, y))
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        init_op = tf.global_variables_initializer()

        with tf.Session() as sess:
            t = datetime.now().time()
            start_time = (t.hour * 60 + t.minute) * 60 + t.second
            sess.run(init_op)
            # pre_loss_valid = 100
            # x = 0
            # early_stopping_val = 5
            epoch_i = 0
            for i in range(num_epochs):
                for j in range(num_batches + 1):
                    a = batch_size * j
                    b = a + batch_size
                    if b > n_train:
                        b = n_train
                    x_batch = x_train[a:b, :, :]
                    y_batch = y_train[a:b, :]
                    # print(x_batch.shape, y_batch.shape)
                    loss_j, _ = sess.run([loss, optimizer], feed_dict={X: x_batch, y: y_batch})
                loss_train_i = sess.run(loss, feed_dict={X: x_train, y: y_train})
                # loss_valid_i = sess.run(loss, feed_dict={X: x_valid, y: y_valid})
                loss_train_value.append(loss_train_i)
                # loss_valid_value.append(loss_valid_i)
                # if loss_valid_i > pre_loss_valid:
                #     x = x + 1
                #     if x == early_stopping_val:
                #         break
                # else:
                #     x = 0
                # pre_loss_valid = loss_valid_i
                epoch_i += 1

            t = datetime.now().time()
            train_time = (t.hour * 60 + t.minute) * 60 + t.second
            training_time = train_time - start_time

            output_test = sess.run(output, feed_dict={X: x_test, y: y_test})
            output_test = output_test * (amax[0] - amin[0]) + amin[0]
            y_test_act = y_test * (amax[0] - amin[0]) + amin[0]

            t = datetime.now().time()
            test_time = (t.hour * 60 + t.minute) * 60 + t.second
            testing_time = test_time - train_time
            system_time = test_time - start_time

            loss_test_act = np.mean(np.abs(output_test - y_test_act))
            explained_variance_score = sk.explained_variance_score(y_test_act, output_test)
            mean_absolute_error = sk.mean_absolute_error(y_test_act, output_test)
            mean_squared_error = sk.mean_squared_error(y_test_act, output_test)
            median_absolute_error = sk.median_absolute_error(y_test_act, output_test)
            r2_score = sk.r2_score(y_test_act, output_test)
            # t = datetime.now().time()
            # end_time = (t.hour * 60 + t.minute) * 60 + t.second
            # training_time = (end_time - start_time)

            name = data.saveData(combination_i, loss_test_act, loss_valid_value, loss_train_value,
                                 epoch_i, result_file_path, output_test, y_test_act,
                                 explained_variance_score, mean_absolute_error, mean_squared_error,
                                 median_absolute_error, r2_score, training_time, testing_time,
                                 system_time)
            print(name)
opt = {
    'architecture': [784, 784, 784, 30, 10],
    'learningRate': 1.5,
    'error': 0.001,
    'epochs': 15,
    'batch': 100
}
nn = ann.ann(opt)

# # combine
# rbm = files.loadData('rbm-1000.db')
# nn.rbm(rbm)

# train
nn.train(train_data, train_result)
files.saveData(nn, 'nn.db')

_results = nn.sim(test_data)
_results = _results.transpose()

accuracy = 0
for i in range(len(test_result)):
    if i < 20:
        print(_results[i].argmax(), " : ", test_result[i].argmax())
    if _results[i].argmax() == test_result[i].argmax():
        accuracy += 1.00

print(accuracy, " / ", len(test_result))
accuracy = accuracy / len(test_result)
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from test import test
import data

train_x, train_y, valid_x, valid_y, test_x = data.loadData()
train_x, train_y = shuffle(train_x, train_y)

param_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

clf = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
          gamma=0.0, kernel='rbf', max_iter=-1, probability=True,
          random_state=None, shrinking=True, tol=0.001, verbose=True)
clf.fit(train_x, train_y)
# gs = GridSearchCV(svc, param_grid, n_jobs=8, verbose=2)
# gs.fit(train_x, train_y)

valid_predictions = clf.predict_proba(valid_x)
test_predictions = clf.predict_proba(test_x)
test(valid_y, valid_predictions)
data.saveData(valid_predictions, "../valid_results/valid_215.csv")
data.saveData(test_predictions, "../results/results_215.csv")
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    maximum_auc = 0.0
    random_state = random.randint(0, 1000000)
    for i in tqdm(range(1000)):
        config = {
            'base_estimator': 'gbtree',
            'objective': 'multi:softprob',
            'metric': 'mlogloss',
            'num_classes': 2,
            'learning_rate': random.uniform(0.01, 0.15),
            'max_depth': 20 + random.randint(0, 10),
            'max_samples': random.uniform(0.3, 1.0),
            'max_features': random.uniform(0.3, 1.0),
            'max_delta_step': random.randint(1, 10),
            'min_child_weight': random.randint(1, 8),
            'min_loss_reduction': 1,
            'l1_weight': random.uniform(0.0, 10.0),
            'l2_weight': random.uniform(0.0, 10.0),
            'l2_on_bias': False,
            'gamma': random.uniform(0.0, 0.1),
            'inital_bias': 0.5,
            'random_state': random_state,
        }
        clf = XGBoostClassifier(
            config['base_estimator'],
            config['objective'],
            config['metric'],
            config['num_classes'],
            config['learning_rate'],
            config['max_depth'],
            config['max_samples'],
            config['max_features'],
            config['max_delta_step'],
            config['min_child_weight'],
            config['min_loss_reduction'],
            config['l1_weight'],
            config['l2_weight'],
            config['l2_on_bias'],
            config['gamma'],
            config['inital_bias'],
            config['random_state'],
            watchlist=[[valid_x, valid_y]],
            n_jobs=8,
            n_iter=30000,
        )
        clf.fit(train_x, train_y)
        valid_predictions = clf.predict_proba(valid_x)[:, 1]
        test_predictions = clf.predict_proba(test_x)[:, 1]
        auc = roc_auc_score(valid_y, valid_predictions)
        if auc > maximum_auc:
            maximum_auc = auc
            best_config = config
            print('new auc:')
            print(auc)
            data.saveData(valid_id, valid_predictions, "./valid_results/valid_" + str(model_id) + ".csv")
            data.saveData(test_id, test_predictions, "./results/results_" + str(model_id) + ".csv")
    print('maximum_auc:')
    print(maximum_auc)
    print(best_config)  # report the configuration that achieved the best AUC
# plt.show()


def load_save_label(filename):
    with open(filename, 'rb') as binfile:
        buf = binfile.read()
    labelNum = len(buf)
    labelNum -= 8
    # print(labelNum)
    index = 0
    labels = struct.unpack_from('>' + str(labelNum) + 'B', buf, index)
    print('success: ' + filename)
    return labels


train_data = load_save_function('train-images-idx3-ubyte')
train_result = load_save_label('train-labels-idx1-ubyte')
test_data = load_save_function('t10k-images-idx3-ubyte')
test_result = load_save_label('t10k-labels-idx1-ubyte')

print(train_data.shape)
train_data.tofile("mnist_train_data.db")
data.saveData(train_result, "mnist_train_result.db")
test_data.tofile("mnist_test_data.db")
data.saveData(test_result, "mnist_test_result.db")
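The images are read with load_save_function, which is not shown in this snippet. Since its result exposes .shape and .tofile, it presumably returns a NumPy array parsed from the IDX image files, whose 16-byte header holds magic, count, rows and cols. The sketch below is one plausible implementation under that assumption; it is not the original code.

# Hypothetical sketch of load_save_function (not the original implementation):
# parse an IDX3 image file into a (count, rows*cols) uint8 NumPy array.
import struct
import numpy as np


def load_save_function(filename):
    with open(filename, 'rb') as binfile:
        buf = binfile.read()
    # IDX3 image files start with a 16-byte big-endian header: magic, count, rows, cols
    magic, num, rows, cols = struct.unpack_from('>IIII', buf, 0)
    images = np.frombuffer(buf, dtype=np.uint8, offset=16)
    print('success: ' + filename)
    return images.reshape(num, rows * cols)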