def run():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./data/', './data/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)
    ok, ub_basket = train_test_split(ub_basket, test_size=0.20, random_state=0)
    #embedding_wrapper = EmbeddingWrapper('tafeng_products')
    print(ub_basket)
    all_baskets = ub_basket.basket.values
    print(all_baskets)

    # changes every item to string
    print("nested change")
    all_baskets = nested_change(list(all_baskets), str)
    print("embedding_wrapper.remove_products_wo_embeddings(all_baskets)")
    all_baskets = embedding_wrapper.remove_products_wo_embeddings(all_baskets)
    print("uncommon products")
    all_baskets = remove_products_which_are_uncommon(all_baskets)
    print("short baskets")
    medium_baskets, all_baskets = remove_short_baskets(all_baskets)
    print(medium_baskets, all_baskets)
    print("nested change")
    all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)
    print("split_data")
    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target = split_data(all_baskets)

    print('knndtw')
    knndtw = KnnDtw(n_neighbors=[5])
    preds_all, distances = knndtw.predict(train_ub, val_ub_input,
                                          embedding_wrapper.basket_dist_EMD,
                                          embedding_wrapper.basket_dist_REMD)
    print(preds_all)
    print(distances)
    #print("Wasserstein distance", sum(distances)/len(distances))
    return preds_all, distances
def fit_clean(data):
    '''
        Fit the model without any principal components
    '''
    # split the data into training and testing
    train_x, train_y, \
    test_x, test_y, \
    labels = hlp.split_data(
        data,
        y='credit_application'
    )

    # fit the model and print the summary
    class_fit_predict_print((train_x, train_y, test_x, test_y))
def __init__(self):
    with open("helper/dictionary.pickle", "rb") as f:
        dictionary = pickle.load(f)
    vocab_size = len(dictionary)

    # Load data -> change to MongoDB connector later
    data = pd.read_csv("data_csv/data")
    y = data.point

    with open("features/tf_idf_vectorizer.pickle", "rb") as f:
        vectorizer = pickle.load(f)

    corpus = data[['project', 'concat']]
    analyzer = vectorizer.build_analyzer()
    corpus['concat'] = corpus.apply(lambda x: analyzer(x[1]), axis=1)
    tmp = corpus.apply(lambda x: len(x[1]), axis=1)
    sentence_size = max(tmp)
    corpus['concat'] = corpus.apply(
        lambda x: [dictionary.get(i) for i in x[1]], axis=1)

    x_train, x_test, y_train, y_test = split_data(corpus, y, ratio=0.2)

    pad_id = 0
    x_train = sequence.pad_sequences(x_train['concat'].values,
                                     maxlen=sentence_size,
                                     truncating='post',
                                     padding='post',
                                     value=pad_id)
    x_test = sequence.pad_sequences(x_test['concat'].values,
                                    maxlen=sentence_size,
                                    truncating='post',
                                    padding='post',
                                    value=pad_id)

    self.vocab_size = vocab_size
    self.sentence_size = sentence_size
    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train
    self.y_test = y_test
def lightGBMmodel(size, x):
    data_csv = pd.read_csv("data_csv/data", low_memory=False)
    #x = sparse.load_npz("features/tf_idf_matrix.npz")
    # x = pd.read_csv("features/word2vec_ave_" + str(size) + ".csv", index_col=0, low_memory=False)
    #x = pd.read_csv("features/doc2vec.csv", index_col=0)
    y = data_csv.point

    x_train, x_test, y_train, y_test = split_data(x, y, ratio=0.2)

    lgbm = LGBMRegressor(random_state=99)
    lgbm.fit(x_train.iloc[:, :-1], y_train)
    y_pred = lgbm.predict(x_test.iloc[:, :-1])

    result_total = dict()
    result_total['MSE'] = mean_squared_error(y_pred, y_test)
    result_total['MAE'] = mean_absolute_error(y_pred, y_test)
    result_total['MdAE'] = median_absolute_error(y_pred, y_test)
    print("Mean Absolute Error: ", mean_absolute_error(y_pred, y_test))
    print("Median Absolute Error: ", median_absolute_error(y_pred, y_test))
    print("Mean Squared Error: ", mean_squared_error(y_pred, y_test))

    print("Evaluation on each project")
    tmp = pd.DataFrame()
    tmp['project'] = x_test['project']
    tmp['pred'] = y_pred
    tmp['truth'] = y_test
    result_each = tmp.groupby(by='project').apply(
        lambda col: mean_squared_error(col.pred, col.truth)).to_frame(
        name='MSE')
    result_each['MAE'] = tmp.groupby(by='project').apply(
        lambda col: mean_absolute_error(col.pred, col.truth))
    result_each['MdAE'] = tmp.groupby(by='project').apply(
        lambda col: median_absolute_error(col.pred, col.truth))

    return (result_each, result_total)
def run():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./data/', './data/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)
    all_baskets = ub_basket.basket.values
    all_baskets = nested_change(list(all_baskets), str)
    all_baskets = embedding_wrapper.remove_products_wo_embeddings(all_baskets)
    all_baskets = remove_products_which_are_uncommon(all_baskets)
    all_baskets = remove_short_baskets(all_baskets)
    all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)
    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target = split_data(
        all_baskets)
    knndtw = KnnDtw(n_neighbors=[5])
    preds_all, distances = knndtw.predict(train_ub, val_ub_input,
                                          embedding_wrapper.basket_dist_EMD,
                                          embedding_wrapper.basket_dist_REMD)
    return preds_all, distances
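# The pipeline above leans on a nested_change helper that is not shown
# here. A minimal sketch, assuming it simply applies a function to every
# leaf of the nested user -> baskets -> items lists (behaviour inferred
# from calls like nested_change(all_baskets, str); the real helper may
# differ):
def nested_change(data, fn):
    # recursively apply fn to every non-list leaf of a nested list
    if isinstance(data, list):
        return [nested_change(item, fn) for item in data]
    return fn(data)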
def prepare_data(data, principal_components, n_components):
    '''
        Prepare the data for the classification
    '''
    # prepare the column names
    cols = ['pc' + str(i) for i in range(0, n_components)]

    # concatenate the data
    data = pd.concat(
        [data, pd.DataFrame(principal_components, columns=cols)],
        axis=1, join_axes=[data.index])

    # split the data into training and testing
    train_x, train_y, \
    test_x, test_y, \
    labels = hlp.split_data(
        data,
        y='credit_application',
        x=cols
    )

    return (train_x, train_y, test_x, test_y)
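# Note: pd.concat's join_axes argument was deprecated in pandas 0.25 and
# removed in 1.0, so prepare_data() above only runs on older pandas.
# A hedged sketch of an equivalent concatenation for newer pandas,
# aligning the principal-component frame on the original index instead
# (the function name is hypothetical, not the original author's code):
import pandas as pd

def concat_principal_components(data, principal_components, n_components):
    # build the PC column names exactly as prepare_data() does
    cols = ['pc' + str(i) for i in range(0, n_components)]
    # give the PC frame the same index, then concatenate column-wise;
    # no join_axes needed on pandas >= 1.0
    pc_frame = pd.DataFrame(principal_components, columns=cols, index=data.index)
    return pd.concat([data, pc_frame], axis=1)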
import pandas as pd
import sklearn.svm as sv
import helper as hp


@hp.timeit
def fitsvm(data):
    svm = sv.SVC(kernel='rbf', C=20.0, gamma=0.1)
    return svm.fit(data[0], data[1])


csv_data = pd.read_csv(
    "C:/Users/DELL/Desktop/niit/3rd semester/bank_contacts.csv")
train_x, train_y, test_x, test_y, labels = hp.split_data(
    csv_data, y='credit_application')
classifier = fitsvm((train_x, train_y))
predicted = classifier.predict(test_x)
hp.printModelSummary(test_y, predicted)
print(classifier.support_vectors_)


import pandas as pd
import sklearn.svm as sv
import helper as hp

"""
@ is the decorator to call a function from the referred module
"""


@hp.timeit
def fitSVM(data):
    # creating the classifier object
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    for task_id in train_data.keys():
        print "task_id:", task_id

    # split data for training weak_learners and boosting
    (train_weak, train_boosting) = split_data(train_data, 4)

    # train on first part of dataset (evaluate on other)
    prepared_data_weak = PreparedMultitaskData(train_weak, shuffle=False)
    classifiers = self._inner_train(prepared_data_weak, param)

    # train on entire dataset
    prepared_data_final = PreparedMultitaskData(train_data, shuffle=False)
    final_classifiers = self._inner_train(prepared_data_final, param)

    print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    print "done training weak learners"

    #####################################################
    #    perform boosting and wrap things up
    #####################################################

    # wrap up predictors for later use
    predictors = {}

    for task_name in train_boosting.keys():

        instances = train_boosting[task_name]

        N = len(instances)
        F = len(classifiers)

        examples = [inst.example for inst in instances]
        labels = [inst.label for inst in instances]

        # dim = (F x N)
        out = cvxmod.zeros((N, F))

        for i in xrange(F):
            svm = classifiers[i]
            tmp_out = self._predict_weak(svm, examples, prepared_data_weak.name_to_id(task_name), param)

            if param.flags["signum"]:
                out[:, i] = numpy.sign(tmp_out)
            else:
                out[:, i] = tmp_out

        if param.flags["boosting"] == "ones":
            weights = numpy.ones(F) / float(F)
        if param.flags["boosting"] == "L1":
            weights = solve_boosting(out, labels, param.transform, solver="glpk")
        if param.flags["boosting"] == "L2":
            weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=False)
        if param.flags["boosting"] == "L2_reg":
            weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=True)

        predictors[task_name] = (final_classifiers, weights, prepared_data_final.name_to_id(task_name), param)

        assert prepared_data_final.name_to_id(task_name) == prepared_data_weak.name_to_id(task_name), "name mappings don't match"

    #####################################################
    #    Some sanity checks
    #####################################################

    # make sure we have the same keys (potentially in a different order)
    sym_diff_keys = set(train_weak.keys()).symmetric_difference(set(predictors.keys()))
    assert len(sym_diff_keys) == 0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)

    return predictors
                                  validationProportion=0.25)

    # and return the regressor
    return ann

# the file name of the dataset
r_filename = '../../Data/Chapter06/power_plant_dataset_pc.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x, test_y, \
labels = hlp.split_data(csv_read, y='net_generation_MWh',
                        x=['total_fuel_cons_mmbtu'])

# create the ANN training and testing datasets
training = hlp.prepareANNDataset((train_x, train_y), prob='regression')
testing = hlp.prepareANNDataset((test_x, test_y), prob='regression')

# train the model
regressor = fitANN(training)

# predict the output from the unseen data
predicted = regressor.activateOnDataset(testing)

# and calculate the R^2
score = hlp.get_score(test_y, predicted[:, 0])
print('R2: ', score)
    # fit the data
    return logistic_classifier.fit()

# the file name of the dataset
r_filename = '../../Data/Chapter03/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x, test_y, \
labels = hlp.split_data(
    csv_read,
    y='credit_application'
)

# train the model
classifier = fitLogisticRegression((train_x, train_y))

# classify the unseen data
predicted = classifier.predict(test_x)

# assign the class
predicted = [1 if elem > 0.5 else 0 for elem in predicted]

# print out the results
hlp.printModelSummary(test_y, predicted)

# print out the parameters
@hlp.timeit
def fitNaiveBayes(data):
    '''
        Build the Naive Bayes classifier
    '''
    # create the classifier object
    naiveBayes_classifier = nb.GaussianNB()

    # fit the model
    return naiveBayes_classifier.fit(data[0], data[1])

# the file name of the dataset
r_filename = '../../Data/Chapter03/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x, test_y, \
labels = hlp.split_data(
    csv_read, y='credit_application')

# train the model
classifier = fitNaiveBayes((train_x, train_y))

# classify the unseen data
predicted = classifier.predict(test_x)

# print out the results
hlp.printModelSummary(test_y, predicted)
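# Several snippets here rely on a split_data helper from the book's
# helper module that is not reproduced. A minimal sketch of what such a
# helper might look like, assuming it draws a reproducible random
# train/test split and returns the five values unpacked above (the
# signature is inferred from usage; the real helper may differ):
import numpy as np

def split_data(data, y, x='all', test_size=0.33, seed=42):
    """Hypothetical stand-in for helper.split_data()."""
    # choose the feature columns (everything but the target by default)
    if x == 'all':
        x = [col for col in data.columns if col != y]
    labels = x

    # draw a reproducible random train/test mask
    rng = np.random.RandomState(seed)
    mask = rng.rand(len(data)) >= test_size

    train, test = data[mask], data[~mask]
    return train[x].values, train[y].values, test[x].values, test[y].values, labels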
# the file name of the dataset
r_filename = '../../Data/Chapter05/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split into independent and dependent features
x = csv_read[csv_read.columns[:-1]]
y = csv_read[csv_read.columns[-1]]

# split the original data into training and testing
train_x_orig, train_y_orig, \
test_x_orig, test_y_orig, \
labels_orig = hlp.split_data(
    csv_read,
    y='credit_application'
)

# reduce the dimensionality
csv_read['reduced'] = reduce_LDA(x, y).transform(x)

# split the reduced data into training and testing
train_x_r, train_y_r, \
test_x_r, test_y_r, \
labels_r = hlp.split_data(
    csv_read,
    y='credit_application',
    x=['reduced']
)

# train the models
    # and return the classifier
    return ann

# the file name of the dataset
r_filename = '../../Data/Chapter03/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x, test_y, \
labels = hlp.split_data(
    csv_read,
    y='credit_application',
    x=['n_duration', 'n_euribor3m', 'n_age', 'n_emp_var_rate', 'n_pdays',
       'month_mar', 'prev_ctc_outcome_success', 'n_cons_price_idx',
       'month_apr', 'n_cons_conf_idx']
)

# create the ANN training and testing datasets
training = hlp.prepareANNDataset((train_x, train_y))
testing = hlp.prepareANNDataset((test_x, test_y))

# train the model
classifier = fitANN(training)

# classify the unseen data
predicted = classifier.activateOnDataset(testing)

# the lowest output activation gives the class
predicted = predicted.argmin(axis=1)
# the file name of the dataset
r_filename = '../../Data/Chapter03/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x, test_y, \
labels = hlp.split_data(
    csv_read,
    y='credit_application',
    x=['n_duration', 'n_nr_employed',
       'prev_ctc_outcome_success', 'n_euribor3m',
       'n_cons_conf_idx', 'n_age', 'month_oct',
       'n_cons_price_idx', 'edu_university_degree', 'n_pdays',
       'dow_mon', 'job_student', 'job_technician',
       'job_housemaid', 'edu_basic_6y']
)

# train the model
classifier = fitGradientBoosting((train_x, train_y))

# classify the unseen data
predicted = classifier.predict(test_x)

# print out the results
hlp.printModelSummary(test_y, predicted)

# print out the importance of features
# In[3]:

# Loading dataset
filename = "Clean_Akosombo_data.csv"
akosombo = helper.load_csv_data(filename)


# ### Splitting Data

# In[4]:

# Splitting dataset
target_variable = "generation"
X, y, X_train, X_test, X_val, y_train, y_test, y_val = helper.split_data(
    akosombo, target_variable, validation_data=True)


# ### Scaling Data

# In[5]:

# Data Scaling
X_train, X_test, X_val = helper.scale(X_train, X_test, X_val,
                                      scale_validation=True)


# ### Model Creation

# In[9]:
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # split data for training weak_learners and boosting
    (train_weak, train_boosting) = split_data(train_data, 4)

    for task_id in train_data.keys():
        print "task_id:", task_id

    root = param.taxonomy.data

    # train on first part of dataset (evaluate on other)
    (classifiers, classifier_at_node) = self._inner_train(train_weak, param)

    # train on entire dataset
    (final_classifiers, final_classifier_at_node) = self._inner_train(train_data, param)

    ###

    print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    print "done training weak learners"

    #####################################################
    #    perform boosting and wrap things up
    #####################################################

    # wrap up predictors for later use
    predictors = {}

    for task_name in train_boosting.keys():

        instances = train_boosting[task_name]

        # get ids of predecessor nodes
        node_names = [node.name for node in root.get_node(task_name).get_path_root()]
        node_names.append(task_name)

        print "node: %s --> %s" % (task_name, str(node_names))

        N = len(instances)

        if param.flags["use_all_nodes"]:
            # use classifiers only from parent nodes
            F = len(classifiers)
            tmp_classifiers = classifiers
            tmp_final_classifiers = final_classifiers
        else:
            # use classifiers from all leaves
            F = len(node_names)
            tmp_classifiers = []
            tmp_final_classifiers = []

        examples = [inst.example for inst in instances]
        labels = [inst.label for inst in instances]

        # dim = (F x N)
        out = cvxmod.zeros((N, F))

        for i in xrange(F):

            if param.flags["use_all_nodes"]:
                svm = classifiers[i]
            else:
                svm = classifier_at_node[node_names[i]]
                tmp_classifiers.append(svm)

                final_svm = final_classifier_at_node[node_names[i]]
                tmp_final_classifiers.append(final_svm)

            tmp_out = self._predict_weak(svm, examples, task_name)

            if param.flags["signum"]:
                out[:, i] = numpy.sign(tmp_out)
            else:
                out[:, i] = tmp_out

        if param.flags["boosting"] == "ones":
            weights = numpy.ones(F) / float(F)
        if param.flags["boosting"] == "L1":
            weights = solve_boosting(out, labels, param.transform, solver="glpk")
        if param.flags["boosting"] == "L2":
            weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=False)
        if param.flags["boosting"] == "L2_reg":
            weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=True)

        predictors[task_name] = (tmp_final_classifiers, weights)

    #####################################################
    #    Some sanity checks
    #####################################################

    # make sure we have the same keys (potentially in a different order)
    sym_diff_keys = set(train_weak.keys()).symmetric_difference(set(predictors.keys()))
    assert len(sym_diff_keys) == 0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)

    # save graph plot
    mypath = "/fml/ag-raetsch/share/projects/multitask/graphs/"
    filename = mypath + "graph_" + str(param.id)
    root.plot(filename)  # , plot_cost=True, plot_B=True)

    return predictors
def runtopncustomers():
    embedding_wrapper = EmbeddingWrapper('product')
    bc = BasketConstructor('./data/', './data/')
    ub_basket = bc.get_baskets('prior', reconstruct=False)
    ok, ub_basket = train_test_split(ub_basket, test_size=0.20, random_state=0)
    #embedding_wrapper = EmbeddingWrapper('tafeng_products')
    #print(ub_basket)
    all_baskets = ub_basket.basket.values
    #print(all_baskets)

    # changes every item to string
    print("nested change")
    all_baskets = nested_change(list(all_baskets), str)
    print("embedding_wrapper.remove_products_wo_embeddings(all_baskets)")
    all_baskets = embedding_wrapper.remove_products_wo_embeddings(all_baskets)
    #print('test', all_baskets)

    # every customer sequence
    for s in range(2):
        print(all_baskets[s])
        #itemperklant = np.array([])
        itemperklant = []
        sizes = []
        top_nc = get_top_nc(all_baskets, 2)
        for i in range(len(all_baskets[s])):
            # every basket in all baskets
            for j in range(len(all_baskets[s][i])):
                # every item in every basket
                #print('basket', all_baskets[s][i][j])
                itemperklant.append(all_baskets[s][i][j])
        print(itemperklant)

        unique_items = np.unique(itemperklant)
        print(unique_items)
        arrayklant = np.zeros((int(len(unique_items)), 2))
        arrayklant[:, 0] = unique_items
        for ding in range(len(unique_items)):
            countproduct = itemperklant.count(unique_items[ding])
            # itemperklant.append(countproduct)
            arrayklant[ding, 1] = countproduct
        print(arrayklant)

        sorted = arrayklant[np.argsort(arrayklant[:, 1])]
        print('sorted', sorted)
        product = np.array([])
        print('average length', top_nc[s])
        for reverse in range(int(top_nc[s])):
            print('test', sorted[-reverse - 1, :])
            # take the (reverse+1)-th most frequent product, counting from the end
            product = np.append(product, sorted[-reverse - 1, :])

    #print("uncommon products")
    #all_baskets = remove_products_which_are_uncommon(all_baskets)
    #print("short baskets")
    #medium_baskets, all_baskets = remove_short_baskets(all_baskets)
    #print(medium_baskets, all_baskets)
    #print("nested change")
    #all_baskets = nested_change(all_baskets, embedding_wrapper.lookup_ind_f)
    #print("split_data")
    train_ub, val_ub_input, val_ub_target, test_ub_input, test_ub_target = split_data(all_baskets)
    #print('knndtw')
    #knndtw = KnnDtw(n_neighbors=[5])
    #preds_all, distances = knndtw.predict(train_ub, val_ub_input, embedding_wrapper.basket_dist_EMD, embedding_wrapper.basket_dist_REMD)
    #print(preds_all)
    #print(distances)
    #print("Wasserstein distance", sum(distances)/len(distances))
    #return preds_all, distances

    write_path = 'data/testprint'
    with open(write_path + '.txt', 'w') as results:
        results.write('All baskets test ' + str(all_baskets) + '\n')
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    for task_id in train_data.keys():
        print "task_id:", task_id

    # split data for training weak_learners and boosting
    (train_weak, train_boosting) = split_data(train_data, 4)

    # train on first part of dataset (evaluate on other)
    prepared_data_weak = PreparedMultitaskData(train_weak, shuffle=False)
    classifiers = self._inner_train(prepared_data_weak, param)

    # train on entire dataset
    prepared_data_final = PreparedMultitaskData(train_data, shuffle=False)
    final_classifiers = self._inner_train(prepared_data_final, param)

    print "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
    print "done training weak learners"

    #####################################################
    #    perform boosting and wrap things up
    #####################################################

    # wrap up predictors for later use
    predictors = {}

    for task_name in train_boosting.keys():

        instances = train_boosting[task_name]

        N = len(instances)
        F = len(classifiers)

        examples = [inst.example for inst in instances]
        labels = [inst.label for inst in instances]

        # dim = (F x N)
        out = cvxmod.zeros((N, F))

        for i in xrange(F):
            svm = classifiers[i]
            tmp_out = self._predict_weak(svm, examples, prepared_data_weak.name_to_id(task_name))

            if param.flags["signum"]:
                out[:, i] = numpy.sign(tmp_out)
            else:
                out[:, i] = tmp_out

        if param.flags["boosting"] == "ones":
            weights = numpy.ones(F) / float(F)
        if param.flags["boosting"] == "L1":
            weights = solve_boosting(out, labels, param.transform, solver="glpk")
        if param.flags["boosting"] == "L2":
            weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=False)
        if param.flags["boosting"] == "L2_reg":
            weights = solve_nu_svm(out, labels, param.transform, solver="glpk", reg=True)

        predictors[task_name] = (final_classifiers, weights, prepared_data_final.name_to_id(task_name))

        assert prepared_data_final.name_to_id(task_name) == prepared_data_weak.name_to_id(task_name), "name mappings don't match"

    #####################################################
    #    Some sanity checks
    #####################################################

    # make sure we have the same keys (potentially in a different order)
    sym_diff_keys = set(train_weak.keys()).symmetric_difference(set(predictors.keys()))
    assert len(sym_diff_keys) == 0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)

    return predictors
# In[3]:

# Loading dataset
filename = "Clean_Akosombo_data.csv"
akosombo = helper.load_csv_data(filename)


# ### Splitting the Dataset

# In[4]:

# Splitting dataset
target_variable = "generation"
X, y, X_train, X_test, y_train, y_test = helper.split_data(akosombo, target_variable)


# ### Scaling the Dataset

# In[5]:

# Data Scaling
X_train, X_test = helper.scale(X_train, X_test)


# ### Choosing Baseline Models and Training Models

# In[6]:
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # split for training weak_learners and boosting
    (train_weak, train_boosting) = split_data(train_data, 4)

    # merge data sets
    data = PreparedMultitaskData(train_weak, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ##################################################
    # define pockets
    ##################################################

    pockets = [0] * 9

    pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34]
    pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31]
    pockets[2] = [11, 20, 21, 22, 29, 31]
    pockets[3] = [8, 30, 31, 32]
    pockets[4] = [10, 11, 30]
    pockets[5] = [10, 11, 12, 13, 20, 29]
    pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29]
    pockets[7] = [12, 14, 15, 26]
    pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26]

    pockets = []
    for i in xrange(35):
        pockets.append([i])

    #new_pockets = []
    # merge neighboring pockets
    #for i in range(8):
    #    new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1]))))
    #pockets = new_pockets

    ########################################################
    print "creating a kernel:"
    ########################################################

    # init seq handler
    pseudoseqs = SequencesHandler()

    classifiers = []

    for pocket in pockets:

        print "creating normalizer"
        #import pdb
        #pdb.set_trace()
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        print "processing pocket", pocket

        # set similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                similarity = 0.0

                for pseudo_seq_pos in pocket:
                    similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos - 1))

                # normalize
                similarity = similarity / float(len(pocket))

                print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

        print "creating empty kernel"
        kernel = shogun_factory.create_kernel(data.examples, param)

        print "setting normalizer"
        kernel.set_normalizer(normalizer)

        print "training SVM for pocket", pocket
        svm = self._train_single_svm(param, kernel, lab)

        classifiers.append(svm)

    print "done obtaining weak learners"

    # save additional info
    #self.additional_information["svm_objective"] = svm.get_objective()
    #self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    #self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()
    #print self.additional_information

    ##################################################
    # combine weak learners for each task
    ##################################################

    # set constants
    some = 0.9
    import cvxmod

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_boosting.keys():

        instances = train_boosting[task_name]

        N = len(instances)
        F = len(pockets)

        examples = [inst.example for inst in instances]
        labels = [inst.label for inst in instances]

        # dim = (F x N)
        out = cvxmod.zeros((N, F))

        for i in xrange(F):
            svm = classifiers[i]
            tmp_out = self._predict_weak(svm, examples, data.name_to_id(task_name))
            out[:, i] = numpy.sign(tmp_out)
            #out[:, i] = tmp_out

        #TODO: fix
        helper.save("/tmp/out_sparse", (out, labels))
        pdb.set_trace()

        weights = solve_boosting(out, labels, some, solver="mosek")

        svms[task_name] = (data.name_to_id(task_name), svm)

    return svms
    trainer.trainUntilConvergence(maxEpochs=50, verbose=True,
                                  continueEpochs=2, validationProportion=0.25)

    # and return the regressor
    return ann

# the file name of the dataset
r_filename = '../../Data/Chapter06/power_plant_dataset_pc.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x, test_y, \
labels = hlp.split_data(csv_read, y='net_generation_MWh',
                        x=['total_fuel_cons_mmbtu'])

# create the ANN training and testing datasets
training = hlp.prepareANNDataset((train_x, train_y), prob='regression')
testing = hlp.prepareANNDataset((test_x, test_y), prob='regression')

# train the model
regressor = fitANN(training)

# predict the output from the unseen data
predicted = regressor.activateOnDataset(testing)

# and calculate the R^2
score = hlp.get_score(test_y, predicted[:, 0])
# Accuracy
correct_pred = tf.equal(tf.argmax(logits, axis=1), tf.argmax(y, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

############ DATA ############

# inputs: two integers
# targets: [0, 1] if the sum is higher than 10. [1, 0] if the sum is lower than 10.
inputs, targets = get_data(max_int=10, size=10000)

# preprocessing: normalize inputs to be between -1 and 1.
inputs = (inputs - 5) / 5
# TODO: make a preprocessing helper function, subtracting the mean and dividing by the max value

# split train and test data
train_inputs, test_inputs, train_targets, test_targets = split_data(
    inputs, targets)

############ SESSION ############
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # TRAINING
    for epoch in range(epochs):
        batch_x, batch_y = get_batches(train_inputs, train_targets, batch_size)

        for batch in range(train_inputs.shape[0] // batch_size):
            sess.run(optimizer, feed_dict={
                x: batch_x[batch],
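# The TODO above asks for a preprocessing helper that subtracts the mean
# and divides by the max value. A minimal sketch of such a helper (the
# name normalize_inputs is hypothetical, not from the original code):
import numpy as np

def normalize_inputs(inputs):
    # subtract the mean, then divide by the largest absolute deviation,
    # so the inputs end up roughly in the [-1, 1] range
    inputs = np.asarray(inputs, dtype=np.float32)
    centered = inputs - inputs.mean()
    max_abs = np.abs(centered).max()
    return centered / max_abs if max_abs > 0 else centered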
conditions = pd.read_csv(PATH_DATA + 'conditions_mergedGenes.tsv', sep='\t', index_col=None)
STRAIN_ORDER = pd.read_table(PATH_DATA + 'strain_order.tsv', header=None).values.ravel()

# *****************
# *** Expression averaged across replicates of strains by timepoint

# *** Average RPKUM data of strains across timepoints

# Combine expression and metadata
merged = merge_genes_conditions(
    genes=genes,
    conditions=conditions[['Time', 'Measurment', 'Strain']],
    matching='Measurment')
splitted = split_data(data=merged, split_by='Strain')

# Average by strain
by_strain = {}
for split, data in splitted.items():
    genes_avg_stage = data.groupby('Time').mean()
    by_strain[split] = genes_avg_stage.T

# Combine data of different strains and add metadata
strains_data = []
for strain, data in by_strain.items():
    strain_data = pd.DataFrame({
        'Strain': [strain] * data.shape[1]
    }, index=data.columns).T.append(data)
    strains_data.append(strain_data)
genes_avg = pd.concat(strains_data, axis=1).T
genes_avg['Time'] = genes_avg.index
    return forest.fit(data[0], data[1])

# the file name of the dataset
r_filename = '../../Data/Chapter03/bank_contacts.csv'

# read the data
csv_read = pd.read_csv(r_filename)

# split the data into training and testing
train_x, train_y, \
test_x, test_y, \
labels = hlp.split_data(
    csv_read,
    y='credit_application',
    x=['n_duration', 'n_nr_employed',
       'prev_ctc_outcome_success', 'n_euribor3m',
       'n_cons_conf_idx', 'n_age', 'month_oct',
       'n_cons_price_idx', 'edu_university_degree', 'n_pdays',
       'dow_mon', 'job_student', 'job_technician',
       'job_housemaid', 'edu_basic_6y']
)

# train the model
classifier = fitRandomForest((train_x, train_y))

# classify the unseen data
predicted = classifier.predict(test_x)

# print out the results
hlp.printModelSummary(test_y, predicted)

# print out the importance of features
VOCAB_SIZE_ENCODER = len(metadata['idx2w'])
VOCAB_SIZE_DECODER = VOCAB_SIZE_ENCODER  # since same language
EMBED_DIMS = 200
HIDDEN_UNITS = 200
NUMBER_OF_LAYERS = 3
LEARNING_RATE = 0.001
DROPOUT = 0.5
BATCH_SIZE = 32
TEST_EPOCHS = 1
SAVED_MODEL_DIR = 'saved_model_seq2seq'

# shuffling data
idxQ, idxA = helper.shuffle_data(idxQ, idxA)

# splitting data into train, test, validation
trainX, trainY, testX, testY, valX, valY = helper.split_data(
    idxQ, idxA, TRAIN_DATA_PERCENT, TEST_DATA_PERCENT, VAL_DATA_PERCENT)

# creating model class object
model = seq2seq_model(vocabSizeEncoder=VOCAB_SIZE_ENCODER,
                      vocabSizeDecoder=VOCAB_SIZE_DECODER,
                      maxLenX=MAX_LEN_X,
                      maxLenY=MAX_LEN_Y,
                      embedDims=EMBED_DIMS,
                      numLayers=NUMBER_OF_LAYERS,
                      hiddenUnits=HIDDEN_UNITS,
                      lr=LEARNING_RATE)

# re-building tensorflow graph
model.build_model_graph()
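# helper.split_data is used above with percentage arguments and six return
# values. A minimal sketch of what such a three-way split might look like,
# assuming the percentages sum to 1 and the data is already shuffled
# (inferred from usage; the real helper may differ):
def split_data(idxQ, idxA, train_pct, test_pct, val_pct):
    # slice the question/answer arrays into train, test and validation parts
    n = len(idxQ)
    n_train = int(n * train_pct)
    n_test = int(n * test_pct)
    trainX, trainY = idxQ[:n_train], idxA[:n_train]
    testX, testY = idxQ[n_train:n_train + n_test], idxA[n_train:n_train + n_test]
    valX, valY = idxQ[n_train + n_test:], idxA[n_train + n_test:]
    return trainX, trainY, testX, testY, valX, valY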
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from helper import get_data, split_data, visualize

name = 'Support vector'

if __name__ == '__main__':
    x, y = get_data()
    y = y.reshape(len(y), 1)
    x_train, x_test, y_train, y_test = split_data(x, y)

    x_scaler = StandardScaler()
    y_scaler = StandardScaler()
    x_train = x_scaler.fit_transform(x_train)
    x_test = x_scaler.transform(x_test)
    y_train = y_scaler.fit_transform(y_train)
    y_test = y_scaler.transform(y_test)

    regression = SVR(kernel='rbf')
    regression.fit(x_train, y_train)

    y_predicted = regression.predict(x_test)
    y_predicted = y_scaler.inverse_transform(y_predicted)
    y_test = y_scaler.inverse_transform(y_test)

    visualize(y_test, y_predicted, name)
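# One caveat for the snippet above: on recent scikit-learn versions,
# SVR.fit warns about a column-vector target and StandardScaler's
# inverse_transform expects a 2-D array, so the shapes need a little
# massaging. A hedged sketch of the same scaling round-trip with explicit
# reshapes (same names as above; an assumption-level tweak, not the
# original author's code):
regression.fit(x_train, y_train.ravel())

y_predicted = regression.predict(x_test).reshape(-1, 1)
y_predicted = y_scaler.inverse_transform(y_predicted).ravel()
y_test = y_scaler.inverse_transform(y_test).ravel()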
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # split for training weak_learners and boosting
    (train_weak, train_boosting) = split_data(train_data, 4)

    # merge data sets
    data = PreparedMultitaskData(train_weak, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ########################################################
    print "creating a kernel:"
    ########################################################

    # init seq handler
    pseudoseqs = SequencesHandler()

    classifiers = []

    for pocket in pockets:

        print "creating normalizer"
        #import pdb
        #pdb.set_trace()
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        print "processing pocket", pocket

        # set similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                similarity = 0.0

                for pseudo_seq_pos in pocket:
                    similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos - 1))

                # normalize
                similarity = similarity / float(len(pocket))

                print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

        print "creating empty kernel"
        kernel = shogun_factory.create_kernel(data.examples, param)

        print "setting normalizer"
        kernel.set_normalizer(normalizer)

        print "training SVM for pocket", pocket
        svm = self._train_single_svm(param, kernel, lab)

        classifiers.append(svm)

    print "done obtaining weak learners"

    # save additional info
    #self.additional_information["svm_objective"] = svm.get_objective()
    #self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    #self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()
    #print self.additional_information

    ##################################################
    # combine weak learners for each task
    ##################################################

    # set constants
    some = 0.9
    import cvxmod

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_boosting.keys():

        instances = train_boosting[task_name]

        N = len(instances)
        F = len(pockets)

        examples = [inst.example for inst in instances]
        labels = [inst.label for inst in instances]

        # dim = (F x N)
        out = cvxmod.zeros((N, F))

        for i in xrange(F):
            svm = classifiers[i]
            tmp_out = self._predict_weak(svm, examples, data.name_to_id(task_name))
            out[:, i] = numpy.sign(tmp_out)
            #out[:, i] = tmp_out

        #TODO: fix
        helper.save("/tmp/out_sparse", (out, labels))
        pdb.set_trace()

        weights = solve_boosting(out, labels, some, solver="mosek")

        svms[task_name] = (data.name_to_id(task_name), svm)

    return svms