def test_entropy(self):
    obj_tree = tree.DecisionTree()
    test_arr = []
    test_arr.append({
        'arr': np.array([1 for x in range(9)] + [0 for x in range(11)]),
        'answer': 0.993
    })
    test_arr.append({
        'arr': np.array([1 for x in range(8)] + [0 for x in range(5)]),
        'answer': 0.961
    })
    test_arr.append({
        'arr': np.array([1 for x in range(1)] + [0 for x in range(6)]),
        'answer': 0.592
    })
    test_arr.append({'arr': np.array([1, 2, 3, 4, 5, 6]), 'answer': 2.585})
    for x in test_arr:
        val = obj_tree._entropy(x['arr'])
        self.assertEqual(round(val, 3), x['answer'])
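# The DecisionTree._entropy implementation is not shown in these snippets. A
# minimal sketch that reproduces the expected values above is the base-2
# Shannon entropy of the empirical label distribution (an assumption, not the
# original code; numpy is assumed imported as np, as in the surrounding tests):
def _entropy_sketch(y):
    """Base-2 Shannon entropy of the label distribution in y."""
    _, counts = np.unique(y, return_counts=True)
    p = counts.astype(float) / counts.sum()
    return float(-np.sum(p * np.log2(p)))
# e.g. _entropy_sketch(np.array([1] * 9 + [0] * 11)) -> 0.9928..., rounding to 0.993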
def train(self, X, Y, vX=None, vY=None):
    '''
    Trains a RandomForest using the provided training set.

    Input:
    ---------
        X: an m x d matrix of training data
        Y: an m x 1 label matrix
        vX: an n x d matrix of validation data (used to stop growing the RF)
        vY: an n x 1 label matrix

    Returns:
    -----------
    '''
    nexamples, nfeatures = X.shape
    self.findScalingParameters(X)
    if self.scalefeat:
        X = self.applyScaling(X)

    self.trees = []
    # Grow ntrees decision trees on the (scaled) training set.
    while len(self.trees) < self.ntrees:
        dtnew = tree.DecisionTree(weaklearner=self.weaklearner,
                                  nsplits=self.nsplits,
                                  nfeattest=self.nfeattest)
        print('training new tree')
        dtnew.train(X, Y)
        self.trees.append(dtnew)
def train(self, X, Y, vX=None, vY=None):
    '''
    Trains a RandomForest using the provided training set.

    Input:
    ---------
        X: an m x d matrix of training data
        Y: an m x 1 label matrix
        vX: an n x d matrix of validation data (used to stop growing the RF)
        vY: an n x 1 label matrix

    Returns:
    -----------
    '''
    nexamples, nfeatures = X.shape
    self.findScalingParameters(X)
    if self.scalefeat:
        X = self.applyScaling(X)

    self.trees = []
    self.dT = tree.DecisionTree(weaklearner=self.weaklearner)
    # Build ntrees trees and append them; appending avoids indexing into an
    # empty list, and the loop needs "in" plus self-qualified attributes.
    for i in range(self.ntrees):
        self.trees.append(self.dT.build_tree(X, Y))
def test_gini(self):
    obj_tree = tree.DecisionTree()
    test_arr = []
    test_arr.append({
        'arr': np.array([1 for x in range(9)] + [0 for x in range(11)]),
        'answer': 0.495
    })
    test_arr.append({
        'arr': np.array([1 for x in range(8)] + [0 for x in range(5)]),
        'answer': 0.473
    })
    test_arr.append({
        'arr': np.array([1 for x in range(1)] + [0 for x in range(6)]),
        'answer': 0.245
    })
    test_arr.append({'arr': np.array([1, 2, 3, 4, 5, 6]), 'answer': 0.833})
    for x in test_arr:
        val = obj_tree._gini(x['arr'])
        self.assertEqual(round(val, 3), x['answer'])
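# As with _entropy, the _gini implementation itself is not shown. A minimal
# sketch consistent with the expected values above (assumed, not the original
# code) is the standard Gini impurity:
def _gini_sketch(y):
    """Gini impurity of the label distribution in y."""
    _, counts = np.unique(y, return_counts=True)
    p = counts.astype(float) / counts.sum()
    return float(1.0 - np.sum(p ** 2))
# e.g. _gini_sketch(np.array([1, 2, 3, 4, 5, 6])) -> 0.8333..., rounding to 0.833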
def test_fit_and_predict(self):
    obj = tree.DecisionTree(min_samples_split=2, max_depth=1)
    X = np.array([[1, 2, 3], [0, 5, 6], [7, 8, 9], [2, 2, 4], [1, 1, 3]])
    y = np.array([1, 1, 3, 4, 5])
    obj.fit(X, y)
    X = np.array([[7, 8, 9]])
    result_y = obj.predict(X)
    self.assertEqual(result_y[0], 1)
def test_chek_y(self):
    obj = tree.DecisionTree()
    X = np.array([[1, 2, 3], [0, 5, 6], [7, 8, 9], [2, 2, 4], [1, 1, 3]])
    result = obj._chek_y(X)
    self.assertEqual(result, True)
    X = np.array([[1, 2, 1], [0, 5, 1], [7, 8, 1], [2, 2, 1], [1, 1, 1]])
    result = obj._chek_y(X)
    self.assertEqual(result, False)
def fit(self, X, y):
    self.trees = []
    for _ in range(self.n_trees):
        classifier = tree.DecisionTree(self.leaf_size, self.n_trials)
        self.trees.append(classifier)
    for i, classifier in enumerate(self.trees):
        classifier = classifier.fit(X, y)
        self.trees[i] = classifier
    return self
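# The matching predict() is not part of this snippet. A hedged sketch of how a
# random forest typically aggregates its trees, by majority vote over the
# per-tree predictions (assumes integer class labels and trees exposing
# predict(); _forest_predict_sketch is a hypothetical helper, not the original API):
def _forest_predict_sketch(trees, X):
    """Predict each sample's class by majority vote across the trees."""
    per_tree = np.array([t.predict(X) for t in trees])  # shape (n_trees, n_samples)
    vote = lambda col: np.bincount(col.astype(int)).argmax()
    return np.apply_along_axis(vote, 0, per_tree)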
def test_mad_median(self):
    obj_tree = tree.DecisionTree()
    test_arr = []
    test_arr.append({
        'arr': np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
        'answer': 2.222
    })
    test_arr.append({
        'arr': np.array([1, 2, 7, 4, 5, 6, 7, 8, 9]),
        'answer': 2.111
    })
    for x in test_arr:
        val = obj_tree._mad_median(x['arr'])
        self.assertEqual(round(val, 3), x['answer'])
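# _mad_median is not shown either; a minimal sketch that matches the expected
# values above (assumed, not the original code) is the mean absolute deviation
# around the median, a common regression splitting criterion:
def _mad_median_sketch(y):
    """Mean absolute deviation of y around its median."""
    return float(np.mean(np.abs(y - np.median(y))))
# e.g. _mad_median_sketch(np.arange(1, 10)) -> 2.2222..., rounding to 2.222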
def test_variance(self):
    obj_tree = tree.DecisionTree()
    test_arr = []
    test_arr.append({
        'arr': np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
        'answer': 6.667
    })
    test_arr.append({
        'arr': np.array([1, 2, 7, 4, 5, 6, 7, 8, 9]),
        'answer': 6.469
    })
    for x in test_arr:
        val = obj_tree._variance(x['arr'])
        self.assertEqual(round(val, 3), x['answer'])
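# A matching sketch for _variance (assumed, not the original code): the
# population variance, i.e. plain np.var with ddof=0:
def _variance_sketch(y):
    """Population variance of y (ddof=0)."""
    return float(np.var(y))
# e.g. _variance_sketch(np.arange(1, 10)) -> 6.6666..., rounding to 6.667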
def train(self, X, Y, vX=None, vY=None):
    '''
    Trains a RandomForest using the provided training set.

    Input:
    ---------
        X: an m x d matrix of training data
        Y: an m x 1 label matrix
        vX: an n x d matrix of validation data (used to stop growing the RF)
        vY: an n x 1 label matrix

    Returns:
    -----------
    '''
    self.classes = np.unique(Y)
    nexamples, nfeatures = X.shape
    self.findScalingParameters(X)
    if self.scalefeat:
        X = self.applyScaling(X)

    self.trees = []
    for i in range(self.ntrees):
        # np.random.shuffle shuffles in place and returns None, so shuffle the
        # index array and use it to reorder X and Y.
        indices = np.arange(nexamples)
        np.random.shuffle(indices)
        shuffledX = np.squeeze(X[indices])
        shuffledY = np.squeeze(Y[indices])

        print('creating a tree')
        my_tree = tree.DecisionTree(maxdepth=self.treedepth,
                                    weaklearner=self.weaklearner,
                                    nsplits=self.nsplits,
                                    nfeattest=nfeatures)
        print('training a tree')
        my_tree.train(shuffledX, shuffledY)
        print('appending the created tree to the list')
        self.trees.append(my_tree)
def train(self, X, Y, vX=None, vY=None):
    nexamples, nfeatures = X.shape
    self.findScalingParameters(X)
    if self.scalefeat:
        X = self.applyScaling(X)

    self.trees = []
    for i in range(self.ntrees):
        # Shuffle the examples and train each tree on a random 60-100% subsample.
        ShufflingIndexes = list(range(len(X)))
        rd.shuffle(ShufflingIndexes)
        x, y = X[ShufflingIndexes], Y[ShufflingIndexes]
        Indices = int(len(X) * (1 - rd.uniform(0, 0.4)))
        Sample_X, Sample_Y = x[:Indices, :], y[:Indices]
        Tree = tree.DecisionTree(0.95, 5, self.treedepth, self.weaklearner)
        Tree.train(Sample_X, Sample_Y)
        self.trees.append(Tree)
def train_tree(self, X, Y, verbose=True):
    '''
    Trains a tree based on the given arguments.

    Returns: the DecisionTree object
    '''
    dt = tree.DecisionTree(exthreshold=10,
                           maxdepth=self.treedepth,
                           weaklearner=self.weaklearner,
                           nsplits=self.nsplits)
    dt.verbose = verbose
    if self.usebagging:
        # Train on a bagged subset of the data.
        X_train, _, Y_train, _ = train_test_split(
            X, Y, train_size=self.baggingfraction)
        dt.train(X_train, Y_train)
        return dt
    dt.train(X, Y)
    return dt
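# A hedged usage sketch (not from the original source): the forest's train()
# would presumably call train_tree once per tree, e.g.
#
#     self.trees = [self.train_tree(X, Y, verbose=False) for _ in range(self.ntrees)]
#
# where self.ntrees and self.trees mirror the constructor shown further below.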
def train(self, X, Y, vX=None, vY=None):
    '''
    Trains a RandomForest using the provided training set.

    Input:
    ---------
        X: an m x d matrix of training data
        Y: an m x 1 label matrix
        vX: an n x d matrix of validation data (used to stop growing the RF)
        vY: an n x 1 label matrix

    Returns:
    -----------
    '''
    nexamples, nfeatures = X.shape
    self.findScalingParameters(X)
    if self.scalefeat:
        X = self.applyScaling(X)

    self.trees = []
    # Use "is None" rather than "== None": comparing a numpy array with ==
    # broadcasts element-wise instead of producing a single boolean.
    if vX is None:
        for t in range(self.ntrees):
            myTree = tree.DecisionTree(purity=0.95,
                                       maxdepth=self.treedepth,
                                       weaklearner=self.weaklearner,
                                       nsplits=self.nsplits)
            myTree.train(X, Y)
            self.trees.append(myTree)
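# Several of these train() variants subsample by shuffling; classic bagging
# instead draws a bootstrap sample with replacement. A hedged sketch
# (_bootstrap_sample_sketch is hypothetical; baggingfraction mirrors the
# constructor argument shown below, everything else is assumed):
def _bootstrap_sample_sketch(X, Y, baggingfraction=0.6):
    """Return a random subsample of rows drawn with replacement."""
    n = int(X.shape[0] * baggingfraction)
    idx = np.random.choice(X.shape[0], size=n, replace=True)
    return X[idx], Y[idx]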
def __init__(self, ntrees=10, treedepth=5, usebagging=False, baggingfraction=0.6,
             weaklearner="Conic", nsplits=10, nfeattest=None,
             posteriorprob=False, scalefeat=True):
    """
    Build a random forest classifier.

    Input:
    ---------------
        ntrees: number of trees in the random forest
        treedepth: depth of each tree
        usebagging: whether to use bagging when training multiple trees
        baggingfraction: fraction of the training set used to build each tree
        weaklearner: which weak learner to use at each internal node,
            e.g. "Conic", "Linear", "Axis-Aligned", "Axis-Aligned-Random"
        nsplits: number of splits to test during each feature-selection round
            when searching for the best information gain
        nfeattest: number of features to test for the random Axis-Aligned weak learner
        posteriorprob: whether to return posterior class probabilities
        scalefeat: whether to scale features
    """
    self.ntrees = ntrees
    self.treedepth = treedepth
    self.usebagging = usebagging
    self.baggingfraction = baggingfraction
    self.weaklearner = weaklearner
    self.nsplits = nsplits
    self.nfeattest = nfeattest
    self.posteriorprob = posteriorprob
    self.scalefeat = scalefeat

    self.trees = []
    for i in range(self.ntrees):
        self.trees.append(tree.DecisionTree(purity=0.95,
                                            maxdepth=self.treedepth,
                                            weaklearner=self.weaklearner))
def train(self, X, Y, vX=None, vY=None):
    '''
    Trains a RandomForest using the provided training set.

    Input:
    ---------
        X: an m x d matrix of training data
        Y: an m x 1 label matrix
        vX: an n x d matrix of validation data (used to stop growing the RF)
        vY: an n x 1 label matrix

    Returns:
    -----------
    '''
    nexamples, nfeatures = X.shape
    self.findScalingParameters(X)
    if self.scalefeat:
        X = self.applyScaling(X)

    self.trees = []
    self.classes = np.unique(Y)
    for i in range(self.ntrees):
        # np.random.shuffle works in place and returns None, so shuffle the
        # index array and use it to reorder the examples.
        order = np.arange(nexamples)
        np.random.shuffle(order)
        mixed_valX = np.squeeze(X[order])
        mixed_valY = np.squeeze(Y[order])
        dt = tree.DecisionTree(purity=0.9,
                               maxdepth=self.treedepth,
                               weaklearner=self.weaklearner,
                               nsplits=self.nsplits,
                               nfeattest=nfeatures)
        dt.train(mixed_valX, mixed_valY)
        self.trees.append(dt)
# Imports assumed from earlier in the original script.
import os
import numpy as np
import sklearn.metrics
import load_kc_data
from importlib import reload  # reload is not a builtin in Python 3

reload(load_kc_data)
import tree
reload(tree)
import svm
reload(svm)
import linear
reload(linear)
import ensemble
reload(ensemble)

os.chdir('My/Path/Here')
train_X, train_y, dev_X, dev_y, test_X, test_y = load_kc_data.load_kc_housing()

# Train models
tree_model = tree.DecisionTree(train_X, train_y, dev_X, dev_y, test_X, test_y)
linear_model = linear.LinearRegression(train_X, train_y, dev_X, dev_y, test_X, test_y)

# Results
def evaluate_model(clf, X, y):
    y_pred = clf.predict(X)
    rms = sklearn.metrics.mean_squared_error(y, y_pred)
    print("The model's RMS is " + str(rms) + ", which is "
          + str(100 * rms / np.var(y)) + "% of data variance.")

print("Decision Tree:")
evaluate_model(tree_model, test_X, test_y)
print("\nLinear Regression:")
evaluate_model(linear_model, test_X, test_y)

# Averaging the results of the two models
ensemble_rms = ensemble.Ensemble(test_X, test_y, [tree_model, linear_model])
def nFoldValidationIrisData(n, x, y):
    # We know we have balanced and sorted data, so we split it up so that it
    # can be distributed equally into n buckets.
    list_of_data_x, list_of_data_y = splitDatainNBuckets(n, x, y)
    rows, _ = np.shape(list_of_data_x[0])
    depth = 30
    iterations = 100
    indexes_shuffle = np.arange(rows)
    num_of_training_data = int(math.ceil(rows * 4 / 5))
    avg_acc_test = [0] * depth
    avg_acc_train = [0] * depth
    for _ in range(iterations):
        # Shuffle the data 100 times; shuffle the rows the same way for each bucket.
        np.random.shuffle(indexes_shuffle)
        for j in range(len(list_of_data_x)):
            list_of_data_x[j] = list_of_data_x[j][indexes_shuffle, :]
            list_of_data_y[j] = list_of_data_y[j][indexes_shuffle, :]
        for d in range(depth):
            # For each depth, calculate the accuracy.
            for j in range(len(list_of_data_x)):
                # For each bucket, take out random data for training and testing.
                training_data_x = list_of_data_x[j][0:num_of_training_data, :]
                training_data_y = list_of_data_y[j][0:num_of_training_data, :]
                testing_data_x = list_of_data_x[j][num_of_training_data:, :]
                testing_data_y = list_of_data_y[j][num_of_training_data:, :]
                # Train
                dTree = tree.DecisionTree(phase='Training', x=training_data_x,
                                          y=training_data_y, depth=d)
                # Test
                classified_y = tree.DecisionTree(phase='Validation',
                                                 x=testing_data_x, tree=dTree)
                accuracy = tree.calculateAccuracy(classified_y, testing_data_y)
                avg_acc_test[d] += accuracy
                classified_y_train = tree.DecisionTree(phase='Validation',
                                                       x=training_data_x, tree=dTree)
                accuracy_train = tree.calculateAccuracy(classified_y_train,
                                                        training_data_y)
                avg_acc_train[d] += accuracy_train
    avg_acc_test = [x / (iterations * len(list_of_data_x)) for x in avg_acc_test]
    avg_acc_train = [x / (iterations * len(list_of_data_x)) for x in avg_acc_train]
    return (avg_acc_test, avg_acc_train)
def fit(self):
    z_bound_left, z_bound_right = hyperplane.get_bounds(self.c_hyperplanes)
    phi_opt = self.born_again(z_bound_left, z_bound_right)
    return tree.DecisionTree(
        self.extract_optimal_solutions(z_bound_left, z_bound_right, phi_opt),
        self.columns)
def __init__(self):
    self.b = bayes.Bayes()
    self.t = tree.DecisionTree()
def trainGenreID(documents):
    data, labels = dataGenre(documents)
    s = tr.DecisionTree()
    s.train(data, labels, 10)
    return s

def trainYearID(documents):
    data, labels = dataYear(documents)
    s = tr.DecisionTree()
    s.train(data, labels, 10)
    return s

def trainTopID(documents):
    data, labels = dataTop(documents)
    s = tr.DecisionTree()
    s.train(data, labels, 30)
    return s

def trainBottomID(documents):
    data, labels = dataBottom(documents)
    s = tr.DecisionTree()
    s.train(data, labels, 30)
    return s
def main(data_type, class_labels_fn, class_names_fn, ft_names_fn, max_depth, limit_entities, limited_label_fn, vector_names_fn, dt_dev, doc_topic_prior, topic_word_prior, n_topics, file_name, final_csv_name, high_amt, low_amt, cross_val, rewrite_files, classify, tf_fn): print("importing class all") tf = np.asarray( sp.load_npz("../data/" + data_type + "/bow/frequency/phrases/" + tf_fn).todense()) names = dt.import1dArray(ft_names_fn) variables_to_execute = list( product(doc_topic_prior, topic_word_prior, n_topics)) print("executing", len(variables_to_execute), "variations") csvs = [] csv_fns = [] file_names = [] for vt in variables_to_execute: doc_topic_prior = vt[0] topic_word_prior = vt[1] n_topics = vt[2] file_names.append(file_name + "DTP" + str(doc_topic_prior) + "TWP" + str(topic_word_prior) + "NT" + str(n_topics)) final_csv_fn = "../data/" + data_type + "/rules/tree_csv/" + file_name + final_csv_name + ".csv" for vt in range(len(variables_to_execute)): doc_topic_prior = variables_to_execute[vt][0] topic_word_prior = variables_to_execute[vt][1] n_topics = variables_to_execute[vt][2] file_name = file_names[vt] LDA(tf, names, n_topics, file_name, doc_topic_prior, topic_word_prior, data_type, rewrite_files) dimension_names_fn = "../data/" + data_type + "/LDA/names/" + file_name + ".txt" #NMFFrob(dt.import2dArray("../data/"+data_type+"/bow/ppmi/class-all-100-10-all"), dt.import1dArray("../data/"+data_type+"/bow/names/100.txt"), 200, file_name) topic_model_fn = "../data/" + data_type + "/LDA/rep/" + file_name + ".txt" cv_fns = [] og_fn = file_name for c in range(cross_val): file_name = og_fn + " " + str(cross_val) + "CV " + str( c) + classify + "Dev" + str(dt_dev) csv_name = "../data/" + data_type + "/rules/tree_csv/" + file_name + ".csv" cv_fns.append(csv_name) tree.DecisionTree(topic_model_fn, class_labels_fn, class_names_fn, dimension_names_fn, file_name, 10000, max_depth=1, balance="balanced", criterion="entropy", save_details=False, cv_splits=cross_val, split_to_use=c, data_type=data_type, csv_fn=csv_name, rewrite_files=rewrite_files, development=dt_dev, limit_entities=limit_entities, limited_label_fn=limited_label_fn, vector_names_fn=vector_names_fn, clusters_fn=topic_model_fn, cluster_duplicates=True, save_results_so_far=False) tree.DecisionTree(topic_model_fn, class_labels_fn, class_names_fn, dimension_names_fn, file_name, 10000, max_depth=max_depth, balance="balanced", criterion="entropy", save_details=False, cv_splits=cross_val, split_to_use=c, data_type=data_type, csv_fn=csv_name, rewrite_files=rewrite_files, development=dt_dev, limit_entities=limit_entities, limited_label_fn=limited_label_fn, vector_names_fn=vector_names_fn, clusters_fn=topic_model_fn, cluster_duplicates=True, save_results_so_far=False) tree.DecisionTree(topic_model_fn, class_labels_fn, class_names_fn, dimension_names_fn, file_name + "None", 10000, max_depth=None, balance="balanced", criterion="entropy", save_details=False, data_type=data_type, csv_fn=csv_name, rewrite_files=rewrite_files, cv_splits=cross_val, split_to_use=c, development=dt_dev, limit_entities=limit_entities, limited_label_fn=limited_label_fn, vector_names_fn=vector_names_fn, clusters_fn=topic_model_fn, cluster_duplicates=True, save_results_so_far=False) dt.averageCSVs(cv_fns) file_name = og_fn + " " + str(cross_val) + "CV " + str( 0) + classify + "Dev" + str(dt_dev) csvs.append("../data/" + data_type + "/rules/tree_csv/" + file_name + "AVG.csv") dt.arrangeByScore(np.unique(np.asarray(csvs)), final_csv_fn)
def main(): hidden_layer_sizes = [100, 100, 100, 100, 100, 100] file_names = [] data_type = "wines" for f in range(len(hidden_layer_sizes)): #file_names.append("filmsBOWL" + str(f + 1) + "" + str(hidden_layer_sizes[f])) #file_names.append("filmsPPMIDropoutL"+str(f+1)+""+str(hidden_layer_sizes[f])) file_names.append(data_type + "100L" + str(f + 1) + "" + str(hidden_layer_sizes[f])) #init_vector_path= "../data/" + data_type + "/bow/binary/phrases/class-all" #init_vector_path = "../data/" + data_type + "/bow/ppmi/class-all" #init_vector_path="../data/" + data_type + "/nnet/spaces/films200L1100N0.5pavPPMIN0.5FTadagradcategorical_crossentropy100.txt" init_vector_path = "../data/" + data_type + "/nnet/spaces/wines100.txt" end_file_names = [] # Class and vector inputs for i in range(len(file_names)): #These are the parameter values hidden_layer_size = hidden_layer_sizes[i] batch_size = 200 reg = 0.0 noise = 0.5 dropout_noise = None file_name = file_names[i] + "N" + str(noise) hidden_activation = "tanh" output_activation = "tanh" optimizer_name = "sgd" learn_rate = 0.01 epochs = 500 activity_reg = 0 loss = "mse" class_path = None print(file_name) #deep_size = hidden_layer_sizes[i] deep_size = None if deep_size is not None: file_name = file_name + "DL" + str(deep_size) #NN Setup """ SDA = NeuralNetwork( noise=noise, optimizer_name=optimizer_name, batch_size=batch_size, epochs=epochs, dropout_noise=dropout_noise, vector_path=init_vector_path, hidden_layer_size=hidden_layer_size, class_path=class_path, reg=reg, data_type=data_type, hidden_activation=hidden_activation, output_activation=output_activation, learn_rate=learn_rate, file_name=file_name, network_type="da", deep_size=deep_size, activity_reg=activity_reg) """ file_name = "wines100trimmed" vector_path = "../data/" + data_type + "/nnet/spaces/" + file_name + ".txt" init_vector_path = "../data/" + data_type + "/nnet/spaces/" + file_name + ".txt" past_model_weights_fn = [ "../data/" + data_type + "/nnet/weights/L1" + file_name + ".txt" ] past_model_bias_fn = [ "../data/" + data_type + "/nnet/bias/L1" + file_name + ".txt" ] hidden_space_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".txt" # Get SVM scores lowest_count = 50 highest_count = 0 #vector_path = "../data/" + data_type + "/nnet/spaces/"+file_name+"L1.txt" class_path = "../data/" + data_type + "/bow/binary/phrases/class-all-" + str( lowest_count) property_names_fn = "../data/" + data_type + "/bow/names/" + str( lowest_count) + ".txt" svm_type = "svm" file_name = file_name + svm_type """ svm.getSVMResults(vector_path, class_path, property_names_fn, file_name, lowest_count=lowest_count, highest_count=highest_count, svm_type=svm_type, get_kappa=True, get_f1=False, single_class=True, data_type=data_type) """ directions_fn = "../data/" + data_type + "/svm/directions/" + file_name + str( lowest_count) + ".txt" # Get rankings vector_names_fn = "../data/" + data_type + "/nnet/spaces/entitynames.txt" class_names_fn = "../data/" + data_type + "/bow/names/" + str( lowest_count) + ".txt" directions_fn = "../data/" + data_type + "/svm/directions/" + file_name + str( lowest_count) + ".txt" #rank.getAllPhraseRankings(directions_fn, vector_path, class_names_fn, vector_names_fn, file_name, data_type=data_type) #ndcg.getNDCG("../data/" + data_type + "/rank/numeric/"+file_name+"ALL.txt",file_name, data_type, lowest_count) scores_fn = "../data/" + data_type + "/ndcg/" + file_name + ".txt" file_name = file_name + "ndcg" kappa = False #scores_fn = "../data/" + data_type + "/svm/kappa/" + file_name 
+ str(lowest_count)+".txt" #file_name = file_name + "kappa" #kappa = True # Get clusters amt_high_directions = hidden_layer_size * 2 amt_low_directions = 13000 amt_of_clusters = hidden_layer_size * 2 #scores_fn = "../data/" + data_type + "/svm/kappa/"+file_name+"200.txt" #file_name = file_name + "similarityclustering" #cluster.getClusters(directions_fn, scores_fn, class_names_fn, False, amt_high_directions, amt_low_directions, file_name, amt_of_clusters) clusters_fn = "../data/" + data_type + "/cluster/clusters/" + file_name + ".txt" property_names_fn = "../data/" + data_type + "/cluster/names/" + file_name + ".txt" percentage_bin = 1 #rank.getAllRankings(clusters_fn, vector_path, property_names_fn, vector_names_fn, 0.2, 1, False, file_name, False, data_type) names_fn = "../data/" + data_type + "/bow/names/" + str( lowest_count) + ".txt" dissimilarity_threshold = 0.5 similarity_threshold = 0.9 cluster_amt = 200 amount_to_start = 8000 score_limit = 0.95 print(file_name) add_all_terms = False file_name = file_name + "not all terms" + str(score_limit) hierarchy.initClustering(vector_path, directions_fn, scores_fn, names_fn, amount_to_start, False, dissimilarity_threshold, cluster_amt, score_limit, file_name, kappa, similarity_threshold, add_all_terms, data_type) # Get rankings clusters_fn = "../data/" + data_type + "/cluster/hierarchy_directions/" + file_name + ".txt" property_names_fn = "../data/" + data_type + "/cluster/hierarchy_names/" + file_name + ".txt" vector_names_fn = "../data/" + data_type + "/nnet/spaces/entitynames.txt" rank.getAllRankings(clusters_fn, vector_path, property_names_fn, vector_names_fn, 0.2, 1, False, file_name, False, data_type) #file_name = "films100previouswork" # Get PAV ranking_fn = "../data/" + data_type + "/rank/numeric/" + file_name + ".txt" #fto.pavPPMI(property_names_fn, ranking_fn, file_name, data_type) #fto.pavTermFrequency(ranking_fn, cluster_names_fn, file_name, False) #fto.binaryClusterTerm(cluster_names_fn, file_name) #fto.binaryInCluster(property_names_fn, file_name) discrete_labels_fn = "../data/" + data_type + "/rank/discrete/" + file_name + "P1.txt" # Use PAV as class vectors fine_tune_weights_fn = [clusters_fn] epochs = 2000 batch_size = 200 learn_rate = 0.001 is_identity = True identity_swap = False randomize_finetune_weights = False corrupt_finetune_weights = False from_ae = True #from_ae = False finetune_size = 200 fn = file_name # Running Finetune on original space file_name = file_name + "pavPPMI" class_path = "../data/" + data_type + "/finetune/" + file_name + ".txt" if randomize_finetune_weights: fine_tune_weights_fn = None file_name = file_name + "N" + str(noise) + "FTR" elif corrupt_finetune_weights: file_name = file_name + "N" + str(noise) + "FTC" else: file_name = file_name + "N" + str(noise) + "FT" if is_identity: file_name = file_name + "IT" if identity_swap: file_name = file_name + "ITS" file_name = file_name + "ITS" print(file_name) loss = "mse" optimizer_name = "sgd" hidden_activation = "tanh" finetune_activation = "linear" file_name = file_name + optimizer_name + loss + str(epochs) print(file_name) amount_of_finetune = 1 """ SDA = NeuralNetwork( noise=0, fine_tune_weights_fn=fine_tune_weights_fn, optimizer_name=optimizer_name, network_type="ft", past_model_bias_fn=past_model_bias_fn, randomize_finetune_weights=randomize_finetune_weights, vector_path=init_vector_path, hidden_layer_size=hidden_layer_size, class_path=class_path, amount_of_finetune=amount_of_finetune, identity_swap=identity_swap, hidden_activation=hidden_activation, 
output_activation=output_activation, epochs=epochs, learn_rate=learn_rate, is_identity=is_identity, finetune_activation=finetune_activation, batch_size=batch_size, past_model_weights_fn = past_model_weights_fn, loss=loss, file_name=file_name, from_ae=from_ae, finetune_size=finetune_size, data_type=data_type) """ init_vector_path = "../data/" + data_type + "/nnet/spaces/" + file_name + "L1.txt" # Get SVM scores lowest_count = 200 highest_count = 10000 vector_path = "../data/" + data_type + "/nnet/spaces/" + file_name + "L1.txt" class_path = "../data/" + data_type + "/bow/binary/phrases/class-all-" + str( lowest_count) property_names_fn = "../data/" + data_type + "/bow/names/" + str( lowest_count) + ".txt" svm_type = "svm" file_name = file_name + svm_type #svm.getSVMResults(vector_path, class_path, property_names_fn, file_name, lowest_count=lowest_count, highest_count=highest_count, svm_type=svm_type, get_kappa=False, get_f1=False) # Get rankings vector_names_fn = "../data/" + data_type + "/nnet/spaces/entitynames.txt" class_names_fn = "../data/" + data_type + "/bow/names/" + str( lowest_count) + ".txt" directions_fn = "../data/" + data_type + "/svm/directions/" + file_name + str( lowest_count) + ".txt" #rank.getAllPhraseRankings(directions_fn, vector_path, property_names_fn, vector_names_fn, file_name) # file_name = file_name + "ndcg" #ndcg.getNDCG("../data/" + data_type + "/rank/numeric/"+file_name+"ALL.txt",file_name) names_fn = "../data/" + data_type + "/bow/names/" + str( lowest_count) + ".txt" similarity_threshold = 0.5 cluster_amt = 200 amount_to_start = 8000 score_limit = 0.9 print(file_name) #hierarchy.initClustering(vector_path, directions_fn, scores_fn, names_fn, amount_to_start, False, similarity_threshold, cluster_amt, score_limit, file_name, kappa) """ scores_fn = "../data/" + data_type + "/svm/kappa/" + file_name + "200.txt" file_name = file_name + "kappa" kappa = True hierarchy.initClustering(vector_path, directions_fn, scores_fn, names_fn, amount_to_start, False, similarity_threshold, cluster_amt, score_limit, file_name, kappa) """ # Get rankings clusters_fn = "../data/" + data_type + "/cluster/hierarchy_directions/" + file_name + str( score_limit) + str(cluster_amt) + ".txt" property_names_fn = "../data/" + data_type + "/cluster/hierarchy_names/" + file_name + str( score_limit) + str(cluster_amt) + ".txt" vector_names_fn = "../data/" + data_type + "/nnet/spaces/entitynames.txt" #rank.getAllRankings(clusters_fn, vector_path, property_names_fn, vector_names_fn, 0.2, 1, False, file_name, False) cluster_to_classify = -1 max_depth = 50 label_names_fn = "../data/" + data_type + "/classify/keywords/names.txt" cluster_labels_fn = "../data/" + data_type + "/classify/keywords/class-All" cluster_names_fn = "../data/" + data_type + "/cluster/hierarchy_names/" + fn + str( score_limit) + ".txt" #clf = tree.DecisionTree(clusters_fn, cluster_labels_fn, label_names_fn, cluster_names_fn, file_name, 10000, max_depth) fn_to_place = "films100L3100N0.5" score_limit = 0.8 cluster_amt = 200 property_names_fn = "../data/" + data_type + "/cluster/hierarchy_names/" + fn_to_place + str( score_limit) + str(cluster_amt) + ".txt" ranking_fn = "../data/" + data_type + "/rank/numeric/" + fn_to_place + ".txt" #fto.pavPPMI(property_names_fn, ranking_fn, fn_to_place) end_file_names = [ "L1films100L3100N0.5InClusterN0.5FTadagradcategorical_crossentropy100Genres100L3", "L2films100L3100N0.5InClusterN0.5FTadagradcategorical_crossentropy100Genres100L3", 
"L3films100L3100N0.5InClusterN0.5FTadagradcategorical_crossentropy100Genres100L3" ] init_vector_path = "../data/" + data_type + "/nnet/spaces/films100.txt" past_model_weights_fn = [] past_model_bias_fn = [] for f in end_file_names: past_model_weights_fn.append("../data/" + data_type + "/nnet/weights/" + f + ".txt") past_model_bias_fn.append("../data/" + data_type + "/nnet/bias/" + f + ".txt") class_path = "../data/" + data_type + "/classify/genres/class-all" loss = "binary_crossentropy" output_activation = "sigmoid" optimizer_name = "adagrad" hidden_activation = "tanh" learn_rate = 0.01 fine_tune_weights_fn = None randomize_finetune_weights = False epochs = 100 batch_size = 200 hidden_layer_size = 400 is_identity = False dropout_noise = None from_ae = True identity_swap = False file_name = end_file_names[len(end_file_names) - 1] """ score_limit = 0.8 cluster_amt = 400 clusters_fn = "../data/" + data_type + "/cluster/hierarchy_directions/" + fn_to_place + str(score_limit) + str( cluster_amt) + ".txt" fine_tune_weights_fn = [clusters_fn] randomize_finetune_weights = False class_path ="../data/" + data_type + "/finetune/" + fn_to_place + "pavPPMI.txt" loss = "mse" output_activation = "linear" batch_size = 200 hidden_layer_size = 100 epochs = 250 file_name = file_name + "Genres" + str(epochs) + "L" + str(len(end_file_names)) """ """ deep_size = 400 epochs = 299 from_ae = False past_model_weights_fn = None past_model_bias_fn = None fine_tune_weights_fn = None is_identity = True amount_of_finetune = 5 randomize_finetune_weights = True file_name = "films100" finetune_size = cluster_amt init_vector_path = "../data/" + data_type + "/rank/numeric/"+file_name+".txt" file_name = file_name + "rank" + "E" + str(epochs) + "DS" + str(deep_size) + "L" + str(amount_of_finetune) SDA = NeuralNetwork(noise=0, fine_tune_weights_fn=fine_tune_weights_fn, optimizer_name=optimizer_name, network_type="ft", past_model_bias_fn=past_model_bias_fn, deep_size=deep_size, randomize_finetune_weights=randomize_finetune_weights, amount_of_finetune=amount_of_finetune, vector_path=init_vector_path, hidden_layer_size=hidden_layer_size, class_path=class_path, identity_swap=identity_swap, dropout_noise=dropout_noise, hidden_activation=hidden_activation, output_activation=output_activation, epochs=epochs, learn_rate=learn_rate, is_identity=is_identity, finetune_size = finetune_size, batch_size=batch_size, past_model_weights_fn=past_model_weights_fn, loss=loss, file_name=file_name, from_ae=from_ae) """ deep_size = 400 epochs = 299 from_ae = True #past_model_weights_fn = None #past_model_bias_fn = None #file_name = "films100" fine_tune_weights_fn = None is_identity = False amount_of_finetune = 0 randomize_finetune_weights = False #file_name = end_file_names[len(end_file_names)-1] #init_vector_path = "../data/" + data_type + "/nnet/spaces/films100.txt" score_limit = 0.9 cluster_amt = 400 output_size = 23 hidden_layer_size = 100 epochs = 200 class_outputs = True optimizer_name = "adagrad" learn_rate = 0.01 output_activation = "sigmoid" finetune_activation = "linear" hidden_activation = "tanh" finetune_size = cluster_amt file_name = "films100" original_fn = file_name init_vector_path = "../data/" + data_type + "/rank/numeric/" + file_name + "svmndcg0.9" + str( cluster_amt) + ".txt" clusters_fn = "../data/" + data_type + "/cluster/hierarchy_directions/" + file_name + "svmndcg0.9" + str( cluster_amt) + ".txt" deep_size = [100, 100, 100] fine_tune_weights_fn = [clusters_fn] fine_tune_weights_fn = "" class_path = "../data/" + data_type + 
"/classify/genres/class-All" from_ae = False file_name = file_name + "rank" + "E" + str(epochs) + "DS" + str( deep_size) + "L" + str(len(deep_size)) + str(cluster_amt) """ SDA = NeuralNetwork(noise=0, fine_tune_weights_fn=fine_tune_weights_fn, optimizer_name=optimizer_name, network_type="ft", past_model_bias_fn=past_model_bias_fn, deep_size=deep_size, finetune_activation=finetune_activation, randomize_finetune_weights=randomize_finetune_weights, amount_of_finetune=amount_of_finetune, vector_path=init_vector_path, hidden_layer_size=hidden_layer_size, class_path=class_path, identity_swap=identity_swap, dropout_noise=dropout_noise, class_outputs=class_outputs, hidden_activation=hidden_activation, output_activation=output_activation, epochs=epochs, learn_rate=learn_rate, is_identity=is_identity, output_size=output_size, finetune_size=finetune_size, batch_size=batch_size, past_model_weights_fn=past_model_weights_fn, loss=loss, file_name=file_name, from_ae=from_ae) """ data_type = "wines" classification_task = "types" file_name = "wines100trimmed" init_vector_path = "../data/" + data_type + "/nnet/spaces/" + file_name + ".txt" #file_name = "winesppmi" #init_vector_path = "../data/wines/bow/ppmi/class-trimmed-all-50" deep_size = [100, 100, 100] for d in range(len(deep_size)): print(deep_size, init_vector_path) loss = "binary_crossentropy" output_activation = "sigmoid" optimizer_name = "adagrad" hidden_activation = "tanh" classification_path = "../data/" + data_type + "/classify/" + classification_task + "/class-all" learn_rate = 0.01 fine_tune_weights_fn = None epochs = 500 batch_size = 200 class_outputs = True dropout_noise = 0.3 is_identity = False identity_swap = False randomize_finetune_weights = False hidden_layer_size = 100 output_size = 10 randomize_finetune_weights = False corrupt_finetune_weights = False fine_tune_weights_fn = [] #init_vector_path = "../data/" + data_type + "/movies/bow/binary/phrases/class-all" if d == 0: file_name = file_name + "rank" + "E" + str(epochs) + "DS" + str(deep_size) + "L" + str(amount_of_finetune)\ + "DN" + str(dropout_noise) + hidden_activation + "SFT" + str(d) else: file_name = file_name + "SFT" + str(d) print("!!!!!!!!!!!!!!!", deep_size) SDA = NeuralNetwork( noise=0, fine_tune_weights_fn=fine_tune_weights_fn, optimizer_name=optimizer_name, network_type="ft", past_model_bias_fn=past_model_bias_fn, deep_size=deep_size, finetune_activation=finetune_activation, randomize_finetune_weights=randomize_finetune_weights, amount_of_finetune=amount_of_finetune, vector_path=init_vector_path, hidden_layer_size=hidden_layer_size, class_path=classification_path, identity_swap=identity_swap, dropout_noise=dropout_noise, class_outputs=class_outputs, hidden_activation=hidden_activation, output_activation=output_activation, epochs=epochs, learn_rate=learn_rate, is_identity=is_identity, output_size=output_size, finetune_size=finetune_size, batch_size=batch_size, past_model_weights_fn=past_model_weights_fn, loss=loss, file_name=file_name, from_ae=from_ae, data_type=data_type) new_file_names = [] if dropout_noise is not None and dropout_noise > 0.0: for j in range(0, len(deep_size) * 2 + 1, 2): new_fn = file_name + "L" + str(j) new_file_names.append(new_fn) else: for j in range(0, len(deep_size) + 1): new_fn = file_name + "L" + str(j) new_file_names.append(new_fn) for j in range(len(new_file_names)): #file_name = "wines100trimmed" #file_name = "films100rankE200DS[100, 100, 100]L3300L1svmndcg0.9200pavPPMIN0.5FTITsgdmse2000L1rankE100DS[100, 100]L0" file_name = new_file_names[j] 
past_model_weights_fn = [ "../data/" + data_type + "/nnet/weights/" + file_name + ".txt" ] past_model_bias_fn = [ "../data/" + data_type + "/nnet/bias/" + file_name + ".txt" ] # Get SVM scores if data_type is "wines" or "placetypes": lowest_count = 50 else: lowest_count = 200 highest_count = 10000 vector_path = "../data/" + data_type + "/nnet/spaces/" + file_name + ".txt" class_path = "../data/" + data_type + "/bow/binary/phrases/class-all-" + str( lowest_count) property_names_fn = "../data/" + data_type + "/bow/names/" + str( lowest_count) + ".txt" svm_type = "svm" threads = 4 file_name = file_name + svm_type svm.getSVMResults(vector_path, class_path, property_names_fn, file_name, lowest_count=lowest_count, highest_count=highest_count, svm_type=svm_type, data_type=data_type, get_kappa=True, get_f1=False, getting_directions=True, threads=4) directions_fn = "../data/" + data_type + "/svm/directions/" + file_name + str( lowest_count) + ".txt" # Get rankings vector_names_fn = "../data/" + data_type + "/nnet/spaces/entitynames.txt" class_names_fn = "../data/" + data_type + "/bow/names/" + str( lowest_count) + ".txt" directions_fn = "../data/" + data_type + "/svm/directions/" + file_name + str( lowest_count) + ".txt" """ scores_fn = "../data/" + data_type + "/svm/kappa/" + file_name + str(lowest_count) + ".txt" kappa = True if d == 0: file_name = file_name + "kappa" """ rank.getAllPhraseRankings(directions_fn, vector_path, class_names_fn, vector_names_fn, file_name, data_type=data_type) ndcg.getNDCG("../data/" + data_type + "/rank/numeric/" + file_name + "ALL.txt", file_name, data_type=data_type, lowest_count=lowest_count) scores_fn = "../data/" + data_type + "/ndcg/" + file_name + ".txt" kappa = False if d == 0: file_name = file_name + "ndcg" names_fn = "../data/" + data_type + "/bow/names/" + str( lowest_count) + ".txt" similarity_threshold = 0.5 cluster_amt = deep_size[j] * 2 amount_to_start = 8000 score_limit = 0.9 dissimilarity_threshold = 0.9 file_name = file_name + str(score_limit) + str(cluster_amt) hierarchy.initClustering(vector_path, directions_fn, scores_fn, names_fn, amount_to_start, False, similarity_threshold, cluster_amt, score_limit, file_name, kappa, dissimilarity_threshold, data_type=data_type) # Get rankings clusters_fn = "../data/" + data_type + "/cluster/hierarchy_directions/" + file_name + ".txt" property_names_fn = "../data/" + data_type + "/cluster/hierarchy_names/" + file_name + ".txt" vector_names_fn = "../data/" + data_type + "/nnet/spaces/entitynames.txt" rank.getAllRankings(clusters_fn, vector_path, property_names_fn, vector_names_fn, 0.2, 1, False, file_name, False, data_type=data_type) # Get PAV ranking_fn = "../data/" + data_type + "/rank/numeric/" + file_name + ".txt" label_names_fn = "../data/" + data_type + "/classify/" + classification_task + "/names.txt" tree.DecisionTree(ranking_fn, classification_path, label_names_fn, property_names_fn, file_name, 10000, 3, balance="balanced", criterion="entropy", save_details=False, data_type=data_type) tree.DecisionTree(ranking_fn, classification_path, label_names_fn, property_names_fn, file_name, 10000, None, balance="balanced", criterion="entropy", save_details=False, data_type=data_type) if d == 0: file_name = file_name + "pavPPMI" fto.pavPPMI(property_names_fn, ranking_fn, file_name, data_type=data_type) discrete_labels_fn = "../data/" + data_type + "/rank/discrete/" + file_name + "P1.txt" # Use PAV as class vectors fine_tune_weights_fn = [clusters_fn] epochs = 1000 batch_size = 200 learn_rate = 0.001 is_identity = 
True identity_swap = False randomize_finetune_weights = False # from_ae = False finetune_size = cluster_amt fn = file_name # Running Finetune on original space class_path = "../data/" + data_type + "/finetune/" + file_name + ".txt" if d == 0: file_name = file_name + "IT" print(file_name) loss = "mse" optimizer_name = "sgd" hidden_activation = "tanh" finetune_activation = "linear" hidden_layer_size = deep_size[j] if d == 0: file_name = file_name + optimizer_name + loss + str(epochs) from_ae = True past_model_weights_fn = [ "../data/" + data_type + "/nnet/weights/L" + new_file_names[j] + ".txt" ] past_model_bias_fn = [ "../data/" + data_type + "/nnet/bias/L" + new_file_names[j] + ".txt" ] print(file_name) amount_of_finetune = 1 SDA = NeuralNetwork( noise=0, fine_tune_weights_fn=fine_tune_weights_fn, optimizer_name=optimizer_name, network_type="ft", past_model_bias_fn=past_model_bias_fn, randomize_finetune_weights=randomize_finetune_weights, vector_path=init_vector_path, hidden_layer_size=hidden_layer_size, class_path=class_path, identity_swap=identity_swap, amount_of_finetune=amount_of_finetune, hidden_activation=hidden_activation, output_activation=output_activation, epochs=epochs, learn_rate=learn_rate, is_identity=is_identity, finetune_activation=finetune_activation, batch_size=batch_size, past_model_weights_fn=past_model_weights_fn, loss=loss, file_name=file_name, from_ae=from_ae, finetune_size=finetune_size, data_type=data_type) new_file_names[j - 1] = file_name ranking_fn = "../data/" + data_type + "/nnet/clusters/" + file_name + ".txt" tree.DecisionTree(ranking_fn, classification_path, label_names_fn, property_names_fn, file_name, 10000, 3, balance="balanced", criterion="entropy", save_details=False, data_type=data_type) tree.DecisionTree(ranking_fn, classification_path, label_names_fn, property_names_fn, file_name, 10000, None, balance="balanced", criterion="entropy", save_details=False, data_type=data_type) """ file_name ="films100rankE200DS[100, 100, 100]L3300L1svmndcg0.9200pavPPMIN0.5FTITsgdmse2000L1" loss = "binary_crossentropy" output_activation = "sigmoid" optimizer_name = "adagrad" hidden_activation = "tanh" class_path = "../data/" + data_type + "/classify/genres/class-all" learn_rate = 0.01 fine_tune_weights_fn = None epochs = 100 batch_size = 200 class_outputs = True dropout_noise = None deep_size = [100, 100] hidden_layer_size = 100 output_size = 23 randomize_finetune_weights = False corrupt_finetune_weights = False fine_tune_weights_fn = [] init_vector_path = "../data/" + data_type + "/nnet/clusters/" + file_name + ".txt" file_name = file_name + "rank" + "E" + str(epochs) + "DS" + str(deep_size) + "L" + str(amount_of_finetune) SDA = NeuralNetwork(noise=0, fine_tune_weights_fn=fine_tune_weights_fn, optimizer_name=optimizer_name, network_type="ft", past_model_bias_fn=past_model_bias_fn, deep_size=deep_size, randomize_finetune_weights=randomize_finetune_weights, output_size=output_size, amount_of_finetune=amount_of_finetune, class_outputs=class_outputs, vector_path=init_vector_path, hidden_layer_size=hidden_layer_size, class_path=class_path, identity_swap=identity_swap, dropout_noise=dropout_noise, hidden_activation=hidden_activation, output_activation=output_activation, epochs=epochs, learn_rate=learn_rate, is_identity=is_identity, finetune_size=finetune_size, batch_size=batch_size, past_model_weights_fn=past_model_weights_fn, loss=loss, file_name=file_name, from_ae=from_ae) """ file_name = new_file_names[0] init_vector_path = "../data/" + data_type + "/nnet/spaces/" + 
file_name + "L0.txt" deep_size = deep_size[:len(deep_size) - 1]