def __init__(self, directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, fn, percent,
             percentage_increment, by_vector):
    """Compute rankings of vectors on directions and write them to disk.

    Loads directions, vectors and both name lists through ``dt``, asks
    ``self.getRankings`` for the rankings, derives discrete labels with
    ``self.createDiscreteLabels`` and writes both arrays under ``Rankings/``.

    Args:
        directions_fn: File read with ``dt.importVectors`` for the directions.
        vectors_fn: File read with ``dt.importVectors`` for the vectors.
        cluster_names_fn: File read with ``dt.importString``.
        vector_names_fn: File read with ``dt.importString``.
        fn: Base name used for the output files.
        percent: Unused by the active code (see NOTE below).
        percentage_increment: Passed to ``self.createDiscreteLabels`` and
            embedded in the ``.discrete`` output file name.
        by_vector: When truthy, both arrays are transposed before writing.
    """
    directions = dt.importVectors(directions_fn)
    vectors = dt.importVectors(vectors_fn)
    cluster_names = dt.importString(cluster_names_fn)
    vector_names = dt.importString(vector_names_fn)

    rankings = np.array(self.getRankings(directions, vectors, cluster_names, vector_names))
    discrete_labels = np.asarray(self.createDiscreteLabels(rankings, percentage_increment))

    # NOTE: the thresholded-label output (self.createLabels(rankings, percent),
    # written to "Rankings/" + fn + "P" + str(percent) + ".labels") is disabled,
    # which is why `percent` is currently unused.
    if by_vector:
        discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()

    dt.write2dArray(rankings, "Rankings/" + fn + ".space")
    dt.write2dArray(discrete_labels, "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")

    # NOTE(review): the original text opens a triple-quoted "Disabled names for
    # quick view now" block here that is cut off in this view. The dangling
    # literal is replaced by this comment so that it no longer swallows the
    # definitions that follow. `array` / `short_array` are presumably consumed by
    # code beyond the visible text - confirm before removing them.
    array = []
    short_array = []
def __init__(self, epochs=1, learn_rate=0.01, loss="mse", batch_size=1, decay=1e-06, hidden_activation="tanh",
             layer_init="glorot_uniform", output_activation="tanh", hidden_layer_size=100,
             file_name="unspecified_filename", vector_path=None, reg=0, optimizer_name="rmsprop", class_names=None,
             noise=0, output_weights=None):
    """Train a one-hidden-layer auto-encoder and save its hidden-layer space.

    The vectors at ``vector_path`` are used as both input and target. After
    fitting, a second model containing only the (trained) hidden layer is
    used to project the vectors; the result is kept in ``self.end_space`` and
    saved under ``newdata/spaces/``.

    Args:
        epochs: Number of training epochs.
        learn_rate: Learning rate handed to the optimizer.
        loss: Loss name passed to ``compile``.
        batch_size: Mini-batch size passed to ``fit``.
        decay: Learning-rate decay; only used by the "sgd" optimizer.
        hidden_activation: Activation of the hidden layer.
        layer_init: Weight initialisation name for the Dense layers.
        output_activation: Activation of the output layer.
        hidden_layer_size: Number of hidden units.
        file_name: Base name for the cached input vectors and the saved space.
        vector_path: Path of the input vectors (numpy file, or a format
            readable by ``dt.importVectors``).
        reg: L2 regularisation strength of the hidden layer.
        optimizer_name: Either "sgd" or "rmsprop".
        class_names: Unused.
        noise: Gaussian noise level; a noise layer is added to the trained
            model only when this is greater than 0.
        output_weights: Optional initial weights for the output layer.

    Raises:
        ValueError: If ``optimizer_name`` is not "sgd" or "rmsprop" (the
            original failed later with a NameError on ``optimizer``).
    """
    if optimizer_name == "sgd":
        optimizer = SGD(lr=learn_rate, decay=decay)
    elif optimizer_name == "rmsprop":
        optimizer = RMSprop(lr=learn_rate)
    else:
        raise ValueError("Unsupported optimizer_name: %r (expected 'sgd' or 'rmsprop')" % (optimizer_name,))

    # Import the numpy vectors
    try:
        movie_vectors = np.asarray(np.load(vector_path))
    except (IOError, OSError):
        # Not loadable by numpy: assume the standard text format for vectors and
        # cache it in numpy format. IOError is listed explicitly because it is
        # not a subclass of OSError on Python 2.
        movie_vectors = np.asarray(dt.importVectors(vector_path))
        np.save(file_name, movie_vectors)

    # Input and output have the same size, as this is an auto-encoder
    input_size = len(movie_vectors[0])
    output_size = input_size

    def build_hidden_layer(weights=None):
        # A fresh layer (and regularizer) per model, so nothing is shared
        # between the trained model and the truncated one.
        return Dense(output_dim=hidden_layer_size, input_dim=input_size, init=layer_init,
                     activation=hidden_activation, W_regularizer=l2(reg), weights=weights)

    # Initialize the model
    self.model = Sequential()
    if noise > 0:
        # Noisy auto-encoder: corrupt the input before the hidden layer
        self.model.add(GaussianNoise(noise, input_shape=(input_size,)))
    self.model.add(build_hidden_layer())

    # Apply custom hidden-to-output weights when given, otherwise add a plain output layer.
    # `is None` avoids an element-wise comparison if an array is passed.
    if output_weights is None:
        self.model.add(Dense(output_dim=output_size, init=layer_init, activation=output_activation))
    else:
        # NOTE(review): output_dim is taken from len(output_weights[0]); confirm
        # this matches the layout of the weights callers pass in.
        self.model.add(Dense(output_dim=len(output_weights[0]), init=layer_init, activation=output_activation,
                             weights=output_weights))

    # Compile the model and fit it to the data
    self.model.compile(loss=loss, optimizer=optimizer)
    self.model.fit(movie_vectors, movie_vectors, nb_epoch=epochs, batch_size=batch_size, verbose=1)

    # Build a truncated model without the output layer. The hidden layer is the
    # second-to-last layer of the trained model; its weights are copied across
    # explicitly - the original created a freshly initialised layer here, so the
    # saved space did not reflect the training at all.
    trained_hidden_weights = self.model.layers[-2].get_weights()
    truncated_model = Sequential()
    total_file_name = "newdata/spaces/" + file_name + ".mds"
    truncated_model.add(GaussianNoise(noise, input_shape=(input_size,)))
    truncated_model.add(build_hidden_layer(weights=trained_hidden_weights))
    truncated_model.compile(loss=loss, optimizer=optimizer)
    self.end_space = truncated_model.predict(movie_vectors)

    # np.save takes (file, arr); the original call had the two swapped.
    np.save(total_file_name, self.end_space)
def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn, label_names_fn, cluster_names_fn,
             filename, training_data, cluster_to_classify, max_depth):
    """Fit one decision tree per label column and export scores and rules.

    For every label column a ``DecisionTreeClassifier`` is trained on the
    first ``training_data`` vectors and evaluated (binary F1 and accuracy) on
    the rest. Each tree is exported as a ``.dot`` file and a PNG under
    ``Rules/`` and handed to ``self.get_code``; all scores are written to
    ``Rules/Scores/`` at the end. ``self.clf`` holds the last tree fitted.

    Args:
        cluster_vectors_fn: File read with ``dt.importVectors`` (features).
        cluster_labels_fn: File read with ``dt.importLabels`` (one row per
            vector, one column per label).
        movie_names_fn: Unused; kept for interface compatibility.
        label_names_fn: File read with ``dt.importString`` (label names).
        cluster_names_fn: File read with ``dt.importString``; used as the
            feature names of the exported tree.
        filename: Suffix used in all output file names.
        training_data: Number of leading rows used for training.
        cluster_to_classify: Unused; kept for interface compatibility.
        max_depth: Maximum depth of each decision tree.
    """
    vectors = dt.importVectors(cluster_vectors_fn)
    labels = dt.importLabels(cluster_labels_fn)
    cluster_names = dt.importString(cluster_names_fn)
    label_names = dt.importString(label_names_fn)

    # The feature split is identical for every label, so build it once.
    x_train = np.asarray(vectors[:training_data])
    x_test = np.asarray(vectors[training_data:])

    scores_array = []
    for l in range(len(labels[0])):
        label_name = label_names[l]

        # Take the column as-is. The original copied it into a list padded to a
        # hard-coded 15000 entries, which raised IndexError for larger datasets
        # and made y_test longer than x_test for smaller ones.
        column = [row[l] for row in labels]
        y_train = np.asarray(column[:training_data])
        y_test = np.asarray(column[training_data:])

        self.clf = tree.DecisionTreeClassifier(max_depth=max_depth)
        self.clf = self.clf.fit(x_train, y_train)
        y_pred = self.clf.predict(x_test)

        f1 = f1_score(y_test, y_pred, average='binary')
        accuracy = accuracy_score(y_test, y_pred)
        scores = [[label_name, "f1", f1, "accuracy", accuracy]]
        print(scores[0])
        scores_array.append(scores)

        # NOTE(review): export_graphviz documents class_names as being in
        # ascending class order; if the labels are 0/1 with 1 meaning "has the
        # label", this order looks inverted. Left unchanged because get_code
        # receives the same list - confirm before swapping.
        class_names = [label_name, "NOT " + label_name]
        dot_path = 'Rules/' + label_name + filename + '.dot'
        tree.export_graphviz(self.clf, feature_names=cluster_names, class_names=class_names, out_file=dot_path,
                             max_depth=10)

        graph = pydot.graph_from_dot_file(dot_path)
        graph.write_png('Rules/Images/' + label_name + filename + ".png")
        self.get_code(self.clf, cluster_names, class_names, label_name + filename)

    dt.write1dArray(scores_array, 'Rules/Scores/' + filename + '.scores')
def __init__(self, directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, fn, percent,
             percentage_increment, by_vector):
    """Compute rankings of vectors on directions and write them to disk.

    Loads directions, vectors and both name lists through ``dt``, asks
    ``self.getRankings`` for the rankings, derives discrete labels with
    ``self.createDiscreteLabels`` and writes both arrays under ``Rankings/``.

    Args:
        directions_fn: File read with ``dt.importVectors`` for the directions.
        vectors_fn: File read with ``dt.importVectors`` for the vectors.
        cluster_names_fn: File read with ``dt.importString``.
        vector_names_fn: File read with ``dt.importString``.
        fn: Base name used for the output files.
        percent: Unused by the active code (see NOTE below).
        percentage_increment: Passed to ``self.createDiscreteLabels`` and
            embedded in the ``.discrete`` output file name.
        by_vector: When truthy, both arrays are transposed before writing.
    """
    directions = dt.importVectors(directions_fn)
    vectors = dt.importVectors(vectors_fn)
    cluster_names = dt.importString(cluster_names_fn)
    vector_names = dt.importString(vector_names_fn)

    rankings = np.array(self.getRankings(directions, vectors, cluster_names, vector_names))
    discrete_labels = np.asarray(self.createDiscreteLabels(rankings, percentage_increment))

    # NOTE: the thresholded-label output (self.createLabels(rankings, percent),
    # written to "Rankings/" + fn + "P" + str(percent) + ".labels") is disabled,
    # which is why `percent` is currently unused.
    if by_vector:
        discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()

    dt.write2dArray(rankings, "Rankings/" + fn + ".space")
    dt.write2dArray(discrete_labels, "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")

    # NOTE(review): the original text opens a triple-quoted "Disabled names for
    # quick view now" block here that is cut off in this view. The dangling
    # literal is replaced by this comment so that it no longer swallows the
    # definitions that follow. `array` / `short_array` are presumably consumed by
    # code beyond the visible text - confirm before removing them.
    array = []
    short_array = []
def getKNeighbors(vector_path="filmdata/films200.mds/films200.mds", class_path="filmdata/classesGenres/class-All",
                  n_neighbors=1, algorithm="kd_tree", leaf_size=30, training_data=10000, name="normal200"):
    """Score a k-nearest-neighbours classifier on a train/test split.

    Vectors and labels are loaded through ``dt``, split by ``dt.splitData``
    using ``training_data``, and a ``KNeighborsClassifier`` is fitted on the
    training part. Macro F1 and accuracy on the test part are written to
    ``KNNScores/<name>.score`` and printed. Nothing is returned.

    Args:
        vector_path: File read with ``dt.importVectors``.
        class_path: File read with ``dt.importLabels``.
        n_neighbors: Number of neighbours used by the classifier.
        algorithm: Neighbour search algorithm name.
        leaf_size: Leaf size passed to the classifier.
        training_data: Split argument forwarded to ``dt.splitData``.
        name: Base name of the score file.
    """
    features = np.asarray(dt.importVectors(vector_path))
    targets = np.asarray(dt.importLabels(class_path))
    x_train, y_train, x_test, y_test = dt.splitData(training_data, features, targets)

    knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size)
    # The training labels are flattened to one dimension before fitting.
    knn.fit(x_train, y_train.ravel())
    predicted = knn.predict(x_test)

    f1 = f1_score(y_test, predicted, average='macro')
    accuracy = accuracy_score(y_test, predicted)

    dt.write1dArray([f1, accuracy], "KNNScores/" + name + ".score")
    # Single pre-joined string: prints the same space-separated line as the
    # comma form of the print statement.
    print(" ".join(["F1 " + str(f1), "Accuracy", str(accuracy)]))
def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn, label_names_fn, cluster_names_fn,
             filename, training_data, cluster_to_classify, max_depth):
    """Fit one decision tree per label column and export scores and rules.

    For every label column a ``DecisionTreeClassifier`` is trained on the
    first ``training_data`` vectors and evaluated (binary F1 and accuracy) on
    the rest. Each tree is exported as a ``.dot`` file and a PNG under
    ``Rules/`` and handed to ``self.get_code``; all scores are written to
    ``Rules/Scores/`` at the end. ``self.clf`` holds the last tree fitted.

    Args:
        cluster_vectors_fn: File read with ``dt.importVectors`` (features).
        cluster_labels_fn: File read with ``dt.importLabels`` (one row per
            vector, one column per label).
        movie_names_fn: Unused; kept for interface compatibility.
        label_names_fn: File read with ``dt.importString`` (label names).
        cluster_names_fn: File read with ``dt.importString``; used as the
            feature names of the exported tree.
        filename: Suffix used in all output file names.
        training_data: Number of leading rows used for training.
        cluster_to_classify: Unused; kept for interface compatibility.
        max_depth: Maximum depth of each decision tree.
    """
    vectors = dt.importVectors(cluster_vectors_fn)
    labels = dt.importLabels(cluster_labels_fn)
    cluster_names = dt.importString(cluster_names_fn)
    label_names = dt.importString(label_names_fn)

    # The feature split is identical for every label, so build it once.
    x_train = np.asarray(vectors[:training_data])
    x_test = np.asarray(vectors[training_data:])

    scores_array = []
    for l in range(len(labels[0])):
        label_name = label_names[l]

        # Take the column as-is. The original copied it into a list padded to a
        # hard-coded 15000 entries, which raised IndexError for larger datasets
        # and made y_test longer than x_test for smaller ones.
        column = [row[l] for row in labels]
        y_train = np.asarray(column[:training_data])
        y_test = np.asarray(column[training_data:])

        self.clf = tree.DecisionTreeClassifier(max_depth=max_depth)
        self.clf = self.clf.fit(x_train, y_train)
        y_pred = self.clf.predict(x_test)

        f1 = f1_score(y_test, y_pred, average='binary')
        accuracy = accuracy_score(y_test, y_pred)
        scores = [[label_name, "f1", f1, "accuracy", accuracy]]
        print(scores[0])
        scores_array.append(scores)

        # NOTE(review): export_graphviz documents class_names as being in
        # ascending class order; if the labels are 0/1 with 1 meaning "has the
        # label", this order looks inverted. Left unchanged because get_code
        # receives the same list - confirm before swapping.
        class_names = [label_name, "NOT " + label_name]
        dot_path = 'Rules/' + label_name + filename + '.dot'
        tree.export_graphviz(self.clf, feature_names=cluster_names, class_names=class_names, out_file=dot_path,
                             max_depth=10)

        graph = pydot.graph_from_dot_file(dot_path)
        graph.write_png('Rules/Images/' + label_name + filename + ".png")
        self.get_code(self.clf, cluster_names, class_names, label_name + filename)

    dt.write1dArray(scores_array, 'Rules/Scores/' + filename + '.scores')
def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
    """Partition directions into a high-scoring and a low-scoring group.

    A direction whose score is at least ``high_threshold`` goes to the high
    group; otherwise, if its score is at least ``low_threshold``, it goes to
    the low group; anything below that is dropped. Each group is ordered by
    reversing the result of ``dt.sortByArray`` (presumably highest score
    first - confirm against ``dt.sortByArray``).

    Args:
        directions_fn: File read with ``dt.importVectors``.
        scores_fn: File read with ``dt.importString``; one numeric score per
            entry.
        names_fn: File read with ``dt.importString``; the first six
            characters of every name are stripped in the result.
        low_threshold: Minimum score for the low group.
        high_threshold: Minimum score for the high group.

    Returns:
        Tuple ``(high_direction_names, low_direction_names, high_directions,
        low_directions)``.
    """
    directions = dt.importVectors(directions_fn)
    raw_scores = dt.importString(scores_fn)
    names = dt.importString(names_fn)

    scores = [float(text.strip()) for text in raw_scores]

    high_positions, high_values = [], []
    low_positions, low_values = [], []
    for position, value in enumerate(scores):
        if value >= high_threshold:
            high_positions.append(position)
            high_values.append(value)
        elif value >= low_threshold:
            low_positions.append(position)
            low_values.append(value)

    ranked_high = dt.sortByArray(high_positions, high_values)
    ranked_low = dt.sortByArray(low_positions, low_values)
    ranked_high.reverse()
    ranked_low.reverse()

    # Names carry a six-character prefix that is dropped here.
    high_directions = [directions[i] for i in ranked_high]
    high_direction_names = [names[i][6:] for i in ranked_high]
    low_directions = [directions[i] for i in ranked_low]
    low_direction_names = [names[i][6:] for i in ranked_low]

    return high_direction_names, low_direction_names, high_directions, low_directions
def __init__(self, name_distinction="", class_names=None, vector_path=None, class_path=None, class_by_class=True,
             input_size=200, training_data=10000, amount_of_scores=400, low_kappa=0.1, high_kappa=0.5,
             rankSVM=False, amount_to_cut_at=100, largest_cut=21470000):
    """Run the SVMs over all sampled classes and write scores and directions.

    Vectors and labels are loaded through ``dt``; the labels are sampled with
    ``self.getSampledData``, split at ``training_data`` and handed to
    ``self.runAllSVMs``. The resulting kappa scores, the file names and the
    directions are written under ``SVMResults/`` and ``directions/``.

    Args:
        name_distinction: Base name of the three output files.
        vector_path: File read with ``dt.importVectors``.
        class_path: File read with ``dt.importLabels``; its last 10
            characters are cut off to form the argument of ``dt.getFns``.
        training_data: Number of leading rows used for training.
        amount_to_cut_at: Forwarded to ``self.getSampledData``.
        largest_cut: Forwarded to ``self.getSampledData``.
        class_names, class_by_class, input_size, amount_of_scores, low_kappa,
        high_kappa, rankSVM: Not used by this method.
    """
    print("getting movie data")
    movie_vectors = dt.importVectors(vector_path)
    movie_labels = dt.importLabels(class_path)

    print("getting file names")
    file_names = dt.getFns(class_path[:-10])
    print(" ".join([str(len(movie_labels)), str(len(movie_labels[0]))]))

    print("getting training and test data")
    x_train = np.asarray(movie_vectors[:training_data])
    x_test = np.asarray(movie_vectors[training_data:])

    # Transpose before sampling, then transpose back so the rows can be split
    # at `training_data`.
    transposed_labels = list(zip(*movie_labels))
    file_names, transposed_labels = self.getSampledData(file_names, transposed_labels, amount_to_cut_at,
                                                        largest_cut)
    sampled_labels = list(zip(*transposed_labels))

    # Each split is transposed once more before being turned into an array.
    y_train = np.asarray(list(zip(*sampled_labels[:training_data])))
    y_test = np.asarray(list(zip(*sampled_labels[training_data:])))
    print(" ".join([str(len(y_train)), str(len(y_test)), str(training_data)]))

    print("getting kappa scores")
    kappa_scores, directions = self.runAllSVMs(y_test, y_train, x_train, x_test, file_names)

    dt.write1dArray(kappa_scores, "SVMResults/" + name_distinction + ".scores")
    dt.write1dArray(file_names, "SVMResults/" + name_distinction + ".names")
    dt.write2dArray(directions, "directions/" + name_distinction + ".directions")
def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
    """Partition directions into a high-scoring and a low-scoring group.

    A direction whose score is at least ``high_threshold`` goes to the high
    group; otherwise, if its score is at least ``low_threshold``, it goes to
    the low group; anything below that is dropped. Each group is ordered by
    reversing the result of ``dt.sortByArray`` (presumably highest score
    first - confirm against ``dt.sortByArray``).

    Args:
        directions_fn: File read with ``dt.importVectors``.
        scores_fn: File read with ``dt.importString``; one numeric score per
            entry.
        names_fn: File read with ``dt.importString``; the first six
            characters of every name are stripped in the result.
        low_threshold: Minimum score for the low group.
        high_threshold: Minimum score for the high group.

    Returns:
        Tuple ``(high_direction_names, low_direction_names, high_directions,
        low_directions)``.
    """
    directions = dt.importVectors(directions_fn)
    raw_scores = dt.importString(scores_fn)
    names = dt.importString(names_fn)

    scores = [float(text.strip()) for text in raw_scores]

    high_positions, high_values = [], []
    low_positions, low_values = [], []
    for position, value in enumerate(scores):
        if value >= high_threshold:
            high_positions.append(position)
            high_values.append(value)
        elif value >= low_threshold:
            low_positions.append(position)
            low_values.append(value)

    ranked_high = dt.sortByArray(high_positions, high_values)
    ranked_low = dt.sortByArray(low_positions, low_values)
    ranked_high.reverse()
    ranked_low.reverse()

    # Names carry a six-character prefix that is dropped here.
    high_directions = [directions[i] for i in ranked_high]
    high_direction_names = [names[i][6:] for i in ranked_high]
    low_directions = [directions[i] for i in ranked_low]
    low_direction_names = [names[i][6:] for i in ranked_low]

    return high_direction_names, low_direction_names, high_directions, low_directions
def __init__(self, epochs=1, learn_rate=0.01, loss="mse", batch_size=1, decay=1e-06, hidden_activation="tanh",
             layer_init="glorot_uniform", output_activation="tanh", hidden_layer_size=100,
             file_name="unspecified_filename", vector_path=None, reg=0, optimizer_name="rmsprop", class_names=None,
             noise=0, output_weights=None):
    """Train a one-hidden-layer auto-encoder and save its hidden-layer space.

    The vectors at ``vector_path`` are used as both input and target. After
    fitting, a second model containing only the (trained) hidden layer is
    used to project the vectors; the result is kept in ``self.end_space`` and
    saved under ``newdata/spaces/``.

    Args:
        epochs: Number of training epochs.
        learn_rate: Learning rate handed to the optimizer.
        loss: Loss name passed to ``compile``.
        batch_size: Mini-batch size passed to ``fit``.
        decay: Learning-rate decay; only used by the "sgd" optimizer.
        hidden_activation: Activation of the hidden layer.
        layer_init: Weight initialisation name for the Dense layers.
        output_activation: Activation of the output layer.
        hidden_layer_size: Number of hidden units.
        file_name: Base name for the cached input vectors and the saved space.
        vector_path: Path of the input vectors (numpy file, or a format
            readable by ``dt.importVectors``).
        reg: L2 regularisation strength of the hidden layer.
        optimizer_name: Either "sgd" or "rmsprop".
        class_names: Unused.
        noise: Gaussian noise level; a noise layer is added to the trained
            model only when this is greater than 0.
        output_weights: Optional initial weights for the output layer.

    Raises:
        ValueError: If ``optimizer_name`` is not "sgd" or "rmsprop" (the
            original failed later with a NameError on ``optimizer``).
    """
    if optimizer_name == "sgd":
        optimizer = SGD(lr=learn_rate, decay=decay)
    elif optimizer_name == "rmsprop":
        optimizer = RMSprop(lr=learn_rate)
    else:
        raise ValueError("Unsupported optimizer_name: %r (expected 'sgd' or 'rmsprop')" % (optimizer_name,))

    # Import the numpy vectors
    try:
        movie_vectors = np.asarray(np.load(vector_path))
    except (IOError, OSError):
        # Not loadable by numpy: assume the standard text format for vectors and
        # cache it in numpy format. IOError is listed explicitly because it is
        # not a subclass of OSError on Python 2.
        movie_vectors = np.asarray(dt.importVectors(vector_path))
        np.save(file_name, movie_vectors)

    # Input and output have the same size, as this is an auto-encoder
    input_size = len(movie_vectors[0])
    output_size = input_size

    def build_hidden_layer(weights=None):
        # A fresh layer (and regularizer) per model, so nothing is shared
        # between the trained model and the truncated one.
        return Dense(output_dim=hidden_layer_size, input_dim=input_size, init=layer_init,
                     activation=hidden_activation, W_regularizer=l2(reg), weights=weights)

    # Initialize the model
    self.model = Sequential()
    if noise > 0:
        # Noisy auto-encoder: corrupt the input before the hidden layer
        self.model.add(GaussianNoise(noise, input_shape=(input_size,)))
    self.model.add(build_hidden_layer())

    # Apply custom hidden-to-output weights when given, otherwise add a plain output layer.
    # `is None` avoids an element-wise comparison if an array is passed.
    if output_weights is None:
        self.model.add(Dense(output_dim=output_size, init=layer_init, activation=output_activation))
    else:
        # NOTE(review): output_dim is taken from len(output_weights[0]); confirm
        # this matches the layout of the weights callers pass in.
        self.model.add(Dense(output_dim=len(output_weights[0]), init=layer_init, activation=output_activation,
                             weights=output_weights))

    # Compile the model and fit it to the data
    self.model.compile(loss=loss, optimizer=optimizer)
    self.model.fit(movie_vectors, movie_vectors, nb_epoch=epochs, batch_size=batch_size, verbose=1)

    # Build a truncated model without the output layer. The hidden layer is the
    # second-to-last layer of the trained model; its weights are copied across
    # explicitly - the original created a freshly initialised layer here, so the
    # saved space did not reflect the training at all.
    trained_hidden_weights = self.model.layers[-2].get_weights()
    truncated_model = Sequential()
    total_file_name = "newdata/spaces/" + file_name + ".mds"
    truncated_model.add(GaussianNoise(noise, input_shape=(input_size,)))
    truncated_model.add(build_hidden_layer(weights=trained_hidden_weights))
    truncated_model.compile(loss=loss, optimizer=optimizer)
    self.end_space = truncated_model.predict(movie_vectors)

    # np.save takes (file, arr); the original call had the two swapped.
    np.save(total_file_name, self.end_space)