예제 #1
0
    def __init__(self, directions_fn, vectors_fn, cluster_names_fn,
                 vector_names_fn, fn, percent, percentage_increment,
                 by_vector):
        """Compute rankings and discrete labels and write them under Rankings/.

        The four input files are loaded through ``dt``; rankings come from
        ``self.getRankings`` and discrete labels from
        ``self.createDiscreteLabels``.

        Args:
            directions_fn: File handed to ``dt.importVectors`` (directions).
            vectors_fn: File handed to ``dt.importVectors`` (vectors).
            cluster_names_fn: File handed to ``dt.importString``; the result
                is forwarded to ``getRankings``.
            vector_names_fn: File handed to ``dt.importString``; the result
                is forwarded to ``getRankings``.
            fn: Base name used to build both output file names.
            percent: Only referenced by the disabled label code below.
            percentage_increment: Forwarded to ``createDiscreteLabels`` and
                embedded in the ``.discrete`` file name.
            by_vector: When truthy, both arrays are transposed before they
                are written.
        """

        directions = dt.importVectors(directions_fn)
        vectors = dt.importVectors(vectors_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(vector_names_fn)

        rankings = self.getRankings(directions, vectors, cluster_names,
                                    vector_names)
        rankings = np.array(rankings)
        # Percent-based labels are currently disabled.
        #labels = self.createLabels(rankings, percent)
        #labels = np.asarray(labels)
        discrete_labels = self.createDiscreteLabels(rankings,
                                                    percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
        if by_vector:
            # Flip the axes of both outputs so they stay aligned.
            #labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
            rankings = rankings.transpose()
        #dt.write2dArray(labels, "Rankings/" + fn + "P" + str(percent) +".labels")
        # Output files: Rankings/<fn>.space and
        # Rankings/<fn>P<percentage_increment>.discrete
        dt.write2dArray(rankings, "Rankings/" + fn + ".space")
        dt.write2dArray(
            discrete_labels,
            "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")
        array = []
        short_array = []
        """ Disabled names for quick view now
예제 #2
0
    def __init__(self,
                 epochs=1, learn_rate=0.01, loss="mse", batch_size=1, decay=1e-06,
                 hidden_activation="tanh", layer_init="glorot_uniform", output_activation="tanh",  hidden_layer_size=100,
                 file_name="unspecified_filename", vector_path=None, reg=0,
                 optimizer_name="rmsprop", class_names=None, noise=0, output_weights=None):
        """Train a one-hidden-layer auto-encoder and save its hidden space.

        The vectors at ``vector_path`` are loaded, a model that maps them onto
        themselves is fitted, and the hidden-layer activations for every
        vector are stored in ``self.end_space`` and saved with ``np.save``.

        Args:
            epochs: Number of training epochs (passed to ``fit``).
            learn_rate: Learning rate given to the optimizer.
            loss: Loss name passed to ``compile``.
            batch_size: Batch size passed to ``fit``.
            decay: Learning-rate decay; only used by the "sgd" optimizer.
            hidden_activation: Activation of the hidden layer.
            layer_init: Weight initialisation used for both Dense layers.
            output_activation: Activation of the output layer.
            hidden_layer_size: Number of units in the hidden layer.
            file_name: Base name for the arrays written by this constructor.
            vector_path: File holding the input vectors.
            reg: L2 factor applied to the hidden layer's weights.
            optimizer_name: Either "sgd" or "rmsprop".
            class_names: Not used by this constructor.
            noise: When greater than 0, a GaussianNoise layer with this value
                is placed in front of the hidden layer.
            output_weights: Optional initial weights for the output layer;
                when given, the output size is ``len(output_weights[0])``.

        Raises:
            ValueError: If ``optimizer_name`` is not a supported optimizer.
        """

        # Initialize the model
        self.model = Sequential()

        # Import the numpy vectors
        try:
            movie_vectors = np.asarray(np.load(vector_path))
        except (IOError, OSError, ValueError):
            # np.load could not read the file as a numpy archive (the error
            # type depends on the Python/numpy version). Assume the standard
            # text format for vectors and cache a numpy copy.
            movie_vectors = dt.importVectors(vector_path)
            movie_vectors = np.asarray(movie_vectors)
            np.save(file_name, movie_vectors)

        # Set the input and the output to be the same size, as this is an auto-encoder
        input_size = len(movie_vectors[0])
        output_size = len(movie_vectors[0])

        # If using a noisy autoencoder, add a GaussianNoise layer to the start of the encoder
        if noise > 0:
            self.model.add(GaussianNoise(noise, input_shape=(input_size,)))

        # Keep a handle on the hidden layer so its trained weights can be
        # copied into the truncated model after fitting.
        hidden_layer = Dense(output_dim=hidden_layer_size, input_dim=input_size, init=layer_init,
                             activation=hidden_activation, W_regularizer=l2(reg))
        self.model.add(hidden_layer)

        # If using custom weights on the hidden layer to the output layer, apply those custom weights. Otherwise just add output layer.
        if output_weights is None:
            self.model.add(Dense(output_dim=output_size, init=layer_init, activation=output_activation))
        else:
            self.model.add(Dense(output_dim=len(output_weights[0]), init=layer_init, activation=output_activation, weights=output_weights))

        # Compile the model and fit it to the data
        if optimizer_name == "sgd":
            optimizer = SGD(lr=learn_rate, decay=decay)
        elif optimizer_name == "rmsprop":
            optimizer = RMSprop(lr=learn_rate)
        else:
            raise ValueError("Unsupported optimizer_name: %r" % (optimizer_name,))
        self.model.compile(loss=loss, optimizer=optimizer)
        self.model.fit(movie_vectors, movie_vectors, nb_epoch=epochs, batch_size=batch_size, verbose=1)

        # Create a truncated model that has no output layer and reuses the
        # trained hidden-layer weights, then use it to obtain the hidden
        # layer representation.
        truncated_model = Sequential()
        total_file_name = "newdata/spaces/" + file_name + ".mds"
        truncated_model.add(GaussianNoise(noise, input_shape=(input_size,)))
        truncated_model.add(Dense(output_dim=hidden_layer_size, input_dim=input_size, init=layer_init,
                                  activation=hidden_activation, W_regularizer=l2(reg),
                                  weights=hidden_layer.get_weights()))
        truncated_model.compile(loss=loss, optimizer=optimizer)
        self.end_space = truncated_model.predict(movie_vectors)

        # np.save takes the target file first and the array second; it
        # appends ".npy" to names that do not already end with it.
        np.save(total_file_name, self.end_space)
예제 #3
0
    def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn,
                 label_names_fn, cluster_names_fn, filename, training_data,
                 cluster_to_classify, max_depth):
        """Fit one decision tree per label column and export scores and rules.

        For every column of the label matrix a ``DecisionTreeClassifier`` is
        trained on the first ``training_data`` vectors and evaluated on the
        rest. Each tree is exported as a ``.dot`` file, rendered to PNG and
        passed to ``self.get_code``. All scores are written to
        ``Rules/Scores/<filename>.scores``. After construction ``self.clf``
        holds the tree fitted for the last label.

        Args:
            cluster_vectors_fn: File with the feature vectors.
            cluster_labels_fn: File with the label matrix (one row per
                vector, one column per label).
            movie_names_fn: File with the vector names; it is loaded but the
                names are not used here.
            label_names_fn: File with one name per label column.
            cluster_names_fn: File with the feature names used in the export.
            filename: Suffix used for every output file name.
            training_data: Number of leading rows used for training.
            cluster_to_classify: Not used by this constructor.
            max_depth: Maximum depth of each fitted tree.
        """

        vectors = dt.importVectors(cluster_vectors_fn)
        labels = dt.importLabels(cluster_labels_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(movie_names_fn)
        label_names = dt.importString(label_names_fn)

        # The feature split is the same for every label, so build it once.
        x_train = np.asarray(vectors[:training_data])
        x_test = np.asarray(vectors[training_data:])

        scores_array = []
        for label_index in range(len(labels[0])):
            label_name = label_names[label_index]

            # Take this label's column for exactly the rows that exist; a
            # fixed-size buffer would overflow or pad with spurious zeros.
            column = [row[label_index] for row in labels]
            y_train = np.asarray(column[:training_data])
            y_test = np.asarray(column[training_data:])

            self.clf = tree.DecisionTreeClassifier(max_depth=max_depth)
            self.clf = self.clf.fit(x_train, y_train)

            y_pred = self.clf.predict(x_test)
            f1 = f1_score(y_test, y_pred, average='binary')
            accuracy = accuracy_score(y_test, y_pred)
            scores = [[label_name, "f1", f1, "accuracy", accuracy]]
            print(scores[0])
            scores_array.append(scores)

            # NOTE(review): export_graphviz reads class_names in ascending
            # order of the class values. If the labels are 0/1 with 1 meaning
            # "label applies", this order looks inverted - confirm before
            # changing it, since get_code receives the same list.
            class_names = [label_name, "NOT " + label_name]
            dot_path = 'Rules/' + label_name + filename + '.dot'
            tree.export_graphviz(self.clf,
                                 feature_names=cluster_names,
                                 class_names=class_names,
                                 out_file=dot_path,
                                 max_depth=10)
            graph = pydot.graph_from_dot_file(dot_path)
            graph.write_png('Rules/Images/' + label_name + filename +
                            ".png")
            self.get_code(self.clf, cluster_names, class_names,
                          label_name + filename)
        dt.write1dArray(scores_array, 'Rules/Scores/' + filename + '.scores')
예제 #4
0
    def __init__(self, directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, fn, percent, percentage_increment, by_vector):
        """Compute rankings and discrete labels and write them under Rankings/.

        The four input files are loaded through ``dt``; rankings come from
        ``self.getRankings`` and discrete labels from
        ``self.createDiscreteLabels``.

        Args:
            directions_fn: File handed to ``dt.importVectors`` (directions).
            vectors_fn: File handed to ``dt.importVectors`` (vectors).
            cluster_names_fn: File handed to ``dt.importString``; the result
                is forwarded to ``getRankings``.
            vector_names_fn: File handed to ``dt.importString``; the result
                is forwarded to ``getRankings``.
            fn: Base name used to build both output file names.
            percent: Only referenced by the disabled label code below.
            percentage_increment: Forwarded to ``createDiscreteLabels`` and
                embedded in the ``.discrete`` file name.
            by_vector: When truthy, both arrays are transposed before they
                are written.
        """

        directions = dt.importVectors(directions_fn)
        vectors = dt.importVectors(vectors_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(vector_names_fn)

        rankings  = self.getRankings(directions, vectors, cluster_names, vector_names)
        rankings = np.array(rankings)
        # Percent-based labels are currently disabled.
        #labels = self.createLabels(rankings, percent)
        #labels = np.asarray(labels)
        discrete_labels = self.createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
        if by_vector:
            # Flip the axes of both outputs so they stay aligned.
            #labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
            rankings = rankings.transpose()
        #dt.write2dArray(labels, "Rankings/" + fn + "P" + str(percent) +".labels")
        # Output files: Rankings/<fn>.space and
        # Rankings/<fn>P<percentage_increment>.discrete
        dt.write2dArray(rankings, "Rankings/" + fn + ".space")
        dt.write2dArray(discrete_labels, "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")
        array = []
        short_array = []
        """ Disabled names for quick view now
예제 #5
0
def getKNeighbors(vector_path="filmdata/films200.mds/films200.mds", class_path="filmdata/classesGenres/class-All",
                  n_neighbors=1, algorithm="kd_tree", leaf_size=30,
                  training_data=10000, name="normal200"):
    """Fit a k-nearest-neighbours classifier and record its F1 and accuracy.

    The vectors and labels are loaded through ``dt``, split by
    ``dt.splitData``, and a ``KNeighborsClassifier`` is fitted on the
    training part. The macro F1 and the accuracy on the test part are written
    to ``KNNScores/<name>.score`` and printed.

    Args:
        vector_path: File with the input vectors.
        class_path: File with the labels.
        n_neighbors: Number of neighbours used by the classifier.
        algorithm: Neighbour-search algorithm passed to the classifier.
        leaf_size: Leaf size passed to the classifier.
        training_data: First argument of ``dt.splitData``; presumably the
            number of rows used for training - confirm against that helper.
        name: Base name of the score file.
    """
    movie_vectors = np.asarray(dt.importVectors(vector_path))
    movie_labels = np.asarray(dt.importLabels(class_path))

    x_train, y_train, x_test, y_test = dt.splitData(training_data, movie_vectors, movie_labels)

    classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size)
    classifier.fit(x_train, y_train.ravel())
    y_pred = classifier.predict(x_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    dt.write1dArray([f1, accuracy], "KNNScores/" + name + ".score")
    # One pre-built string prints the same text under Python 2 and Python 3.
    print("F1 " + str(f1) + " Accuracy " + str(accuracy))
예제 #6
0
    def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn, label_names_fn, cluster_names_fn, filename, training_data, cluster_to_classify, max_depth):
        """Fit one decision tree per label column and export scores and rules.

        For every column of the label matrix a ``DecisionTreeClassifier`` is
        trained on the first ``training_data`` vectors and evaluated on the
        rest. Each tree is exported as a ``.dot`` file, rendered to PNG and
        passed to ``self.get_code``. All scores are written to
        ``Rules/Scores/<filename>.scores``. After construction ``self.clf``
        holds the tree fitted for the last label.

        Args:
            cluster_vectors_fn: File with the feature vectors.
            cluster_labels_fn: File with the label matrix (one row per
                vector, one column per label).
            movie_names_fn: File with the vector names; it is loaded but the
                names are not used here.
            label_names_fn: File with one name per label column.
            cluster_names_fn: File with the feature names used in the export.
            filename: Suffix used for every output file name.
            training_data: Number of leading rows used for training.
            cluster_to_classify: Not used by this constructor.
            max_depth: Maximum depth of each fitted tree.
        """

        vectors = dt.importVectors(cluster_vectors_fn)
        labels = dt.importLabels(cluster_labels_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(movie_names_fn)
        label_names = dt.importString(label_names_fn)

        # The feature split is the same for every label, so build it once.
        x_train = np.asarray(vectors[:training_data])
        x_test = np.asarray(vectors[training_data:])

        scores_array = []
        for label_index in range(len(labels[0])):
            label_name = label_names[label_index]

            # Take this label's column for exactly the rows that exist; a
            # fixed-size buffer would overflow or pad with spurious zeros.
            column = [row[label_index] for row in labels]
            y_train = np.asarray(column[:training_data])
            y_test = np.asarray(column[training_data:])

            self.clf = tree.DecisionTreeClassifier(max_depth=max_depth)
            self.clf = self.clf.fit(x_train, y_train)

            y_pred = self.clf.predict(x_test)
            f1 = f1_score(y_test, y_pred, average='binary')
            accuracy = accuracy_score(y_test, y_pred)
            scores = [[label_name, "f1", f1, "accuracy", accuracy]]
            print(scores[0])
            scores_array.append(scores)

            # NOTE(review): export_graphviz reads class_names in ascending
            # order of the class values. If the labels are 0/1 with 1 meaning
            # "label applies", this order looks inverted - confirm before
            # changing it, since get_code receives the same list.
            class_names = [label_name, "NOT " + label_name]
            dot_path = 'Rules/' + label_name + filename + '.dot'
            tree.export_graphviz(self.clf, feature_names=cluster_names, class_names=class_names, out_file=dot_path, max_depth=10)
            graph = pydot.graph_from_dot_file(dot_path)
            graph.write_png('Rules/Images/' + label_name + filename + ".png")
            self.get_code(self.clf, cluster_names, class_names, label_name + filename)
        dt.write1dArray(scores_array, 'Rules/Scores/' + filename + '.scores')
예제 #7
0
    def splitDirections(self, directions_fn, scores_fn, names_fn,
                        low_threshold, high_threshold):
        """Split the directions into a high-scoring and a low-scoring group.

        A direction whose score is at least ``high_threshold`` goes to the
        high group; otherwise it goes to the low group when its score is at
        least ``low_threshold``; anything below that is dropped. Each group
        is ordered with ``dt.sortByArray`` on its scores and then reversed.

        Args:
            directions_fn: File with the direction vectors.
            scores_fn: File with one numeric score per direction.
            names_fn: File with one name per direction; the first six
                characters of every name are cut off in the result.
            low_threshold: Minimum score for the low group.
            high_threshold: Minimum score for the high group.

        Returns:
            A tuple ``(high_direction_names, low_direction_names,
            high_directions, low_directions)``.
        """
        directions = dt.importVectors(directions_fn)
        raw_scores = dt.importString(scores_fn)
        names = dt.importString(names_fn)

        scores = [float(text.strip()) for text in raw_scores]

        high_positions, high_values = [], []
        low_positions, low_values = [], []
        for position, value in enumerate(scores):
            if value >= high_threshold:
                positions, values = high_positions, high_values
            elif value >= low_threshold:
                positions, values = low_positions, low_values
            else:
                # Below both thresholds: the direction is discarded.
                continue
            positions.append(position)
            values.append(value)

        ranked_high = dt.sortByArray(high_positions, high_values)
        ranked_low = dt.sortByArray(low_positions, low_values)
        ranked_high.reverse()
        ranked_low.reverse()

        high_directions = [directions[i] for i in ranked_high]
        high_direction_names = [names[i][6:] for i in ranked_high]
        low_directions = [directions[i] for i in ranked_low]
        low_direction_names = [names[i][6:] for i in ranked_low]

        return high_direction_names, low_direction_names, high_directions, low_directions
예제 #8
0
    def __init__(self, name_distinction="", class_names=None, vector_path=None, class_path=None, class_by_class=True, input_size=200,
                 training_data=10000, amount_of_scores=400,  low_kappa=0.1, high_kappa=0.5, rankSVM=False, amount_to_cut_at=100, largest_cut=21470000):
        """Run an SVM per sampled class and write kappa scores and directions.

        Vectors and labels are loaded through ``dt``, the classes are
        filtered by ``self.getSampledData``, and ``self.runAllSVMs`` is run
        on a train/test split taken at row ``training_data``. Results go to
        ``SVMResults/<name_distinction>.scores``,
        ``SVMResults/<name_distinction>.names`` and
        ``directions/<name_distinction>.directions``.

        Args:
            name_distinction: Base name of the three output files.
            vector_path: File with the input vectors.
            class_path: File with the label matrix (one row per vector).
            training_data: Number of leading rows used for training.
            amount_to_cut_at: Forwarded to ``getSampledData``.
            largest_cut: Forwarded to ``getSampledData``.
            class_names, class_by_class, input_size, amount_of_scores,
            low_kappa, high_kappa, rankSVM: Not used by this constructor.
        """
        print("getting movie data")

        movie_vectors = dt.importVectors(vector_path)
        movie_labels = dt.importLabels(class_path)
        print("getting file names")

        # NOTE(review): drops the last 10 characters of class_path,
        # presumably a trailing "/class-All", to get the directory - confirm.
        file_names = dt.getFns(class_path[:-10])

        print("%s %s" % (len(movie_labels), len(movie_labels[0])))

        print("getting training and test data")

        x_train = np.asarray(movie_vectors[:training_data])
        x_test = np.asarray(movie_vectors[training_data:])

        # Transpose to one row per class for sampling, then back to one row
        # per vector. list() keeps the result sliceable on Python 3, where
        # zip returns an iterator.
        movie_labels = list(zip(*movie_labels))
        file_names, movie_labels = self.getSampledData(file_names, movie_labels, amount_to_cut_at, largest_cut)
        movie_labels = list(zip(*movie_labels))

        y_train = movie_labels[:training_data]
        y_test = movie_labels[training_data:]
        # Back to one row per class for the SVM runner.
        y_train = np.asarray(list(zip(*y_train)))
        y_test = np.asarray(list(zip(*y_test)))

        print("%s %s %s" % (len(y_train), len(y_test), training_data))

        print("getting kappa scores")

        kappa_scores, directions = self.runAllSVMs(y_test, y_train, x_train, x_test, file_names)

        dt.write1dArray(kappa_scores, "SVMResults/" + name_distinction + ".scores")
        dt.write1dArray(file_names, "SVMResults/" + name_distinction + ".names")

        dt.write2dArray(directions, "directions/" + name_distinction + ".directions")
예제 #9
0
    def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
        """Split the directions into a high-scoring and a low-scoring group.

        A direction whose score is at least ``high_threshold`` goes to the
        high group; otherwise it goes to the low group when its score is at
        least ``low_threshold``; anything below that is dropped. Each group
        is ordered with ``dt.sortByArray`` on its scores and then reversed.

        Args:
            directions_fn: File with the direction vectors.
            scores_fn: File with one numeric score per direction.
            names_fn: File with one name per direction; the first six
                characters of every name are cut off in the result.
            low_threshold: Minimum score for the low group.
            high_threshold: Minimum score for the high group.

        Returns:
            A tuple ``(high_direction_names, low_direction_names,
            high_directions, low_directions)``.
        """
        directions = dt.importVectors(directions_fn)
        raw_scores = dt.importString(scores_fn)
        names = dt.importString(names_fn)

        scores = [float(text.strip()) for text in raw_scores]

        high_positions, high_values = [], []
        low_positions, low_values = [], []
        for position, value in enumerate(scores):
            if value >= high_threshold:
                positions, values = high_positions, high_values
            elif value >= low_threshold:
                positions, values = low_positions, low_values
            else:
                # Below both thresholds: the direction is discarded.
                continue
            positions.append(position)
            values.append(value)

        ranked_high = dt.sortByArray(high_positions, high_values)
        ranked_low = dt.sortByArray(low_positions, low_values)
        ranked_high.reverse()
        ranked_low.reverse()

        high_directions = [directions[i] for i in ranked_high]
        high_direction_names = [names[i][6:] for i in ranked_high]
        low_directions = [directions[i] for i in ranked_low]
        low_direction_names = [names[i][6:] for i in ranked_low]

        return high_direction_names, low_direction_names, high_directions, low_directions
예제 #10
0
def getKNeighbors(vector_path="filmdata/films200.mds/films200.mds",
                  class_path="filmdata/classesGenres/class-All",
                  n_neighbors=1,
                  algorithm="kd_tree",
                  leaf_size=30,
                  training_data=10000,
                  name="normal200"):
    """Fit a k-nearest-neighbours classifier and record its F1 and accuracy.

    The vectors and labels are loaded through ``dt``, split by
    ``dt.splitData``, and a ``KNeighborsClassifier`` is fitted on the
    training part. The macro F1 and the accuracy on the test part are written
    to ``KNNScores/<name>.score`` and printed.

    Args:
        vector_path: File with the input vectors.
        class_path: File with the labels.
        n_neighbors: Number of neighbours used by the classifier.
        algorithm: Neighbour-search algorithm passed to the classifier.
        leaf_size: Leaf size passed to the classifier.
        training_data: First argument of ``dt.splitData``; presumably the
            number of rows used for training - confirm against that helper.
        name: Base name of the score file.
    """
    movie_vectors = np.asarray(dt.importVectors(vector_path))
    movie_labels = np.asarray(dt.importLabels(class_path))

    x_train, y_train, x_test, y_test = dt.splitData(training_data,
                                                    movie_vectors,
                                                    movie_labels)

    classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
                                                algorithm=algorithm,
                                                leaf_size=leaf_size)
    classifier.fit(x_train, y_train.ravel())
    y_pred = classifier.predict(x_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    dt.write1dArray([f1, accuracy], "KNNScores/" + name + ".score")
    # One pre-built string prints the same text under Python 2 and Python 3.
    print("F1 " + str(f1) + " Accuracy " + str(accuracy))
예제 #11
0
    def __init__(self,
                 epochs=1,
                 learn_rate=0.01,
                 loss="mse",
                 batch_size=1,
                 decay=1e-06,
                 hidden_activation="tanh",
                 layer_init="glorot_uniform",
                 output_activation="tanh",
                 hidden_layer_size=100,
                 file_name="unspecified_filename",
                 vector_path=None,
                 reg=0,
                 optimizer_name="rmsprop",
                 class_names=None,
                 noise=0,
                 output_weights=None):
        """Train a one-hidden-layer auto-encoder and save its hidden space.

        The vectors at ``vector_path`` are loaded, a model that maps them onto
        themselves is fitted, and the hidden-layer activations for every
        vector are stored in ``self.end_space`` and saved with ``np.save``.

        Args:
            epochs: Number of training epochs (passed to ``fit``).
            learn_rate: Learning rate given to the optimizer.
            loss: Loss name passed to ``compile``.
            batch_size: Batch size passed to ``fit``.
            decay: Learning-rate decay; only used by the "sgd" optimizer.
            hidden_activation: Activation of the hidden layer.
            layer_init: Weight initialisation used for both Dense layers.
            output_activation: Activation of the output layer.
            hidden_layer_size: Number of units in the hidden layer.
            file_name: Base name for the arrays written by this constructor.
            vector_path: File holding the input vectors.
            reg: L2 factor applied to the hidden layer's weights.
            optimizer_name: Either "sgd" or "rmsprop".
            class_names: Not used by this constructor.
            noise: When greater than 0, a GaussianNoise layer with this value
                is placed in front of the hidden layer.
            output_weights: Optional initial weights for the output layer;
                when given, the output size is ``len(output_weights[0])``.

        Raises:
            ValueError: If ``optimizer_name`` is not a supported optimizer.
        """

        # Initialize the model
        self.model = Sequential()

        # Import the numpy vectors
        try:
            movie_vectors = np.asarray(np.load(vector_path))
        except (IOError, OSError, ValueError):
            # np.load could not read the file as a numpy archive (the error
            # type depends on the Python/numpy version). Assume the standard
            # text format for vectors and cache a numpy copy.
            movie_vectors = dt.importVectors(vector_path)
            movie_vectors = np.asarray(movie_vectors)
            np.save(file_name, movie_vectors)

        # Set the input and the output to be the same size, as this is an auto-encoder
        input_size = len(movie_vectors[0])
        output_size = len(movie_vectors[0])

        # If using a noisy autoencoder, add a GaussianNoise layer to the start of the encoder
        if noise > 0:
            self.model.add(GaussianNoise(noise, input_shape=(input_size, )))

        # Keep a handle on the hidden layer so its trained weights can be
        # copied into the truncated model after fitting.
        hidden_layer = Dense(output_dim=hidden_layer_size,
                             input_dim=input_size,
                             init=layer_init,
                             activation=hidden_activation,
                             W_regularizer=l2(reg))
        self.model.add(hidden_layer)

        # If using custom weights on the hidden layer to the output layer, apply those custom weights. Otherwise just add output layer.
        if output_weights is None:
            self.model.add(
                Dense(output_dim=output_size,
                      init=layer_init,
                      activation=output_activation))
        else:
            self.model.add(
                Dense(output_dim=len(output_weights[0]),
                      init=layer_init,
                      activation=output_activation,
                      weights=output_weights))

        # Compile the model and fit it to the data
        if optimizer_name == "sgd":
            optimizer = SGD(lr=learn_rate, decay=decay)
        elif optimizer_name == "rmsprop":
            optimizer = RMSprop(lr=learn_rate)
        else:
            raise ValueError("Unsupported optimizer_name: %r" %
                             (optimizer_name, ))
        self.model.compile(loss=loss, optimizer=optimizer)
        self.model.fit(movie_vectors,
                       movie_vectors,
                       nb_epoch=epochs,
                       batch_size=batch_size,
                       verbose=1)

        # Create a truncated model that has no output layer and reuses the
        # trained hidden-layer weights, then use it to obtain the hidden
        # layer representation.
        truncated_model = Sequential()
        total_file_name = "newdata/spaces/" + file_name + ".mds"
        truncated_model.add(GaussianNoise(noise, input_shape=(input_size, )))
        truncated_model.add(
            Dense(output_dim=hidden_layer_size,
                  input_dim=input_size,
                  init=layer_init,
                  activation=hidden_activation,
                  W_regularizer=l2(reg),
                  weights=hidden_layer.get_weights()))
        truncated_model.compile(loss=loss, optimizer=optimizer)
        self.end_space = truncated_model.predict(movie_vectors)

        # np.save takes the target file first and the array second; it
        # appends ".npy" to names that do not already end with it.
        np.save(total_file_name, self.end_space)