def __init__(self, directions_fn, vectors_fn, cluster_names_fn,
                 vector_names_fn, fn, percent, percentage_increment,
                 by_vector):

        directions = dt.importVectors(directions_fn)
        vectors = dt.importVectors(vectors_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(vector_names_fn)

        # Rank every vector against every cluster direction, then bucket the
        # rankings into discrete labels at the given percentage increment.
        rankings = self.getRankings(directions, vectors, cluster_names,
                                    vector_names)
        rankings = np.array(rankings)
        #labels = self.createLabels(rankings, percent)
        #labels = np.asarray(labels)
        discrete_labels = self.createDiscreteLabels(rankings,
                                                    percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
        if by_vector:
            #labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
            rankings = rankings.transpose()
        #dt.write2dArray(labels, "Rankings/" + fn + "P" + str(percent) +".labels")
        dt.write2dArray(rankings, "Rankings/" + fn + ".space")
        dt.write2dArray(
            discrete_labels,
            "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")
        array = []
        short_array = []
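
# A minimal sketch of what getRankings above likely computes, assuming each
# direction is a hyperplane normal and a vector's ranking along a direction is
# its dot product with that normal. Illustration only; the helper name is
# hypothetical and the class's own method is the real implementation.
import numpy as np

def get_rankings_sketch(directions, vectors):
    # One row per direction, one column per vector.
    return np.dot(np.asarray(directions), np.asarray(vectors).T)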
        """ Disabled names for quick view now
def outputKeywords():
    movie_strings = dt.importString("filmdata/filmNames.txt")
    movie_data = getMovieDataFromIMDB(movie_strings)
    commonality = 0
    common_keywords = getMostCommonKeywords(0, "filmdata/IMDB_movie_data.txt")
    dt.write1dArray(common_keywords, "filmdata/common_keywords_15k_commanility_" + str(commonality))
    vectors = getKeywordVectors(common_keywords, movie_strings, "")
    dt.write2dArray(vectors, "filmdata/classesKeywords/class-extra-all-commonality-" + str(commonality))
"""
    def __init__(self, low_threshold, high_threshold, filename):

        # Split the directions into high- and low-scoring sets at the score
        # thresholds, then build term clusters from the two sets.
        hdn, ldn, hd, ld = self.splitDirections(
            "Directions/" + filename + ".directions",
            "SVMResults/" + filename + ".scores",
            "SVMResults/" + filename + ".names", low_threshold, high_threshold)

        least_similar_cluster_names, cluster_name_dict, least_similar_clusters = self.createTermClusters(
            hd, ld, hdn, ldn)

        dt.write1dArray(
            least_similar_cluster_names,
            "Clusters/" + filename + "LeastSimilarHIGH" + str(high_threshold) +
            "," + str(low_threshold) + ".names")
        dt.write2dArray(
            least_similar_clusters,
            "Clusters/" + filename + "LeastSimilarHIGH" + str(high_threshold) +
            "," + str(low_threshold) + ".clusters")
        dt.writeArrayDict(
            cluster_name_dict, "Clusters/" + filename + "MostSimilarCLUSTER" +
            str(high_threshold) + "," + str(low_threshold) + ".names")
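
# A minimal sketch of the splitDirections step used above, assuming it
# partitions directions by their SVM scores: anything at or above
# high_threshold is a "high" direction, anything between the thresholds is
# "low", and the rest are discarded. The helper name is hypothetical.
def split_directions_sketch(directions, scores, names, low_threshold,
                            high_threshold):
    hdn, ldn, hd, ld = [], [], [], []
    for direction, score, name in zip(directions, scores, names):
        if score >= high_threshold:
            hd.append(direction)
            hdn.append(name)
        elif score >= low_threshold:
            ld.append(direction)
            ldn.append(name)
    return hdn, ldn, hd, ld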
    def __init__(self, name_distinction="", class_names=None, vector_path=None,
                 class_path=None, class_by_class=True, input_size=200,
                 training_data=10000, amount_of_scores=400, low_kappa=0.1,
                 high_kappa=0.5, rankSVM=False, amount_to_cut_at=100,
                 largest_cut=21470000):
        print("getting movie data")

        movie_vectors = dt.importVectors(vector_path)
        movie_labels = dt.importLabels(class_path)
        print("getting file names")

        file_names = dt.getFns(class_path[:-10])

        print(len(movie_labels), len(movie_labels[0]))

        print("getting training and test data")

        x_train = np.asarray(movie_vectors[:training_data])
        x_test = np.asarray(movie_vectors[training_data:])

        # Transpose the label matrix for getSampledData, then restore the
        # original layout afterwards.
        movie_labels = list(zip(*movie_labels))
        file_names, movie_labels = self.getSampledData(
            file_names, movie_labels, amount_to_cut_at, largest_cut)
        movie_labels = list(zip(*movie_labels))

        # Split the labels into train/test, then re-orient them so each row
        # holds one class's labels across that split.
        y_train = movie_labels[:training_data]
        y_test = movie_labels[training_data:]
        y_train = np.asarray(list(zip(*y_train)))
        y_test = np.asarray(list(zip(*y_test)))

        print(len(y_train), len(y_test), training_data)

        print("getting kappa scores")

        kappa_scores, directions = self.runAllSVMs(
            y_test, y_train, x_train, x_test, file_names)

        dt.write1dArray(kappa_scores, "SVMResults/" + name_distinction + ".scores")
        dt.write1dArray(file_names, "SVMResults/" + name_distinction + ".names")

        dt.write2dArray(directions, "Directions/" + name_distinction + ".directions")
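
# A minimal sketch of what runAllSVMs appears to do, assuming one linear SVM
# per class: the learned hyperplane normal serves as that class's direction
# and Cohen's kappa on the held-out split is its score. The scikit-learn
# calls are illustrative, not necessarily what the original code uses.
from sklearn.metrics import cohen_kappa_score
from sklearn.svm import LinearSVC

def run_all_svms_sketch(y_test, y_train, x_train, x_test):
    kappa_scores, directions = [], []
    # y_train/y_test hold one row of labels per class (see the transposes above).
    for labels_train, labels_test in zip(y_train, y_test):
        clf = LinearSVC()
        clf.fit(x_train, labels_train)
        kappa_scores.append(
            cohen_kappa_score(labels_test, clf.predict(x_test)))
        directions.append(clf.coef_[0].tolist())
    return kappa_scores, directions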
        """ Disabled names for quick view now
def outputPhrases():
    IDs = dt.importString("filmdata/filmIDs.txt")
    unique_phrases = dt.importString("filmdata/uniquePhrases.txt")
    vectors_maintained, vectors = getVectors(IDs, unique_phrases)
    dt.write2dArray(vectors, "filmdata/classesPhrases/class-all")
    dt.write2dArray(vectors_maintained, "filmdata/classesPhrases/nonbinary/class-all")
                    "SVMResults/ALL_SCORES_Phrases_AUTOENCODER0.4tanhtanhmse2tanh2004SDA1.txt",
                    "SVMResults/ALL_NAMES_Phrases_AUTOENCODER0.4tanhtanhmse2tanh2004SDA5.txt",
                    "SVMResults/ALL_SCORES_Phrases_AUTOENCODER0.4tanhtanhmse2tanh2004SDA5.txt",
                    "0.4L1 to 0.4L5")
"""
import random

def maskingNoise(input, amount_of_corruption):
    # Zero out a fixed number of entries per row: amount_of_corruption is a
    # fraction of the row width, e.g. 0.25 corrupts a quarter of each row.
    amount_of_corruption = len(input[0]) * amount_of_corruption
    print(amount_of_corruption)
    for x in range(int(amount_of_corruption)):
        for i in input:
            r = random.randint(0, len(i) - 1)
            # Re-draw until we hit an entry that is not already zero; assumes
            # every row retains at least one non-zero entry.
            while i[r] == 0:
                r = random.randint(0, len(i) - 1)
            i[r] = 0
    return input
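
# Hypothetical usage of maskingNoise, written in the same disabled style as
# the snippet below; the toy space is made up for illustration.
"""
toy_space = [[0.5, 0.0, 0.3, 0.9], [0.1, 0.7, 0.0, 0.4]]
noisy = maskingNoise(toy_space, 0.25)  # zeroes ~1 non-zero entry per row
"""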
"""
input_size = 200
dt.write2dArray(makeSpaceNoisy(dt.getMovieVectors(input_size=input_size), 0.25), "newdata/spaces/noise"+str(input_size)+".mds")
"""
"""
Given a list of ordered IDs for the 15,0000 films.
And get all of the phrases mentioned in those IDs and return that list.

Then, given a list of phrases mentioned, create a vector for every movie and return
as an array of those vectors. Information is found within the film files.

"""

def getUniquePhrases(ordered_IDs, count_min, maintain_quantities):
    # Tally how often each phrase occurs across the films; count_min is later
    # used to filter out rare phrases.
    phrase_to_count = {}
    for ID in ordered_IDs: