def __init__(self, directions_fn, vectors_fn, cluster_names_fn, vector_names_fn,
             fn, percent, percentage_increment, by_vector):
    # "dt" is the project's data import/export helper module; "np" is numpy.
    directions = dt.importVectors(directions_fn)
    vectors = dt.importVectors(vectors_fn)
    cluster_names = dt.importString(cluster_names_fn)
    vector_names = dt.importString(vector_names_fn)

    # Score every vector against every cluster direction.
    rankings = self.getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.array(rankings)
    #labels = self.createLabels(rankings, percent)
    #labels = np.asarray(labels)
    discrete_labels = self.createDiscreteLabels(rankings, percentage_increment)
    discrete_labels = np.asarray(discrete_labels)

    # Output is cluster-by-vector by default; flip it if per-vector rows
    # are wanted instead.
    if by_vector:
        #labels = labels.transpose()
        discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()

    #dt.write2dArray(labels, "Rankings/" + fn + "P" + str(percent) + ".labels")
    dt.write2dArray(rankings, "Rankings/" + fn + ".space")
    dt.write2dArray(discrete_labels,
                    "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")
    array = []
    short_array = []
    """ Disabled names for quick view now
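# A minimal sketch of the two helpers the constructor above relies on; the
# real implementations live elsewhere in this class, so the details here
# (dot-product scoring, percentile bands) are assumptions, not the project's
# confirmed method.
def getRankings(self, directions, vectors, cluster_names, vector_names):
    # One row per cluster direction: rank every vector by its dot product
    # with that direction.
    return [[np.dot(d, v) for v in vectors] for d in directions]

def createDiscreteLabels(self, rankings, percentage_increment):
    # Replace each score with the start of the percentile band it falls in,
    # e.g. an increment of 10 yields labels 0, 10, ..., 90.
    labels = []
    for row in rankings:
        order = np.argsort(row)[::-1]  # indices from best- to worst-ranked
        row_labels = [0] * len(row)
        for rank, index in enumerate(order):
            band = int((100.0 * rank / len(row)) // percentage_increment)
            row_labels[index] = band * percentage_increment
        labels.append(row_labels)
    return labels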
def outputKeywords():
    movie_strings = dt.importString("filmdata/filmNames.txt")
    # Fetches (or loads previously saved) IMDB data for every film title.
    movie_data = getMovieDataFromIMDB(movie_strings)
    commonality = 0
    common_keywords = getMostCommonKeywords(commonality, "filmdata/IMDB_movie_data.txt")
    dt.write1dArray(common_keywords,
                    "filmdata/common_keywords_15k_commanility_" + str(commonality))
    vectors = getKeywordVectors(common_keywords, movie_strings, "")
    dt.write2dArray(vectors,
                    "filmdata/classesKeywords/class-extra-all-commonality-" + str(commonality))
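# Hypothetical sketch of getMostCommonKeywords as it is called above: keep
# every keyword attached to more than `commonality` films. The dump format
# assumed here (one film per line, whitespace-separated keywords) is a guess.
def getMostCommonKeywords(commonality, movie_data_fn):
    counts = {}
    with open(movie_data_fn) as f:
        for line in f:
            for keyword in line.split():
                counts[keyword] = counts.get(keyword, 0) + 1
    return [k for k, c in counts.items() if c > commonality]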
def __init__(self, low_threshold, high_threshold, filename):
    hdn, ldn, hd, ld = self.splitDirections(
        "Directions/" + filename + ".directions",
        "SVMResults/" + filename + ".scores",
        "SVMResults/" + filename + ".names",
        low_threshold, high_threshold)
    least_similar_cluster_names, cluster_name_dict, least_similar_clusters = \
        self.createTermClusters(hd, ld, hdn, ldn)
    suffix = str(high_threshold) + "," + str(low_threshold)
    dt.write1dArray(least_similar_cluster_names,
                    "Clusters/" + filename + "LeastSimilarHIGH" + suffix + ".names")
    dt.write2dArray(least_similar_clusters,
                    "Clusters/" + filename + "LeastSimilarHIGH" + suffix + ".clusters")
    dt.writeArrayDict(cluster_name_dict,
                      "Clusters/" + filename + "MostSimilarCLUSTER" + suffix + ".names")
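# A minimal sketch of splitDirections, under the assumption that it partitions
# the learned directions by their kappa score: hd/hdn hold directions scoring
# at least high_threshold, ld/ldn those between low_threshold and
# high_threshold. Anything below low_threshold is discarded.
def splitDirections(self, directions_fn, scores_fn, names_fn,
                    low_threshold, high_threshold):
    directions = dt.importVectors(directions_fn)
    scores = [float(s) for s in dt.importString(scores_fn)]
    names = dt.importString(names_fn)
    hdn, ldn, hd, ld = [], [], [], []
    for name, score, direction in zip(names, scores, directions):
        if score >= high_threshold:
            hdn.append(name)
            hd.append(direction)
        elif score >= low_threshold:
            ldn.append(name)
            ld.append(direction)
    return hdn, ldn, hd, ld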
def __init__(self, name_distinction="", class_names=None, vector_path=None, class_path=None,
             class_by_class=True, input_size=200, training_data=10000, amount_of_scores=400,
             low_kappa=0.1, high_kappa=0.5, rankSVM=False, amount_to_cut_at=100,
             largest_cut=21470000):
    print "getting movie data"
    movie_vectors = dt.importVectors(vector_path)
    movie_labels = dt.importLabels(class_path)
    print "getting file names"
    file_names = dt.getFns(class_path[:-10])
    print len(movie_labels), len(movie_labels[0])

    print "getting training and test data"
    x_train = np.asarray(movie_vectors[:training_data])
    x_test = np.asarray(movie_vectors[training_data:])
    # Transpose to per-class rows so rare classes can be filtered out.
    movie_labels = zip(*movie_labels)
    file_names, movie_labels = self.getSampledData(file_names, movie_labels,
                                                   amount_to_cut_at, largest_cut)
    # Transpose back to per-movie rows before the train/test split.
    movie_labels = zip(*movie_labels)
    y_train = movie_labels[:training_data]
    y_test = movie_labels[training_data:]
    # One row per class again, ready for the per-class SVMs.
    y_train = np.asarray(zip(*y_train))
    y_test = np.asarray(zip(*y_test))
    print len(y_train), len(y_test), training_data

    print "getting kappa scores"
    kappa_scores, directions = self.runAllSVMs(y_test, y_train, x_train, x_test, file_names)

    dt.write1dArray(kappa_scores, "SVMResults/" + name_distinction + ".scores")
    dt.write1dArray(file_names, "SVMResults/" + name_distinction + ".names")
    # Written under "Directions/" to match the path the clustering step reads.
    dt.write2dArray(directions, "Directions/" + name_distinction + ".directions")
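# Hedged sketch of runAllSVMs: one linear SVM per class, scored with Cohen's
# kappa on the held-out split, with the learned hyperplane normal kept as that
# class's direction. scikit-learn is an assumption here; the original may use
# its own SVM wrapper.
from sklearn import svm
from sklearn.metrics import cohen_kappa_score

def runAllSVMs(self, y_test, y_train, x_train, x_test, file_names):
    kappa_scores, directions = [], []
    for labels_train, labels_test in zip(y_train, y_test):
        clf = svm.LinearSVC()
        clf.fit(x_train, labels_train)
        predictions = clf.predict(x_test)
        kappa_scores.append(cohen_kappa_score(labels_test, predictions))
        directions.append(clf.coef_[0])  # hyperplane normal = direction
    return kappa_scores, directions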
def outputPhrases():
    IDs = dt.importString("filmdata/filmIDs.txt")
    unique_phrases = dt.importString("filmdata/uniquePhrases.txt")
    vectors_maintained, vectors = getVectors(IDs, unique_phrases)
    dt.write2dArray(vectors, "filmdata/classesPhrases/class-all")
    dt.write2dArray(vectors_maintained, "filmdata/classesPhrases/nonbinary/class-all")
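# Hypothetical sketch of getVectors: for each film, mark which of the unique
# phrases occur in that film's file. The per-film path layout used here is an
# assumption based on the file names used elsewhere in the project.
def getVectors(IDs, unique_phrases):
    phrase_index = {p: i for i, p in enumerate(unique_phrases)}
    vectors, vectors_maintained = [], []
    for ID in IDs:
        binary = [0] * len(unique_phrases)  # presence/absence per phrase
        counts = [0] * len(unique_phrases)  # occurrence counts per phrase
        with open("filmdata/films/" + ID + ".film") as f:  # assumed layout
            for line in f:
                phrase = line.strip()
                if phrase in phrase_index:
                    binary[phrase_index[phrase]] = 1
                    counts[phrase_index[phrase]] += 1
        vectors.append(binary)
        vectors_maintained.append(counts)
    return vectors_maintained, vectors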
"SVMResults/ALL_SCORES_Phrases_AUTOENCODER0.4tanhtanhmse2tanh2004SDA1.txt", "SVMResults/ALL_NAMES_Phrases_AUTOENCODER0.4tanhtanhmse2tanh2004SDA5.txt", "SVMResults/ALL_SCORES_Phrases_AUTOENCODER0.4tanhtanhmse2tanh2004SDA5.txt", "0.4L1 to 0.4L5") """ def maskingNoise(input, amount_of_corruption): amount_of_corruption = len(input[0]) * amount_of_corruption print amount_of_corruption for x in range(int(amount_of_corruption)): for i in input: r = random.randint(0, len(i)-1) while i[r] == 0: r = random.randint(0, len(i)-1) i[r] = 0 return input """ input_size = 200 dt.write2dArray(makeSpaceNoisy(dt.getMovieVectors(input_size=input_size), 0.25), "newdata/spaces/noise"+str(input_size)+".mds") """ """ Given a list of ordered IDs for the 15,0000 films. And get all of the phrases mentioned in those IDs and return that list. Then, given a list of phrases mentioned, create a vector for every movie and return as an array of those vectors. Information is found within the film files. """ def getUniquePhrases(ordered_IDs, count_min, maintain_quantities): phrase_to_count = {} for ID in ordered_IDs: