def __init__(self, discrete_labels_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
    ppmi = dt.importLabels(ppmi_fn)
    ppmi = np.asarray(ppmi)
    phrases = dt.importString(phrases_fn)
    indexes_to_get = []
    if phrases_to_check_fn != "":
        phrases_to_check = dt.importString(phrases_to_check_fn)
        for pc in range(len(phrases_to_check)):
            for p in range(len(phrases)):
                if phrases_to_check[pc] == phrases[p][6:]:
                    indexes_to_get.append(p)
    ppmi = ppmi.transpose()
    print len(ppmi), len(ppmi[0])
    counter = 0
    with open(discrete_labels_fn) as f:
        for line in f:
            exists = True
            if phrases_to_check_fn != "":
                exists = False
                for i in indexes_to_get:
                    if i == counter:
                        exists = True
                        break
            if exists:
                discrete_labels = line.split()
                saveGraph(discrete_labels, ppmi[counter], fn + " " + phrases[counter][6:])
                print phrases[counter]
            counter += 1

def getMatchedLines(file_name, lines_to_match):
    matched_lines = []
    failed_lines = []
    match_names = []
    match_years = []
    for line in lines_to_match:
        match_names.append(re.split(r'\t+', line)[0])
        match_years.append(re.split(r'\t+', line)[1])
    file = open(file_name, "r")
    lines = file.readlines()
    for i in range(len(lines_to_match)):
        matched = False
        last_movie = ""
        for l in range(len(lines)):
            if matched is True and re.split(r'\t+', lines[l])[0] != last_movie:
                break
            split_line = re.split(r'\t+', lines[l])
            split_line[0] = re.sub(r'\s+', '', split_line[0].translate(None, string.punctuation).lower())
            match_names[i] = re.sub(r'\s+', '', match_names[i].translate(None, string.punctuation).lower())
            if split_line[0] == match_names[i]:
                matched_lines.append(lines[l])
                matched = True
                last_movie = re.split(r'\t+', lines[l])[0]
                print "Found a line for " + last_movie
                continue
        if matched:
            print "Matched", lines_to_match[i]
        else:
            failed_lines.append(lines_to_match[i])
            print "Failed", lines_to_match[i]
    dt.write1dArray(failed_lines, "filmdata/KeywordData/failed_second_match.txt")
    dt.write1dArray(matched_lines, "filmdata/KeywordData/matched_lines_NEW.txt")

def getScoreDifferences(name_word_file1, name_score_file1, name_word_file2, name_score_file2, name):
    word_file1 = open(name_word_file1, "r")
    score_file1 = open(name_score_file1, "r")
    word_lines1 = word_file1.readlines()
    score_lines1 = score_file1.readlines()
    scores1 = []
    words1 = []
    for s in score_lines1:
        scores1.append(float(s.strip()))
    for w in word_lines1:
        words1.append(w.strip())
    word_file2 = open(name_word_file2, "r")
    score_file2 = open(name_score_file2, "r")
    word_lines2 = word_file2.readlines()
    score_lines2 = score_file2.readlines()
    scores2 = []
    words2 = []
    for s in score_lines2:
        scores2.append(float(s))
    for w in word_lines2:
        words2.append(w.strip())
    differences_list = []
    for i in range(len(score_lines1)):
        differences_list.append(scores1[i] - scores2[i])
    most_different_words = [x for (y, x) in sorted(zip(differences_list, words1))]
    differences_list = sorted(differences_list)
    dt.write1dArray(most_different_words, "filmdata/SVM/most_different_words_" + name + ".txt")
    dt.write1dArray(differences_list, "filmdata/SVM/most_different_values_" + name + ".txt")

def getVectors(ordered_IDs, unique_phrases):
    vectors = [[0 for x in range(len(ordered_IDs))] for x in range(len(unique_phrases))]
    vectors_maintained = [[0 for x in range(len(ordered_IDs))] for x in range(len(unique_phrases))]
    multi_dictionary = {}
    dict_mapping = {}
    print "Mapping to memory."
    for i in range(len(ordered_IDs)):
        ordered_IDs[i] = str(ordered_IDs[i])
    for i in range(len(ordered_IDs)):
        ordered_IDs[i] = ordered_IDs[i].strip()
        dict_mapping[ordered_IDs[i]] = i
        if ordered_IDs[i] != "-1":
            file = open("filmdata/Tokens/" + ordered_IDs[i] + ".film", "r")
            lines = file.readlines()[1:]
            for line in lines:
                split_line = line.split()
                multi_dictionary[(ordered_IDs[i], split_line[0])] = int(split_line[1])
            file.close()
        else:
            # Note: split_line here is whatever was left over from the previously processed film;
            # this only stores a zero placeholder entry for IDs that could not be resolved.
            multi_dictionary[(ordered_IDs[i], split_line[0])] = 0
    for up in range(len(unique_phrases)):
        unique_phrases[up] = unique_phrases[up].strip()
    print "Iterating over memory."
    for p in range(13177, 25842, 1):
        for key, value in multi_dictionary.iteritems():
            if key[1] == unique_phrases[p]:
                vectors_maintained[p][dict_mapping[key[0].strip()]] = value
                vectors[p][dict_mapping[key[0]]] = 1
        print unique_phrases[p]
        dt.write1dArray(vectors_maintained[p], "filmdata/classesPhrases/nonbinary/class-" + unique_phrases[p])
        dt.write1dArray(vectors[p], "filmdata/classesPhrases/class-" + unique_phrases[p])
    return vectors_maintained, vectors

def findMissingKeywords(file_name, common_keywords):
    print "?"
    file = open(file_name, "r")
    lines = file.readlines()
    last_film = ""
    movie_strings = dt.importString("filmdata/filmNames.txt")
    standard_strings = []
    indexes = []
    for m in movie_strings:
        m = m[:-5]
        standard_strings.append(m.translate(None, string.punctuation).replace(" ", "").strip().upper())
    for line in lines:
        film_vectors = []
        line = line.strip()
        if len(line) > 2:
            line_split = re.split(r'\t+', line)
            line_split[0] = line_split[0].translate(None, string.punctuation).replace(" ", "").strip().upper()
            file_save = ""
            for m in range(len(standard_strings)):
                if standard_strings[m] == line_split[0]:
                    print "matched", m, standard_strings[m], line_split[0]
                    file_save = str(m)
                    break
            if file_save != "":
                if last_film.strip() != line_split[0].strip() and last_film is not None:
                    print "Succeeded", line_split[0]
                    for m in range(len(standard_strings)):
                        if standard_strings[m] == last_film:
                            indexes.append(m)
                            break
                last_film = line_split[0]
            else:
                print "Failed", line_split[0]
    dt.write1dArray(indexes, "filmdata/MISSING_FROM_MOVIEDATA.txt")

def getIDs(movie_strings):
    ordered_IDs = []
    movie_names = []
    for name in movie_strings:
        movie_names.append(name[:-5])
    id_mappings = open("filmdata/KeywordData/Movie_Most_Common_Keyword_Mapping/films-ids.txt", "r")
    id_mappings_lines = id_mappings.readlines()
    found_name = False
    failed_names = []
    x = 0
    for name in movie_names:
        for line in id_mappings_lines:
            mapping_id = line.split()[0]
            mapping_name = re.split(r'\t+', line)[2]
            if similar(name.upper().strip(), mapping_name.upper().strip()):
                ordered_IDs.append(mapping_id)
                found_name = True
                break
        if found_name is True:
            found_name = False
        else:
            failed_names.append(name)
            ordered_IDs.append(-1)
        x += 1
        print x
    dt.write1dArray(failed_names, "filmdata/KeywordData/NAMES_THAT_FAILED_IDS.txt")
    dt.write1dArray(ordered_IDs, "filmdata/KeywordData/IDsByOriginalOrdering.txt")

def outputTopByVotes(amount_of_votes):
    file = open("filmdata/ratings.list/ratings.list", "r")
    lines = file.readlines()
    top_movies = []
    top_ratings = []
    for line in lines:
        top_ratings.append(int(line.split()[1]))
    top_ratings = np.asarray(top_ratings)
    indices = np.argpartition(top_ratings, -amount_of_votes)[-amount_of_votes:]
    for i in indices:
        just_movie = lines[i].split()[3:]
        just_movie = " ".join(just_movie)
        just_movie = just_movie.split('{')
        just_movie = just_movie[0]
        just_movie = just_movie.split('(')
        try:
            if not re.findall(r'\d+', just_movie[2])[0]:
                del just_movie[2]
            else:
                just_movie[0] = just_movie[0] + "(" + just_movie[1]
                del just_movie[1]
        except IndexError:
            print
        try:
            year = re.findall(r'\d+', just_movie[1])[0]
        except IndexError:
            print "FAILED", just_movie
        if just_movie[0].endswith(' '):
            just_movie[0] = just_movie[0][:-1]
        if just_movie[0].startswith('"') and just_movie[0].endswith('"'):
            just_movie[0] = just_movie[0][1:-1]
        just_movie = just_movie[0] + " " + str(year)
        print just_movie
        top_movies.append(just_movie)
    dt.write1dArray(top_movies, "filmdata/top50000moviesbyvotes.txt")

def getVectorsKeywords(movie_strings, keywords):
    multi_dictionary = {}
    dict_mapping = {}
    movie_names = []
    file_names = dt.getAllFileNames("filmdata\KeywordData\Movie_Most_Common_Keyword_Mapping")
    for i in movie_strings:
        movie_names.append(i.strip()[:-5])
        print i
    print "Mapping to memory."
    for i in file_names:
        try:
            file = open("filmdata/KeywordData/Movie_Most_Common_Keyword_Mapping/" + i, "r")
            lines = file.readlines()
            dict_mapping[movie_strings[int(i)]] = i
            for line in lines:
                line = line.strip()
                multi_dictionary[(movie_strings[int(i)], line)] = 1
            file.close()
        except IOError:
            # File names are numeric movie indexes, so convert before indexing.
            print movie_names[int(i)]
    for up in range(len(keywords)):
        keywords[up] = keywords[up].strip()
    print "Iterating over memory."
    for p in range(len(keywords)):
        vector = [0 for x in range(len(movie_strings))]
        print len(vector)
        for key, value in multi_dictionary.iteritems():
            if key[1] == keywords[p]:
                #print int(dict_mapping[key[0]])
                vector[int(dict_mapping[key[0]])] = 1
        print keywords[p]
        dt.write1dArray(vector, "filmdata/classesKeywords/NewData/class-" + keywords[p])

def outputKeywords():
    movie_strings = dt.importString("filmdata/filmNames.txt")
    movie_data = getMovieDataFromIMDB(movie_strings)
    commonality = 0
    common_keywords = getMostCommonKeywords(0, "filmdata/IMDB_movie_data.txt")
    dt.write1dArray(common_keywords, "filmdata/common_keywords_15k_commanility_" + str(commonality))
    vectors = getKeywordVectors(common_keywords, movie_strings, "")
    dt.write2dArray(vectors, "filmdata/classesKeywords/class-extra-all-commonality-" + str(commonality))

def trimMovieData(file_string):
    new_movie_data = []
    movie_data_file = open(file_string)
    with movie_data_file as myFile:
        for num, line in enumerate(myFile, 1):
            if ("{" not in line and "(V)" not in line and "(TV)" not in line and "(VG)" not in line
                    and len(line) > 2 and line.startswith(" ") is False
                    and line.startswith("\t") is False and line.startswith("\n") is False):
                new_movie_data.append(line[:-1])
    print "dun"
    dt.write1dArray(new_movie_data, file_string + "trimmed")

def getMissingIndexes(index_list, length):
    full_index = range(length)
    for i in index_list:
        i = int(i)
        full_index[i] = -1
    missing_indexes = []
    for i in full_index:
        if i > -1:
            missing_indexes.append(i)
    dt.write1dArray(missing_indexes, "filmdata/missing_indexes_keywords.txt")

def getIMDBKeywordsForMovieNames(movie_names):
    stripped_movie_names = []
    for movie in movie_names:
        stripped_movie_names.append(movie.replace('\n', ''))
    stripped_movie_names = sorted(stripped_movie_names)
    split_names = []
    split_years = []
    for stripped_movie_name in stripped_movie_names:
        split = stripped_movie_name.split()
        split_year = split[len(split)-1]
        split_years.append(split_year)
        split_names.append(stripped_movie_name[:-len(split_year)-1])
    file = open("filmdata\keywords.list\keywords.list", "r")
    lines = file.readlines()
    keywords_list = lines[79748:]
    matched_lines = []
    x = 0
    last_line = keywords_list[0]
    matched = False
    while x < 50000:
        for line in keywords_list:
            split_line = line.rsplit('(', 2)
            movie_name = split_line[0].rstrip()
            if movie_name.startswith('"') and movie_name.endswith('"'):
                movie_name = movie_name[1:-1]
            try:
                movie_year = str(re.findall(r'\d+', split_line[1])[0])
            except IndexError:
                movie_year = "NULL"
            if not movie_name:
                movie_name = "'NULL"
            formatted_line = movie_name.rstrip() + " " + str(movie_year).rstrip()
            if matched is True and formatted_line == last_line:
                matched_lines.append(line)
                print split_names[x], line
            elif matched is False and similar(movie_name.strip().upper(), split_names[x].strip().upper()) and movie_year == split_years[x]:
                matched = True
                matched_lines.append(line)
                print split_names[x], line
            elif matched is True and formatted_line != last_line:
                matched = False
                x = x + 1
            last_line = formatted_line
        print "cycled through"
    print "Found:", x
    dt.write1dArray(matched_lines, "filmdata/imdb_movie_keywords.txt")

def getXLeastSimilarIndex(term, terms_to_match, terms_to_ignore, amt):
    least_similar_term_indexes = []
    for a in range(amt):
        lowest_term = 99999999
        term_index = 0
        for t in range(len(terms_to_match)):
            if dt.checkIfInArray(terms_to_ignore, t) is False:
                s = getSimilarity(term, terms_to_match[t])
                if s < lowest_term and dt.checkIfInArray(least_similar_term_indexes, t) is False:
                    lowest_term = s
                    term_index = t
        least_similar_term_indexes.append(term_index)
    return least_similar_term_indexes

def getXMostSimilarIndex(term, terms_to_match, terms_to_ignore, amt):
    most_similar_term_indexes = []
    for a in range(amt):
        highest_term = 0
        term_index = 0
        for t in range(len(terms_to_match)):
            if dt.checkIfInArray(terms_to_ignore, t) is False:
                s = getSimilarity(term, terms_to_match[t])
                if s > highest_term and dt.checkIfInArray(most_similar_term_indexes, t) is False:
                    highest_term = s
                    term_index = t
        most_similar_term_indexes.append(term_index)
    return most_similar_term_indexes

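# Illustrative usage sketch, not part of the original code: assuming getSimilarity
# returns larger values for more similar direction vectors, this picks the indexes of
# the two directions closest to (and the two furthest from) directions[0], skipping
# directions[0] itself via the ignore list.
def exampleMostAndLeastSimilar(directions):
    closest = getXMostSimilarIndex(directions[0], directions, [0], 2)
    furthest = getXLeastSimilarIndex(directions[0], directions, [0], 2)
    return closest, furthest
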
def __init__(self, directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, fn, percent, percentage_increment, by_vector):
    directions = dt.importVectors(directions_fn)
    vectors = dt.importVectors(vectors_fn)
    cluster_names = dt.importString(cluster_names_fn)
    vector_names = dt.importString(vector_names_fn)
    rankings = self.getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.array(rankings)
    #labels = self.createLabels(rankings, percent)
    #labels = np.asarray(labels)
    discrete_labels = self.createDiscreteLabels(rankings, percentage_increment)
    discrete_labels = np.asarray(discrete_labels)
    if by_vector:
        #labels = labels.transpose()
        discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()
    #dt.write2dArray(labels, "Rankings/" + fn + "P" + str(percent) + ".labels")
    dt.write2dArray(rankings, "Rankings/" + fn + ".space")
    dt.write2dArray(discrete_labels, "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")
    array = []
    short_array = []
    """ Disabled names for quick view now
def getUnformattedTopByVotes(amount_of_votes):
    file = open("filmdata/ratings.list/ratings.list", "r")
    lines = file.readlines()
    top_movies = []
    top_ratings = []
    for line in lines:
        top_ratings.append(int(line.split()[1]))
    top_ratings = np.asarray(top_ratings)
    indices = np.argpartition(top_ratings, -amount_of_votes)[-amount_of_votes:]
    for i in indices:
        just_movie = lines[i].split()[3:]
        just_movie = " ".join(just_movie)
        print just_movie
        top_movies.append(just_movie)
    dt.write1dArray(top_movies, "filmdata/imdb_formatted_top50000.txt")

def reMapPPMI(ordered_IDs, file_names):
    print "Mapping to memory."
    for i in range(len(ordered_IDs)):
        ordered_IDs[i] = str(ordered_IDs[i])
    for i in range(len(ordered_IDs)):
        for f in range(len(file_names)):
            id = file_names[f].split(".")[0]
            if int(ordered_IDs[i]) == int(id) and int(ordered_IDs[i]) != -1:
                print ordered_IDs[i], id
                file = open("filmdata/vectors/Tokens/" + file_names[f])
                lines = file.readlines()
                dt.write1dArray(lines, "filmdata/NewTokens/" + str(i) + ".ppmi")
                file.close()
            elif int(ordered_IDs[i]) == -1:
                dt.write1dArray([[""]], "filmdata/NewTokens/" + str(i) + ".error")

def makeConsistent(file_name, new_file_name):
    new_file = []
    with open(file_name) as my_file:
        for num, line in enumerate(my_file, 1):
            line = line.strip()
            name = line[:-4]
            year = line[len(line)-4:]
            name = name.translate(None, string.punctuation)
            year = year.translate(None, string.punctuation)
            new_line = "\t".join([name, year])
            new_file.append(new_line)
            print new_line
    dt.write1dArray(new_file, new_file_name)

def makeKeywordPPMIVectors(file_name, common_keywords):
    print "?"
    file = open(file_name, "r")
    lines = file.readlines()
    last_film = ""
    movie_strings = dt.importString("filmdata/filmNames.txt")
    standard_strings = []
    for m in movie_strings:
        m = m[:-5]
        standard_strings.append(m.translate(None, string.punctuation).replace(" ", "").strip().upper())
    for line in lines:
        film_vectors = []
        line = line.strip()
        if len(line) > 2:
            line_split = re.split(r'\t+', line)
            line_split[0] = line_split[0].translate(None, string.punctuation).replace(" ", "").strip().upper()
            file_save = ""
            for m in range(len(standard_strings)):
                if standard_strings[m] == line_split[0]:
                    file_save = str(m)
                    break
            if file_save != "":
                file = open("filmdata\KeywordData\Movie_Most_Common_Keyword_Mapping\\" + file_save, "a")
                for keyword in common_keywords:
                    if line_split[2] == keyword.strip():
                        film_vectors.append("line")
                        file.write(keyword)
                        break
                if last_film.strip() != line_split[0].strip() and last_film is not None:
                    print "Succeeded", line_split[0]
                    file.close()
                last_film = line_split[0]
            else:
                print "Failed", line_split[0]

def getKNeighbors(vector_path="filmdata/films200.mds/films200.mds", class_path="filmdata/classesGenres/class-All",
                  n_neighbors=1, algorithm="kd_tree", leaf_size=30, training_data=10000, name="normal200"):
    movie_vectors = np.asarray(dt.importVectors(vector_path))
    movie_labels = np.asarray(dt.importLabels(class_path))
    x_train, y_train, x_test, y_test = dt.splitData(training_data, movie_vectors, movie_labels)
    classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size)
    classifier.fit(x_train, y_train.ravel())
    y_pred = classifier.predict(x_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    dt.write1dArray([f1, accuracy], "KNNScores/" + name + ".score")
    print "F1 " + str(f1), "Accuracy", accuracy

def writeMissing(folder_name):
    print "?"
    file_names = dt.getAllFileNames(folder_name)
    standard = range(15000)
    missing = []
    for i in standard:
        found = False
        for f in file_names:
            if int(f) == int(i):
                found = True
                break
        if found:
            print "found", i
        else:
            missing.append(i)
            print "not found", i
    dt.write1dArray(missing, "filmdata/MISSING_KEYWORD_ITEMS.txt")

def get_code(self, tree, feature_names, class_names, filename):
    left = tree.tree_.children_left
    right = tree.tree_.children_right
    threshold = tree.tree_.threshold
    value = tree.tree_.value
    #print tree.tree_.feature, len(tree.tree_.feature)
    features = []
    for i in tree.tree_.feature:
        if i != -2 or i <= 200:
            features.append(feature_names[i])
    rules_array = []

    def recurse(left, right, threshold, features, node):
        if threshold[node] != -2:
            line = "IF ( " + features[node] + " <= " + str(threshold[node]) + " ) {"
            rules_array.append(line)
            if left[node] != -1:
                recurse(left, right, threshold, features, left[node])
            line = "} ELSE {"
            rules_array.append(line)
            if right[node] != -1:
                recurse(left, right, threshold, features, right[node])
            line = "}"
            rules_array.append(line)
        else:
            if value[node][0][0] >= value[node][0][1]:
                line = "return", class_names[0]
                rules_array.append(line)
            else:
                line = "return", class_names[1]
                rules_array.append(line)

    recurse(left, right, threshold, features, 0)
    dt.write1dArray(rules_array, "Rules/Statements/" + filename + ".rules")
    cleaned = jsbeautifier.beautify_file("Rules/Statements/" + filename + ".rules")
    file = open("Rules/Statements/" + filename + ".rules", "w")
    file.write(cleaned)
    file.close()

def getMostCommonKeywords(top_value, file_name, keyword_file, value_file):
    common_keywords = []
    file = open(file_name, "r")
    lines = file.readlines()
    keywords = defaultdict(int)
    for line in lines:
        if len(line.split()) > 0:
            line_split = line.split()
            keyword = line_split[len(line_split)-1]
            keywords[keyword] += 1
            print line
    sorted_dict = sorted(keywords.iteritems(), key=lambda x: -x[1])[:top_value]
    print sorted_dict
    keys = []
    values = []
    for key, value in sorted_dict:
        keys.append(key)
        values.append(value)
    dt.write1dArray(keys, keyword_file)
    dt.write1dArray(values, value_file)

def getKNearestMovies(data, x, k):
    movie_names = dt.importString("filmdata/filmNames.txt")
    kd_tree = spatial.KDTree(data)
    kd_query = kd_tree.query(x=x, k=k)
    nearest_distances = kd_query[0][1:]
    k_nearest = kd_query[1][1:]
    nearest_movies = []
    for k in k_nearest:
        nearest_movies.append(movie_names[k].strip())
    print nearest_movies
    return nearest_movies, nearest_distances

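# Illustrative usage sketch, not part of the original code: query the KD-tree for the
# six nearest rows of a loaded space. getKNearestMovies drops the first hit (the query
# point itself), so this yields the five nearest other films. The path is the default
# used elsewhere in this module and assumes the space and filmNames.txt line up row for row.
def exampleNearestMovies():
    space = dt.importVectors("filmdata/films200.mds/films200.mds")
    names, distances = getKNearestMovies(space, space[0], 6)
    print names, distances
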
def __init__(self, epochs=1, learn_rate=0.01, loss="mse", batch_size=1, decay=1e-06,
             hidden_activation="tanh", layer_init="glorot_uniform", output_activation="tanh",
             hidden_layer_size=100, file_name="unspecified_filename", vector_path=None, reg=0,
             optimizer_name="rmsprop", class_names=None, noise=0, output_weights=None):
    # Initialize the model
    self.model = Sequential()
    # Import the numpy vectors
    try:
        movie_vectors = np.asarray(np.load(vector_path))
    except OSError:
        # If it fails, assume that it's in a standard format for vectors and then save it in numpy format
        movie_vectors = dt.importVectors(vector_path)
        movie_vectors = np.asarray(movie_vectors)
        np.save(file_name, movie_vectors)
    # Set the input and the output to be the same size, as this is an auto-encoder
    input_size = len(movie_vectors[0])
    output_size = len(movie_vectors[0])
    if noise > 0:
        # If using a noisy autoencoder, add a GaussianNoise layer to the start of the encoder
        self.model.add(GaussianNoise(noise, input_shape=(input_size,)))
        self.model.add(Dense(output_dim=hidden_layer_size, input_dim=input_size, init=layer_init,
                             activation=hidden_activation, W_regularizer=l2(reg)))
    else:
        # Otherwise just add the hidden layer
        self.model.add(Dense(output_dim=hidden_layer_size, input_dim=input_size, init=layer_init,
                             activation=hidden_activation, W_regularizer=l2(reg)))
    # If using custom weights from the hidden layer to the output layer, apply those custom weights.
    # Otherwise just add the output layer.
    if output_weights is None:
        self.model.add(Dense(output_dim=output_size, init=layer_init, activation=output_activation))
    else:
        self.model.add(Dense(output_dim=len(output_weights[0]), init=layer_init,
                             activation=output_activation, weights=output_weights))
    # Compile the model and fit it to the data
    if optimizer_name == "sgd":
        optimizer = SGD(lr=learn_rate, decay=decay)
    elif optimizer_name == "rmsprop":
        optimizer = RMSprop(lr=learn_rate)
    self.model.compile(loss=loss, optimizer=optimizer)
    self.model.fit(movie_vectors, movie_vectors, nb_epoch=epochs, batch_size=batch_size, verbose=1)
    # Create a truncated model with no output layer that shares the trained hidden-layer weights,
    # and use it to obtain the hidden layer representation
    truncated_model = Sequential()
    total_file_name = "newdata/spaces/" + file_name + ".mds"
    truncated_model.add(GaussianNoise(noise, input_shape=(input_size,)))
    truncated_model.add(Dense(output_dim=hidden_layer_size, input_dim=input_size, init=layer_init,
                              activation=hidden_activation, W_regularizer=l2(reg),
                              # copy the trained hidden-layer weights instead of re-initializing them
                              weights=self.model.layers[-2].get_weights()))
    truncated_model.compile(loss=loss, optimizer=optimizer)
    self.end_space = truncated_model.predict(movie_vectors)
    # np.save takes the file name first, then the array
    np.save(total_file_name, self.end_space)

def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
    directions = dt.importVectors(directions_fn)
    scores = dt.importString(scores_fn)
    names = dt.importString(names_fn)
    for s in range(len(scores)):
        scores[s] = float(scores[s].strip())
    high_direction_indexes = []
    high_direction_scores = []
    low_direction_indexes = []
    low_direction_scores = []
    for s in range(len(scores)):
        if scores[s] >= high_threshold:
            high_direction_indexes.append(s)
            high_direction_scores.append(scores[s])
        elif scores[s] >= low_threshold:
            low_direction_indexes.append(s)
            low_direction_scores.append(scores[s])
    sorted_h_indexes = dt.sortByArray(high_direction_indexes, high_direction_scores)
    sorted_l_indexes = dt.sortByArray(low_direction_indexes, low_direction_scores)
    sorted_h_indexes.reverse()
    sorted_l_indexes.reverse()
    high_direction_names = []
    low_direction_names = []
    high_directions = []
    low_directions = []
    for s in sorted_h_indexes:
        high_directions.append(directions[s])
        high_direction_names.append(names[s][6:])
    for s in sorted_l_indexes:
        low_directions.append(directions[s])
        low_direction_names.append(names[s][6:])
    return high_direction_names, low_direction_names, high_directions, low_directions

def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn, label_names_fn, cluster_names_fn,
             filename, training_data, cluster_to_classify, max_depth):
    vectors = dt.importVectors(cluster_vectors_fn)
    labels = dt.importLabels(cluster_labels_fn)
    cluster_names = dt.importString(cluster_names_fn)
    vector_names = dt.importString(movie_names_fn)
    label_names = dt.importString(label_names_fn)
    scores_array = []
    for l in range(len(labels[0])):
        new_labels = [0] * 15000
        for x in range(len(labels)):
            new_labels[x] = labels[x][l]
        x_train = np.asarray(vectors[:training_data])
        x_test = np.asarray(vectors[training_data:])
        y_train = np.asarray(new_labels[:training_data])
        y_test = np.asarray(new_labels[training_data:])
        self.clf = tree.DecisionTreeClassifier(max_depth=max_depth)
        self.clf = self.clf.fit(x_train, y_train)
        y_pred = self.clf.predict(x_test)
        f1 = f1_score(y_test, y_pred, average='binary')
        accuracy = accuracy_score(y_test, y_pred)
        scores = [[label_names[l], "f1", f1, "accuracy", accuracy]]
        print scores[0]
        scores_array.append(scores)
        class_names = [label_names[l], "NOT " + label_names[l]]
        tree.export_graphviz(self.clf, feature_names=cluster_names, class_names=class_names,
                             out_file='Rules/' + label_names[l] + filename + '.dot', max_depth=10)
        """
        rewrite_dot_file = dt.importString('Rules/'+filename+label_names[l]+'.dot')
        new_dot_file = []
        for s in rewrite_dot_file:
            new_string = s
            if "->" not in s and "digraph" not in s and "node" not in s and "(...)" not in s and "}" not in s:
                index = s.index("value")
                new_string = s[:index] + '"] ;'
            new_dot_file.append(new_string)
        dt.write1dArray(new_dot_file, 'Rules/Cleaned'+filename+label_names[l]+'.dot')
        """
        graph = pydot.graph_from_dot_file('Rules/' + label_names[l] + filename + '.dot')
        graph.write_png('Rules/Images/' + label_names[l] + filename + ".png")
        self.get_code(self.clf, cluster_names, class_names, label_names[l] + filename)
    dt.write1dArray(scores_array, 'Rules/Scores/' + filename + '.scores')

def getMovieDataFromIMDB(movie_strings):
    movie_data = []
    write_line_file = open("filmdata/Found_Missing.txt", "w")
    failed_movies = []
    names = []
    years = []
    for movie_string in movie_strings:
        names.append(movie_string[:-6])
        years.append(movie_string[-5:].strip())
    for n in range(len(names)):
        found = False
        last_name = ""
        old_num = 0
        with open("filmdata/IMDB_movie_data.txttrimmed") as myFile:
            for num, line in enumerate(myFile, 1):
                num = old_num
                split_line = re.split(r'\t+', line)
                movie_string = split_line[0]
                movie_name = movie_string.split()
                del movie_name[len(movie_name)-1]
                movie_name = " ".join(movie_name)
                if found is True and last_name != movie_name:
                    break
                movie_year = int(re.findall(r'\d+', movie_string.split()[len(movie_string.split())-1])[0])
                if similar(names[n].upper().strip(), movie_name.upper().strip()) and int(years[n]) == int(movie_year):
                    movie_data.append(line)
                    write_line_file.write(line)
                    found = True
                    old_num = num
                    last_name = movie_name
        if found is False:
            failed_movies.append(names[n])
            print "FAILED:", names[n]
    print "Total failed", len(failed_movies)
    write_line_file.close()
    dt.write1dArray(failed_movies, "filmdata/failed_movies_final_push.txt")
    return movie_data

def getNextClusterTerm(cluster_terms, terms_to_match, terms_to_ignore, amt):
    min_value = 999999999999999
    min_index = 0
    for t in range(len(terms_to_match)):
        max_value = 0
        if dt.checkIfInArray(terms_to_ignore, t) is False:
            for c in range(len(cluster_terms)):
                s = getSimilarity(cluster_terms[c], terms_to_match[t])
                if s > max_value:
                    max_value = s
            if max_value < min_value:
                min_value = max_value
                min_index = t
    return min_index

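# Illustrative usage sketch, not part of the original code: getNextClusterTerm makes a
# max-min ("farthest-first") choice, returning the candidate whose best similarity to any
# existing cluster term is lowest, so calling it repeatedly builds a spread-out set of
# cluster seeds in the way createTermClusters below does.
def exampleGrowClusterSeeds(terms, amt_of_seeds):
    seed_ids = [0]
    seeds = [terms[0]]
    while len(seeds) < amt_of_seeds:
        next_id = getNextClusterTerm(seeds, terms, seed_ids, 1)
        seed_ids.append(next_id)
        seeds.append(terms[next_id])
    return seed_ids
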
def makeConsistentKeywords(file_name, new_file_name):
    new_file = []
    with open(file_name) as my_file:
        for num, line in enumerate(my_file, 1):
            if "{" in line or "?" in line:
                continue
            split_line = re.split(r'\t+', line)
            split_on_bracket = split_line[0].split(" (")
            if split_on_bracket[1].startswith("1") == False and split_on_bracket[1].startswith("2") == False:
                year = split_on_bracket[2][:4]
                name = "".join([split_on_bracket[0], split_on_bracket[1]])
            else:
                year = split_on_bracket[1][:4]
                name = split_on_bracket[0]
            name = name.translate(None, string.punctuation)
            year = year.translate(None, string.punctuation)
            keyword = re.sub(r'\s+', '', split_line[1]).translate(None, string.punctuation)
            name_and_year = "\t".join([name, year])
            new_line = "\t".join([name_and_year, keyword])
            new_file.append(new_line)
            print new_line
    dt.write1dArray(new_file, new_file_name)

def getVectorsIO(ordered_IDs, unique_phrases):
    vectors = [[0 for x in range(len(ordered_IDs))] for x in range(len(unique_phrases))]
    vectors_maintained = [[0 for x in range(len(ordered_IDs))] for x in range(len(unique_phrases))]
    for p in range(11212, 25842, 1):
        unique_phrases[p] = unique_phrases[p].strip()
        for i in range(len(ordered_IDs)):
            ordered_IDs[i] = ordered_IDs[i].strip()
            if ordered_IDs[i] != "-1":
                file = open("filmdata/Tokens/" + ordered_IDs[i] + ".film", "r")
                lines = file.readlines()[1:]
                for line in lines:
                    split_line = line.split()
                    split_line[1] = split_line[1].strip()
                    # Compare against the phrase itself; the original compared the token to the integer index p,
                    # which could never match.
                    if split_line[0] == unique_phrases[p]:
                        vectors_maintained[p][i] = split_line[1]
                        vectors[p][i] = 1
                file.close()
        print unique_phrases[p]
        dt.write1dArray(vectors_maintained[p], "filmdata/classesPhrases/nonbinary/class-" + unique_phrases[p])
        dt.write1dArray(vectors[p], "filmdata/classesPhrases/class-" + unique_phrases[p])
    return vectors_maintained, vectors

def __init__(self, name_distinction="", class_names=None, vector_path=None, class_path=None, class_by_class=True,
             input_size=200, training_data=10000, amount_of_scores=400, low_kappa=0.1, high_kappa=0.5,
             rankSVM=False, amount_to_cut_at=100, largest_cut=21470000):
    print "getting movie data"
    movie_vectors = dt.importVectors(vector_path)
    movie_labels = dt.importLabels(class_path)
    print "getting file names"
    file_names = dt.getFns(class_path[:-10])
    print len(movie_labels), len(movie_labels[0])
    print "getting training and test data"
    x_train = np.asarray(movie_vectors[:training_data])
    x_test = np.asarray(movie_vectors[training_data:])
    movie_labels = zip(*movie_labels)
    file_names, movie_labels = self.getSampledData(file_names, movie_labels, amount_to_cut_at, largest_cut)
    movie_labels = zip(*movie_labels)
    y_train = movie_labels[:training_data]
    y_test = movie_labels[training_data:]
    y_train = np.asarray(zip(*y_train))
    y_test = np.asarray(zip(*y_test))
    print len(y_train), len(y_test), training_data
    print "getting kappa scores"
    kappa_scores, directions = self.runAllSVMs(y_test, y_train, x_train, x_test, file_names)
    dt.write1dArray(kappa_scores, "SVMResults/" + name_distinction + ".scores")
    dt.write1dArray(file_names, "SVMResults/" + name_distinction + ".names")
    dt.write2dArray(directions, "directions/" + name_distinction + ".directions")

def __init__(self, low_threshold, high_threshold, filename):
    hdn, ldn, hd, ld = self.splitDirections("Directions/" + filename + ".directions",
                                            "SVMResults/" + filename + ".scores",
                                            "SVMResults/" + filename + ".names",
                                            low_threshold, high_threshold)
    least_similar_cluster_names, cluster_name_dict, least_similar_clusters = self.createTermClusters(hd, ld, hdn, ldn)
    dt.write1dArray(least_similar_cluster_names,
                    "Clusters/" + filename + "LeastSimilarHIGH" + str(high_threshold) + "," + str(low_threshold) + ".names")
    dt.write2dArray(least_similar_clusters,
                    "Clusters/" + filename + "LeastSimilarHIGH" + str(high_threshold) + "," + str(low_threshold) + ".clusters")
    dt.writeArrayDict(cluster_name_dict,
                      "Clusters/" + filename + "MostSimilarCLUSTER" + str(high_threshold) + "," + str(low_threshold) + ".names")

def createTermClusters(self, hv_directions, lv_directions, hv_names, lv_names):
    least_similar_clusters = []
    least_similar_cluster_ids = []
    least_similar_cluster_names = []
    directions_to_add = []
    names_to_add = []
    print "Overall amount of HV directions: ", len(hv_directions)
    # Create high-valued clusters
    amt_of_clusters = len(hv_directions[0]) * 2
    for i in range(len(hv_directions)):
        if i == 0:
            least_similar_cluster_ids.append(i)
            least_similar_clusters.append(hv_directions[i])
            least_similar_cluster_names.append(hv_names[i])
            print "Least Similar Term", hv_names[i]
        elif i >= amt_of_clusters:
            directions_to_add.append(hv_directions[i])
            names_to_add.append(hv_names[i])
            print "Added", hv_names[i], "To the remaining directions to add"
        else:
            ti = getNextClusterTerm(least_similar_clusters, hv_directions, least_similar_cluster_ids, 1)
            least_similar_cluster_ids.append(ti)
            least_similar_clusters.append(hv_directions[ti])
            least_similar_cluster_names.append(hv_names[ti])
            print str(i + 1) + "/" + str(amt_of_clusters), "Least Similar Term", hv_names[ti]
    # Add remaining high value directions to the low value direction list
    directions_to_add.reverse()
    names_to_add.reverse()
    for i in range(len(directions_to_add)):
        lv_directions.insert(0, directions_to_add[i])
        lv_names.insert(0, names_to_add[i])
    # Initialize dictionaries for printing / visualizing
    cluster_name_dict = OrderedDict()
    for c in least_similar_cluster_names:
        cluster_name_dict[c] = []
    # For every low value direction, find the high value direction it is most similar to and append it to that cluster
    every_cluster_direction = []
    for i in least_similar_clusters:
        every_cluster_direction.append([i])
    # Reversing so that the top names and directions are first
    lv_names.reverse()
    lv_directions.reverse()
    # Finding the most similar directions to each cluster_centre
    # Creating a dictionary of {cluster_centre: [cluster_direction(1), ..., cluster_direction(n)]} pairs
    for d in range(len(lv_directions)):
        i = getXMostSimilarIndex(lv_directions[d], least_similar_clusters, [], 1)[0]
        every_cluster_direction[i].append(lv_directions[d])
        print str(d + 1) + "/" + str(len(lv_directions)), "Most Similar to", lv_names[d], "Is", least_similar_cluster_names[i]
        cluster_name_dict[least_similar_cluster_names[i]].append(lv_names[d])
    # Mean of all directions = cluster direction
    cluster_directions = []
    for l in range(len(least_similar_clusters)):
        cluster_directions.append(dt.mean_of_array(every_cluster_direction[l]))
    """
    # Get the 10 most similar and least similar directions to save later
    most_similar = []
    least_similar = []
    most_similar_indexes = []
    least_similar_indexes = []
    indexes_to_find = []
    for k in sorted(cluster_amt_dict, key=cluster_amt_dict.get, reverse=True):
        name_to_get_most_similar = k
        index_to_find = dt.getIndexInArray(hv_names, name_to_get_most_similar)
        amt = 10
        indexes_to_find.append(index_to_find)
        most_similar_index = getXMostSimilarIndex(hv_directions[index_to_find], hv_directions, [index_to_find], amt)
        least_similar_index = getXLeastSimilarIndex(hv_directions[index_to_find], hv_directions, [index_to_find], amt)
        most_similar_indexes.append(most_similar_index)
        least_similar_indexes.append(least_similar_index)
    for m in range(len(most_similar_indexes)):
        line_to_append = []
        for v in range(len(most_similar_indexes[m])):
            line_to_append.append(hv_names[most_similar_indexes[m][v]])
        most_similar.append([cluster_dict_names[m][0], line_to_append])
    for l in range(len(least_similar_indexes)):
        line_to_append = []
        for v in range(len(least_similar_indexes[l])):
            line_to_append.append(hv_names[least_similar_indexes[l][v]])
        least_similar.append([cluster_dict_names[l][0], line_to_append])
    """
    return least_similar_cluster_names, cluster_name_dict, cluster_directions

def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
    ppmi = dt.importLabels(ppmi_fn)
    ppmi = np.asarray(ppmi)
    phrases = dt.importString(phrases_fn)
    indexes_to_get = []
    if phrases_to_check_fn != "":
        phrases_to_check = dt.importString(phrases_to_check_fn)
        for pc in range(len(phrases_to_check)):
            for p in range(len(phrases)):
                if phrases_to_check[pc] == phrases[p][6:]:
                    indexes_to_get.append(p)
    ppmi = ppmi.transpose()
    print len(ppmi), len(ppmi[0])
    scores = []
    pvalues = []
    scores_kendall = []
    pvalues_kendall = []
    counter = 0
    averages = []
    with open(direction_fn) as f:
        for line in f:
            # The original checked "indexes_to_get is not []", which is always True;
            # test for a non-empty list so the else branch is reachable.
            if len(indexes_to_get) > 0:
                for i in indexes_to_get:
                    if i == counter:
                        total = 0
                        amt = 0
                        direction = line.split()
                        for d in range(len(direction)):
                            direction[d] = float(direction[d])
                        new_direction = []
                        new_ppmi = []
                        direction_rank = np.argsort(direction)
                        ppmi_rank = np.argsort(ppmi[counter])
                        for d in range(len(ppmi[counter])):
                            if ppmi[counter][d] != 0:
                                total += ppmi[counter][d]
                                amt += 1
                                new_direction.append(direction_rank[d])
                                new_ppmi.append(ppmi_rank[d])
                        average = total / amt
                        rho, pvalue = spearmanr(new_ppmi, new_direction)
                        # The Kendall scores were appended below but never computed in the original.
                        rhok, pvaluek = kendalltau(new_ppmi, new_direction)
                        scores.append(rho)
                        pvalues.append(pvalue)
                        scores_kendall.append(rhok)
                        pvalues_kendall.append(pvaluek)
                        averages.append(average)
                        print phrases[counter] + ":", rho, pvalue, average
            else:
                direction = line.split()
                for d in range(len(direction)):
                    direction[d] = float(direction[d])
                direction_rank = np.argsort(direction)
                ppmi_rank = np.argsort(ppmi[counter])
                rho, pvalue = spearmanr(direction_rank, ppmi_rank)
                scores.append(rho)
                pvalues.append(pvalue)
                print phrases[counter] + ":", rho, pvalue
            counter += 1
    dt.write1dArray(scores, "RuleType/s" + fn + ".score")
    dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
    dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
    dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
    dt.write1dArray(phrases, "RuleType/" + fn + ".names")
    dt.write1dArray(averages, "RuleType/" + fn + ".averages")

def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
    ppmi = dt.importLabels(ppmi_fn)
    ppmi = np.asarray(ppmi)
    phrases = dt.importString(phrases_fn)
    indexes_to_get = []
    if phrases_to_check_fn != "":
        phrases_to_check = dt.importString(phrases_to_check_fn)
        for pc in range(len(phrases_to_check)):
            for p in range(len(phrases)):
                if phrases_to_check[pc] == phrases[p][6:]:
                    indexes_to_get.append(p)
        indexes_to_get.sort()
    ppmi = ppmi.transpose()
    print len(ppmi), len(ppmi[0])
    scores = []
    pvalues = []
    scores_kendall = []
    pvalues_kendall = []
    agini = []
    agini1 = []
    angini1 = []
    angini = []
    amap = []
    andcg = []
    counter = 0
    averages = []
    with open(direction_fn) as f:
        for line in f:
            exists = True
            if phrases_to_check_fn != "":
                exists = False
                for i in indexes_to_get:
                    if i == counter:
                        exists = True
                        break
            if exists:
                total = 0
                amt = 0
                direction = line.split()
                for d in range(len(direction)):
                    direction[d] = float(direction[d])
                new_direction = []
                new_ppmi = []
                direction_rank = np.argsort(direction)
                ppmi_rank = np.argsort(ppmi[counter])
                for d in range(len(ppmi[counter])):
                    if ppmi[counter][d] != 0:
                        total += ppmi[counter][d]
                        amt += 1
                        new_direction.append(direction_rank[d])
                        new_ppmi.append(ppmi_rank[d])
                average = total / amt
                min_max_scaler = preprocessing.MinMaxScaler()
                normalized_ppmi = min_max_scaler.fit_transform(ppmi[counter])
                normalized_dir = min_max_scaler.fit_transform(direction)
                ginis = gini(normalized_ppmi, normalized_dir)
                ranked_ppmi = dt.sortByArray(new_ppmi, new_direction)
                nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi)
                ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi))
                #binarizer = preprocessing.Binarizer()
                #binary_ppmi = binarizer.transform(normalized_ppmi)
                #normalized_dir = np.ndarray.tolist(normalized_dir)
                map = 0  #average_precision_score(normalized_ppmi, normalized_dir)
                rho, pvalue = spearmanr(new_ppmi, new_direction)
                rhok, pvaluek = kendalltau(new_ppmi, new_direction)
                scores.append(rho)
                pvalues.append(pvalue)
                scores_kendall.append(rhok)
                pvalues_kendall.append(pvaluek)
                andcg.append(ndcgs)
                agini.append(ginis)
                amap.append(map)
                averages.append(average)
                print phrases[counter] + ":", map, ginis
            counter += 1
    dt.write1dArray(scores, "RuleType/s" + fn + ".score")
    dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
    dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
    dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
    dt.write1dArray(phrases, "RuleType/" + fn + ".names")
    dt.write1dArray(averages, "RuleType/" + fn + ".averages")
    dt.write1dArray(agini, "RuleType/gn" + fn + ".score")
    dt.write1dArray(andcg, "RuleType/ndcg" + fn + ".score")
    dt.write1dArray(amap, "RuleType/map" + fn + ".score")