def getMatchedLines(file_name, lines_to_match):
    """Collect the lines of ``file_name`` that belong to the wanted films.

    Both inputs hold tab-separated records whose first field is a film name.
    Names are compared after lower-casing and removing punctuation and
    whitespace.  For every wanted film the first run of consecutive file
    lines with a matching name is collected; the scan stops as soon as the
    raw name changes after a match.

    Args:
        file_name: Path of the tab-separated file to search.
        lines_to_match: Tab-separated lines; only the first field (the film
            name) is used for matching.

    Side effects:
        Writes the unmatched entries to
        ``filmdata/KeywordData/failed_second_match.txt`` and the collected
        lines to ``filmdata/KeywordData/matched_lines_NEW.txt`` through
        ``dt.write1dArray``, and prints progress to stdout.
    """
    def normalise(title):
        # Python 2 ``str.translate(None, chars)`` deletes ``chars``.
        return re.sub(r'\s+', '', title.translate(None, string.punctuation).lower())

    with open(file_name, "r") as source:
        lines = source.readlines()

    # Split and normalise every candidate line once instead of once per
    # wanted film.
    raw_names = [re.split(r'\t+', line)[0] for line in lines]
    normalised_names = [normalise(raw_name) for raw_name in raw_names]

    matched_lines = []
    failed_lines = []
    for wanted in lines_to_match:
        wanted_name = normalise(re.split(r'\t+', wanted)[0])
        matched = False
        last_movie = ""
        for position, line in enumerate(lines):
            # Records of one film are contiguous: once a match has been
            # seen, a different raw name ends the group.
            if matched and raw_names[position] != last_movie:
                break
            if normalised_names[position] == wanted_name:
                matched_lines.append(line)
                matched = True
                last_movie = raw_names[position]
                print("Found a line for " + last_movie)
        if matched:
            print("Matched %s" % (wanted,))
        else:
            failed_lines.append(wanted)
            print("Failed %s" % (wanted,))
    dt.write1dArray(failed_lines, "filmdata/KeywordData/failed_second_match.txt")
    dt.write1dArray(matched_lines, "filmdata/KeywordData/matched_lines_NEW.txt")
def getScoreDifferences(name_word_file1, name_score_file1, name_word_file2, name_score_file2, name):
    """Rank words by the difference between two score files.

    Line ``i`` of each score file is parsed as a float and the difference
    ``score1[i] - score2[i]`` is computed.  The words of the first word file
    are then ordered by ascending difference (ties broken by the word).

    Args:
        name_word_file1: Path of the word list belonging to the first scores.
        name_score_file1: Path of the first score file (one float per line).
        name_word_file2: Unused; kept so existing callers keep working.
        name_score_file2: Path of the second score file (one float per line).
        name: Suffix used in the two output file names.

    Raises:
        IndexError: If the second score file has fewer lines than the first.

    Side effects:
        Writes ``filmdata/SVM/most_different_words_<name>.txt`` and
        ``filmdata/SVM/most_different_values_<name>.txt`` through
        ``dt.write1dArray``.
    """
    # Context managers close each handle as soon as it has been read.
    with open(name_word_file1, "r") as word_file:
        words1 = [word.strip() for word in word_file]
    with open(name_score_file1, "r") as score_file:
        scores1 = [float(score) for score in score_file]
    with open(name_score_file2, "r") as score_file:
        scores2 = [float(score) for score in score_file]

    differences_list = [scores1[i] - scores2[i] for i in range(len(scores1))]
    most_different_words = [word for _, word in sorted(zip(differences_list, words1))]
    differences_list = sorted(differences_list)
    dt.write1dArray(most_different_words, "filmdata/SVM/most_different_words_" + name + ".txt")
    dt.write1dArray(differences_list, "filmdata/SVM/most_different_values_" + name + ".txt")
def outputTopByVotes(amount_of_votes):
    """Write the ``amount_of_votes`` most-voted films as "<title> <year>".

    Reads ``filmdata/ratings.list/ratings.list``, takes the second
    whitespace-separated column of every line as the vote count, and selects
    the lines with the largest counts via ``np.argpartition`` (so the
    selection is not sorted).  The title is everything from the fourth column
    on, cut at the first ``{`` and split on ``(`` to isolate the year.

    Entries whose year cannot be parsed are reported with "FALED" and
    skipped.  Previously such an entry silently reused the year of the
    previously processed film, or raised ``NameError`` when it came first.

    Args:
        amount_of_votes: Number of films to keep.

    Side effects:
        Writes ``filmdata/top50000moviesbyvotes.txt`` through
        ``dt.write1dArray`` and prints every kept entry.
    """
    with open("filmdata/ratings.list/ratings.list", "r") as ratings_file:
        lines = ratings_file.readlines()

    votes = np.asarray([int(line.split()[1]) for line in lines])
    indices = np.argpartition(votes, -amount_of_votes)[-amount_of_votes:]

    top_movies = []
    for i in indices:
        title_field = " ".join(lines[i].split()[3:])
        parts = title_field.split('{')[0].split('(')

        # A title that itself contains "(" produces an extra piece.  When
        # the third piece carries digits, the second piece belongs to the
        # title, so glue it back on.
        if len(parts) > 2 and re.findall(r'\d+', parts[2]):
            parts[0] = parts[0] + "(" + parts[1]
            del parts[1]

        year_digits = re.findall(r'\d+', parts[1]) if len(parts) > 1 else []
        if not year_digits:
            print("FALED %s" % (parts,))
            continue
        year = year_digits[0]

        title = parts[0]
        if title.endswith(' '):
            title = title[:-1]
        if title.startswith('"') and title.endswith('"'):
            title = title[1:-1]

        just_movie = title + " " + str(year)
        print(just_movie)
        top_movies.append(just_movie)
    dt.write1dArray(top_movies, "filmdata/top50000moviesbyvotes.txt")
def getVectors(ordered_IDs, unique_phrases, start=13177, stop=25842):
    """Build per-phrase occurrence vectors over all films, held in memory.

    Every ``filmdata/Tokens/<id>.film`` file is read once (its first line is
    skipped); each remaining line contributes ``phrase count``.  For each
    phrase index in ``[start, stop)`` a count vector and a 0/1 vector, each
    with one column per entry of ``ordered_IDs``, are filled in and written
    to disk.

    Fixes over the previous version:
      * IDs equal to "-1" are skipped.  They used to reuse the last phrase
        of the previously read file, which marked the missing film as
        containing that phrase in the binary vector.
      * The progress message is printed instead of its length.
      * Counts are grouped by phrase while loading, so each phrase no longer
        scans the whole dictionary.
      * The phrase range is a parameter (defaults unchanged) and is clamped
        to the number of phrases.

    Args:
        ordered_IDs: Film IDs; converted to stripped strings in place.
        unique_phrases: Phrases; stripped in place.
        start: First phrase index to process.
        stop: One past the last phrase index to process.

    Returns:
        ``(vectors_maintained, vectors)``: the count matrix and the binary
        matrix, both indexed ``[phrase][film]``.  Rows outside the processed
        range stay all zero.
    """
    print("Mapping to memory.")
    dict_mapping = {}
    counts_by_phrase = {}
    for i in range(len(ordered_IDs)):
        film_id = str(ordered_IDs[i]).strip()
        ordered_IDs[i] = film_id
        # For duplicated IDs the last position wins, as before.
        dict_mapping[film_id] = i
        if film_id == "-1":
            continue
        with open("filmdata/Tokens/" + film_id + ".film", "r") as token_file:
            token_lines = token_file.readlines()[1:]
        for line in token_lines:
            split_line = line.split()
            counts_by_phrase.setdefault(split_line[0], {})[film_id] = int(split_line[1])

    for up in range(len(unique_phrases)):
        unique_phrases[up] = unique_phrases[up].strip()

    film_count = len(ordered_IDs)
    vectors = [[0] * film_count for _ in unique_phrases]
    vectors_maintained = [[0] * film_count for _ in unique_phrases]

    print("Iterating over memory.")
    for p in range(start, min(stop, len(unique_phrases))):
        phrase = unique_phrases[p]
        for film_id, count in counts_by_phrase.get(phrase, {}).items():
            column = dict_mapping[film_id]
            vectors_maintained[p][column] = count
            vectors[p][column] = 1
        print(phrase)
        dt.write1dArray(vectors_maintained[p], "filmdata/classesPhrases/nonbinary/class-" + phrase)
        dt.write1dArray(vectors[p], "filmdata/classesPhrases/class-" + phrase)
    return vectors_maintained, vectors
def getVectorsKeywords(movie_strings, keywords):
    """Write one binary film vector per keyword.

    Each file in the keyword-mapping directory is named after a film index
    and lists that film's keywords, one per line.  For every keyword a 0/1
    vector with one column per entry of ``movie_strings`` is written.

    Fixes over the previous version:
      * The ``IOError`` handler indexed ``movie_names`` with the file-name
        string, which raised ``TypeError``; it now converts to ``int``.
      * Files are closed even when reading fails.
      * The progress message is printed instead of its length.
      * Films are grouped by keyword while loading, so each keyword no
        longer scans every (film, keyword) pair.

    Args:
        movie_strings: Film names indexed by film number.
        keywords: Keywords to emit vectors for; stripped in place.

    Side effects:
        Writes ``filmdata/classesKeywords/NewData/class-<keyword>`` through
        ``dt.write1dArray`` and prints progress to stdout.
    """
    # Raw string: same value as before, without relying on "\K" and "\M"
    # being unrecognised escape sequences.
    file_names = dt.getAllFileNames(r"filmdata\KeywordData\Movie_Most_Common_Keyword_Mapping")

    movie_names = []
    for movie in movie_strings:
        movie_names.append(movie.strip()[:-5])
        print(movie)

    print("Mapping to memory.")
    dict_mapping = {}
    films_by_keyword = {}
    for file_name in file_names:
        try:
            with open("filmdata/KeywordData/Movie_Most_Common_Keyword_Mapping/" + file_name, "r") as keyword_file:
                keyword_lines = keyword_file.readlines()
        except IOError:
            print(movie_names[int(file_name)])
            continue
        movie = movie_strings[int(file_name)]
        dict_mapping[movie] = file_name
        for line in keyword_lines:
            films_by_keyword.setdefault(line.strip(), set()).add(movie)

    for up in range(len(keywords)):
        keywords[up] = keywords[up].strip()

    print("Iterating over memory.")
    for keyword in keywords:
        vector = [0] * len(movie_strings)
        print(len(vector))
        for movie in films_by_keyword.get(keyword, ()):
            vector[int(dict_mapping[movie])] = 1
        print(keyword)
        dt.write1dArray(vector, "filmdata/classesKeywords/NewData/class-" + keyword)
# findMissingKeywords(file_name, common_keywords)
#
# Reads every line of `file_name` and loads the film names from
# "filmdata/filmNames.txt" via dt.importString.  Each film name is normalised
# by dropping its last five characters, removing punctuation and spaces, and
# upper-casing (Python 2 `str.translate(None, ...)`).  For every input line
# longer than two characters (after stripping) the first tab-separated field
# is normalised the same way and looked up in the normalised film names.
# Indexes collected in `indexes` are written to
# "filmdata/MISSING_FROM_MOVIEDATA.txt" through dt.write1dArray; "matched",
# "Succeeded" and "Failed" progress messages go to stdout.
#
# NOTE(review): the original indentation was lost, so the nesting of the
# `if file_save != ""` / `else` / `last_film = ...` statements must be
# confirmed against version control before restructuring.
# NOTE(review): `common_keywords`, `film_vectors` and the value stored in
# `file_save` (beyond being non-empty) are never used.
# NOTE(review): `last_film is not None` is always true because `last_film`
# starts as "" and is only ever assigned strings.
# NOTE(review): the index appended is that of the *previous* film
# (`last_film`), so the last film in the file is presumably never recorded.
# Confirm whether that is intended.
# NOTE(review): the input file handle is never closed.
def findMissingKeywords(file_name, common_keywords): print "?" file = open(file_name, "r") lines = file.readlines() last_film = "" movie_strings = dt.importString("filmdata/filmNames.txt") standard_strings = [] indexes = [] for m in movie_strings: m = m[:-5] standard_strings.append(m.translate(None, string.punctuation).replace(" ", "").strip().upper()) for line in lines: film_vectors = [] line = line.strip() if len(line) > 2: line_split = re.split(r'\t+', line) line_split[0] = line_split[0].translate(None, string.punctuation).replace(" ", "").strip().upper() file_save = "" for m in range(len(standard_strings)): if standard_strings[m] == line_split[0]: print "matched", m, standard_strings[m], line_split[0] file_save = str(m) break if file_save != "": if last_film.strip() != line_split[0].strip() and last_film is not None: print "Succeeded", line_split[0] for m in range(len(standard_strings)): if standard_strings[m] == last_film: indexes.append(m) break last_film = line_split[0] else: print "Failed", line_split[0], dt.write1dArray(indexes, "filmdata/MISSING_FROM_MOVIEDATA.txt")
def getIDs(movie_strings):
    """Look up the mapping ID of every film name.

    Each entry of ``movie_strings`` has its last five characters removed and
    is compared, upper-cased and stripped, against the third tab-separated
    field of every line of ``films-ids.txt`` using ``similar``.  The first
    whitespace-separated token of the first matching line is the film's ID;
    films without a match get ``-1``.

    Changes over the previous version: the mapping file is closed after
    reading, its lines are split and normalised once instead of once per
    film, and lines with fewer than three tab-separated fields are skipped
    instead of raising ``IndexError``.

    Args:
        movie_strings: Film names, each ending in a five-character suffix.

    Side effects:
        Writes ``filmdata/KeywordData/NAMES_THAT_FAILED_IDS.txt`` and
        ``filmdata/KeywordData/IDsByOriginalOrdering.txt`` through
        ``dt.write1dArray`` and prints a running count.
    """
    with open("filmdata/KeywordData/Movie_Most_Common_Keyword_Mapping/films-ids.txt", "r") as id_mappings:
        id_mappings_lines = id_mappings.readlines()

    mappings = []
    for line in id_mappings_lines:
        tab_fields = re.split(r'\t+', line)
        tokens = line.split()
        if len(tab_fields) < 3 or not tokens:
            continue
        mappings.append((tokens[0], tab_fields[2].upper().strip()))

    ordered_IDs = []
    failed_names = []
    for processed, movie_string in enumerate(movie_strings, 1):
        name = movie_string[:-5]
        wanted = name.upper().strip()
        for mapping_id, mapping_name in mappings:
            if similar(wanted, mapping_name):
                ordered_IDs.append(mapping_id)
                break
        else:
            # No mapping line matched this film.
            failed_names.append(name)
            ordered_IDs.append(-1)
        print(processed)
    dt.write1dArray(failed_names, "filmdata/KeywordData/NAMES_THAT_FAILED_IDS.txt")
    dt.write1dArray(ordered_IDs, "filmdata/KeywordData/IDsByOriginalOrdering.txt")
def outputKeywords():
    """Regenerate the common-keyword list and the keyword class vectors.

    Loads the film names, runs the IMDB lookup, collects the most common
    keywords, and writes both the keyword list and the resulting keyword
    vectors under ``filmdata/``.
    """
    film_names = dt.importString("filmdata/filmNames.txt")
    # The returned data is not used here; the call is kept because the
    # lookup is still executed for whatever it does on its own.
    getMovieDataFromIMDB(film_names)

    commonality = 0
    # NOTE(review): called with two arguments and its result is used.
    # Confirm this matches the signature and return value of the
    # getMostCommonKeywords version this file is meant to run against.
    frequent_keywords = getMostCommonKeywords(0, "filmdata/IMDB_movie_data.txt")
    dt.write1dArray(
        frequent_keywords,
        "filmdata/common_keywords_15k_commanility_" + str(commonality))

    keyword_vectors = getKeywordVectors(frequent_keywords, film_names, "")
    dt.write2dArray(
        keyword_vectors,
        "filmdata/classesKeywords/class-extra-all-commonality-" + str(commonality))
def trimMovieData(file_string):
    """Filter a movie data file and save the result next to it.

    A line is kept only when it contains none of ``{``, ``(V)``, ``(TV)``,
    ``(VG)``, is longer than two characters, and does not start with a
    space, tab or newline.  The last character of each kept line is dropped
    before it is stored.  The result is written to
    ``file_string + "trimmed"`` through ``dt.write1dArray``.
    """
    unwanted_markers = ("{", "(V)", "(TV)", "(VG)")
    unwanted_leaders = (" ", "\t", "\n")

    kept_lines = []
    with open(file_string) as data_file:
        for row in data_file:
            if any(marker in row for marker in unwanted_markers):
                continue
            if len(row) <= 2 or row.startswith(unwanted_leaders):
                continue
            kept_lines.append(row[:-1])
    print("dun")
    dt.write1dArray(kept_lines, file_string + "trimmed")
def getMissingIndexes(index_list, length):
    """Write the indexes in ``range(length)`` that are absent from ``index_list``.

    The previous version blanked slots of ``range(length)`` by assignment.
    That only works where ``range`` returns a list (Python 2), and it let
    negative indexes wrap around and hide an unrelated slot.  A set lookup
    avoids both problems; indexes outside ``range(length)`` are now ignored
    instead of raising ``IndexError``.

    Args:
        index_list: Present indexes; each entry is converted with ``int``.
        length: Size of the full index range.

    Side effects:
        Writes ``filmdata/missing_indexes_keywords.txt`` through
        ``dt.write1dArray``.
    """
    present = set(int(index) for index in index_list)
    missing_indexes = [index for index in range(length) if index not in present]
    dt.write1dArray(missing_indexes, "filmdata/missing_indexes_keywords.txt")
def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn, label_names_fn, cluster_names_fn, filename, training_data, cluster_to_classify, max_depth):
    """Train one decision tree per label column and export its rules.

    For every label column a ``DecisionTreeClassifier`` is fitted on the
    first ``training_data`` vectors and scored (F1 and accuracy) on the rest.
    Each tree is exported as a Graphviz ``.dot`` file, rendered to PNG, and
    converted to rule text by ``self.get_code``.

    Changes over the previous version: the label column is sized from the
    number of vectors instead of the hard-coded 15000, the train/test split
    of the vectors is computed once instead of once per label, and the
    unused film-name import and a disabled code block were removed.

    Args:
        cluster_vectors_fn: File with the feature vectors, one per film.
        cluster_labels_fn: File with the label rows, one per film.
        movie_names_fn: Unused; kept for interface compatibility.
        label_names_fn: File with one name per label column.
        cluster_names_fn: File with one name per feature.
        filename: Suffix used in every output file name.
        training_data: Number of leading rows used for training.
        cluster_to_classify: Unused; kept for interface compatibility.
        max_depth: Maximum depth passed to the classifier.

    Side effects:
        Writes files under ``Rules/``, ``Rules/Images/`` and
        ``Rules/Scores/``; leaves the last fitted tree in ``self.clf``.
    """
    vectors = dt.importVectors(cluster_vectors_fn)
    labels = dt.importLabels(cluster_labels_fn)
    cluster_names = dt.importString(cluster_names_fn)
    label_names = dt.importString(label_names_fn)

    # The feature split does not depend on the label being trained.
    x_train = np.asarray(vectors[:training_data])
    x_test = np.asarray(vectors[training_data:])

    scores_array = []
    for l in range(len(labels[0])):
        # One label value per vector; films without a label row stay 0.
        new_labels = [0] * len(vectors)
        for x in range(len(labels)):
            new_labels[x] = labels[x][l]
        y_train = np.asarray(new_labels[:training_data])
        y_test = np.asarray(new_labels[training_data:])

        self.clf = tree.DecisionTreeClassifier(max_depth=max_depth)
        self.clf = self.clf.fit(x_train, y_train)
        y_pred = self.clf.predict(x_test)
        f1 = f1_score(y_test, y_pred, average='binary')
        accuracy = accuracy_score(y_test, y_pred)
        scores = [[label_names[l], "f1", f1, "accuracy", accuracy]]
        print(scores[0])
        scores_array.append(scores)

        class_names = [label_names[l], "NOT " + label_names[l]]
        dot_path = 'Rules/' + label_names[l] + filename + '.dot'
        tree.export_graphviz(self.clf, feature_names=cluster_names, class_names=class_names,
                             out_file=dot_path, max_depth=10)
        graph = pydot.graph_from_dot_file(dot_path)
        graph.write_png('Rules/Images/' + label_names[l] + filename + ".png")
        self.get_code(self.clf, cluster_names, class_names, label_names[l] + filename)
    dt.write1dArray(scores_array, 'Rules/Scores/' + filename + '.scores')
def __init__(self, low_threshold, high_threshold, filename):
    """Split the directions by score threshold, cluster them, and save the result.

    Reads the direction, score and name files for ``filename``, hands them
    to ``self.splitDirections`` and ``self.createTermClusters``, and writes
    the returned cluster names, clusters and name dictionary under
    ``Clusters/``.
    """
    directions_path = "Directions/" + filename + ".directions"
    scores_path = "SVMResults/" + filename + ".scores"
    names_path = "SVMResults/" + filename + ".names"
    hdn, ldn, hd, ld = self.splitDirections(
        directions_path, scores_path, names_path, low_threshold, high_threshold)

    names, name_dict, clusters = self.createTermClusters(hd, ld, hdn, ldn)

    # All three outputs share the "<high>,<low>" threshold tag.
    threshold_tag = str(high_threshold) + "," + str(low_threshold)
    least_similar_stem = "Clusters/" + filename + "LeastSimilarHIGH" + threshold_tag
    dt.write1dArray(names, least_similar_stem + ".names")
    dt.write2dArray(clusters, least_similar_stem + ".clusters")
    dt.writeArrayDict(
        name_dict,
        "Clusters/" + filename + "MostSimilarCLUSTER" + threshold_tag + ".names")
# getIMDBKeywordsForMovieNames(movie_names)
#
# Strips newlines from `movie_names`, sorts them, and splits each entry into
# a name and its last whitespace-separated token (used as the year).  It then
# reads "filmdata\keywords.list\keywords.list", skips the first 79748 lines,
# and walks the remaining lines: each line is split from the right on "(" to
# get a title (surrounding double quotes removed) and the first digit run of
# the next piece as the year ("NULL" when there is none).  Lines that match
# the current wanted film via `similar` and have the same year, plus the
# following lines that format to the same "<title> <year>" string, are
# collected.  The collected lines are written to
# "filmdata/imdb_movie_keywords.txt" through dt.write1dArray.
#
# NOTE(review): the original indentation was lost, so the exact nesting of
# the `while` / `for` / `if`-`elif` chain must be confirmed against version
# control before restructuring.
# NOTE(review): `x` only advances when a matched group ends, so the
# `while x < 50000` loop presumably never terminates if a wanted film has no
# match.  `split_names[x]` raises IndexError once `x` reaches the number of
# names.  Confirm the intended stop condition.
# NOTE(review): `last_line` starts as a raw file line but is later compared
# with formatted "<title> <year>" strings.
# NOTE(review): the path relies on "\k" not being an escape sequence, and
# the file handle is never closed.
def getIMDBKeywordsForMovieNames(movie_names): stripped_movie_names = [] for movie in movie_names: stripped_movie_names.append(movie.replace('\n', '')) stripped_movie_names = sorted(stripped_movie_names) split_names = [] split_years = [] for stripped_movie_name in stripped_movie_names: split = stripped_movie_name.split() split_year = split[len(split)-1] split_years.append(split_year) split_names.append(stripped_movie_name[:-len(split_year)-1]) file = open("filmdata\keywords.list\keywords.list", "r") lines = file.readlines() keywords_list = lines[79748:] matched_lines = [] x = 0 last_line = keywords_list[0] matched = False while x < 50000: for line in keywords_list: split_line = line.rsplit('(', 2) movie_name = split_line[0].rstrip() if movie_name.startswith('"') and movie_name.endswith('"'): movie_name = movie_name[1:-1] try: movie_year = str(re.findall(r'\d+', split_line[1])[0]) except IndexError: movie_year = "NULL" if not movie_name: movie_name = "'NULL" formatted_line = movie_name.rstrip() + " " + str(movie_year).rstrip() if matched is True and formatted_line == last_line: matched_lines.append(line) print split_names[x], line elif matched is False and similar(movie_name.strip().upper(), split_names[x].strip().upper()) and movie_year == split_years[x]: matched = True matched_lines.append(line) print split_names[x], line elif matched is True and formatted_line != last_line: matched = False x = x + 1 last_line = formatted_line print "cycled through" print "Found:", x dt.write1dArray(matched_lines, "filmdata/imdb_movie_keywords.txt")
def makeConsistent(file_name, new_file_name):
    """Rewrite "<name><year>" lines as "<name>\\t<year>" without punctuation.

    Every stripped line is split into its last four characters (the year)
    and everything before them (the name).  Punctuation is removed from both
    parts with the Python 2 ``str.translate(None, ...)`` form, and the parts
    are joined with a tab.  Each new line is printed, and the full list is
    written to ``new_file_name`` through ``dt.write1dArray``.
    """
    rewritten = []
    with open(file_name) as source:
        for raw in source:
            entry = raw.strip()
            title = entry[:-4].translate(None, string.punctuation)
            # Kept as an explicit offset: for entries shorter than four
            # characters it is not the same slice as entry[-4:].
            year = entry[len(entry) - 4:].translate(None, string.punctuation)
            row = title + "\t" + year
            rewritten.append(row)
            print(row)
    dt.write1dArray(rewritten, new_file_name)
def reMapPPMI(ordered_IDs, file_names):
    """Copy token files so that they are named by position in ``ordered_IDs``.

    The part of each file name before the first "." is parsed as an integer
    ID.  For position ``i`` of ``ordered_IDs`` the file with the same ID is
    copied from ``filmdata/vectors/Tokens/`` to ``filmdata/NewTokens/<i>.ppmi``.
    An ID of -1 produces an empty marker file ``<i>.error`` instead.

    Changes over the previous version: file names are indexed by ID once
    instead of being rescanned for every film, the ``.error`` marker is
    written once per missing film rather than once per file name, the
    builtin ``id`` is no longer shadowed, and source files are closed even
    if writing fails.

    Args:
        ordered_IDs: Film IDs; converted to strings in place.
        file_names: Token file names of the form ``<id>.<extension>``.
    """
    print("Mapping to memory.")
    # When several files share an ID the last one wins, matching the
    # overwrite order of the previous nested loop.
    files_by_id = {}
    for file_name in file_names:
        files_by_id[int(file_name.split(".")[0])] = file_name

    for i in range(len(ordered_IDs)):
        ordered_IDs[i] = str(ordered_IDs[i])
        film_id = int(ordered_IDs[i])
        if film_id == -1:
            dt.write1dArray([[""]], "filmdata/NewTokens/" + str(i) + ".error")
            continue
        file_name = files_by_id.get(film_id)
        if file_name is None:
            continue
        print("%s %s" % (ordered_IDs[i], file_name.split(".")[0]))
        with open("filmdata/vectors/Tokens/" + file_name) as token_file:
            lines = token_file.readlines()
        dt.write1dArray(lines, "filmdata/NewTokens/" + str(i) + ".ppmi")
def getUnformattedTopByVotes(amount_of_votes):
    """Write the raw titles of the ``amount_of_votes`` most-voted films.

    Reads ``filmdata/ratings.list/ratings.list``, takes the second
    whitespace-separated column of every line as the vote count, and selects
    the lines with the largest counts via ``np.argpartition`` (so the
    selection is not sorted).  The title is everything from the fourth
    column on, re-joined with single spaces.

    The ratings file is now closed after reading; it used to stay open.

    Args:
        amount_of_votes: Number of films to keep.

    Side effects:
        Writes ``filmdata/imdb_formatted_top50000.txt`` through
        ``dt.write1dArray`` and prints every title.
    """
    with open("filmdata/ratings.list/ratings.list", "r") as ratings_file:
        lines = ratings_file.readlines()

    votes = np.asarray([int(line.split()[1]) for line in lines])
    indices = np.argpartition(votes, -amount_of_votes)[-amount_of_votes:]

    top_movies = []
    for i in indices:
        just_movie = " ".join(lines[i].split()[3:])
        print(just_movie)
        top_movies.append(just_movie)
    dt.write1dArray(top_movies, "filmdata/imdb_formatted_top50000.txt")
def getKNeighbors(vector_path="filmdata/films200.mds/films200.mds", class_path="filmdata/classesGenres/class-All", n_neighbors=1, algorithm="kd_tree", leaf_size=30, training_data=10000, name="normal200"):
    """Fit a k-nearest-neighbours classifier and record its F1 and accuracy.

    Vectors and labels are loaded from the given paths and split by
    ``dt.splitData``.  A ``KNeighborsClassifier`` is fitted on the training
    part and evaluated on the test part with macro-averaged F1 and accuracy.
    The two scores are written to ``KNNScores/<name>.score`` and printed.
    """
    film_vectors = np.asarray(dt.importVectors(vector_path))
    film_labels = np.asarray(dt.importLabels(class_path))
    x_train, y_train, x_test, y_test = dt.splitData(training_data, film_vectors, film_labels)

    knn = neighbors.KNeighborsClassifier(
        n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size)
    knn.fit(x_train, y_train.ravel())
    predicted = knn.predict(x_test)

    f1 = f1_score(y_test, predicted, average='macro')
    accuracy = accuracy_score(y_test, predicted)
    dt.write1dArray([f1, accuracy], "KNNScores/" + name + ".score")
    print("F1 " + str(f1) + " Accuracy " + str(accuracy))
def writeMissing(folder_name, total=15000):
    """Report which numeric file names in ``[0, total)`` are absent from a folder.

    Every file name returned by ``dt.getAllFileNames`` is converted to an
    integer once and stored in a set.  The previous version rescanned and
    re-converted the whole listing for each of the 15000 candidates.

    Args:
        folder_name: Folder whose file names are integers.
        total: Number of expected items; defaults to the former hard-coded
            15000.

    Raises:
        ValueError: If a file name is not an integer.

    Side effects:
        Writes ``filmdata/MISSING_KEYWORD_ITEMS.txt`` through
        ``dt.write1dArray`` and prints one line per candidate.
    """
    print("?")
    present = set(int(file_name) for file_name in dt.getAllFileNames(folder_name))
    missing = []
    for i in range(total):
        if i in present:
            print("found %s" % (i,))
        else:
            missing.append(i)
            print("no found %s" % (i,))
    dt.write1dArray(missing, "filmdata/MISSING_KEYWORD_ITEMS.txt")
def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn, label_names_fn, cluster_names_fn, filename, training_data, cluster_to_classify, max_depth):
    """Fit and export a decision tree for every label column.

    Each label column is learned by a ``DecisionTreeClassifier`` trained on
    the first ``training_data`` vectors and evaluated (F1, accuracy) on the
    remaining ones.  Every tree is saved as a Graphviz ``.dot`` file, drawn
    to a PNG, and turned into rule text through ``self.get_code``.

    Changes over the previous version: the per-label column length follows
    the number of vectors rather than the fixed 15000, the vector split is
    hoisted out of the label loop, and the unused film-name import and a
    disabled code block are gone.

    Args:
        cluster_vectors_fn: File with the feature vectors, one per film.
        cluster_labels_fn: File with the label rows, one per film.
        movie_names_fn: Unused; kept for interface compatibility.
        label_names_fn: File with one name per label column.
        cluster_names_fn: File with one name per feature.
        filename: Suffix used in every output file name.
        training_data: Number of leading rows used for training.
        cluster_to_classify: Unused; kept for interface compatibility.
        max_depth: Maximum depth passed to the classifier.

    Side effects:
        Writes files under ``Rules/``, ``Rules/Images/`` and
        ``Rules/Scores/``; leaves the last fitted tree in ``self.clf``.
    """
    vectors = dt.importVectors(cluster_vectors_fn)
    labels = dt.importLabels(cluster_labels_fn)
    cluster_names = dt.importString(cluster_names_fn)
    label_names = dt.importString(label_names_fn)

    # Identical for every label, so split the vectors only once.
    x_train = np.asarray(vectors[:training_data])
    x_test = np.asarray(vectors[training_data:])

    scores_array = []
    for l in range(len(labels[0])):
        label_name = label_names[l]
        # One label value per vector; films without a label row stay 0.
        new_labels = [0] * len(vectors)
        for x in range(len(labels)):
            new_labels[x] = labels[x][l]
        y_train = np.asarray(new_labels[:training_data])
        y_test = np.asarray(new_labels[training_data:])

        self.clf = tree.DecisionTreeClassifier(max_depth=max_depth)
        self.clf = self.clf.fit(x_train, y_train)
        y_pred = self.clf.predict(x_test)
        f1 = f1_score(y_test, y_pred, average='binary')
        accuracy = accuracy_score(y_test, y_pred)
        scores = [[label_name, "f1", f1, "accuracy", accuracy]]
        print(scores[0])
        scores_array.append(scores)

        class_names = [label_name, "NOT " + label_name]
        dot_path = 'Rules/' + label_name + filename + '.dot'
        tree.export_graphviz(self.clf, feature_names=cluster_names, class_names=class_names,
                             out_file=dot_path, max_depth=10)
        graph = pydot.graph_from_dot_file(dot_path)
        graph.write_png('Rules/Images/' + label_name + filename + ".png")
        self.get_code(self.clf, cluster_names, class_names, label_name + filename)
    dt.write1dArray(scores_array, 'Rules/Scores/' + filename + '.scores')
def get_code(self, tree, feature_names, class_names, filename):
    """Write a fitted decision tree as nested IF/ELSE rule text.

    The tree is walked from the root.  Every split node emits
    ``IF ( <feature> <= <threshold> ) {`` ... ``} ELSE {`` ... ``}``, and
    every leaf emits ``return <class>``, choosing ``class_names[0]`` when
    the first value count is at least the second.  The rules are written to
    ``Rules/Statements/<filename>.rules``, reformatted with ``jsbeautifier``,
    and written back to the same file.

    Fixes over the previous version:
      * Leaf lines were built with a comma (``"return", name``), which made
        them tuples rather than strings; they are plain strings now.
      * The feature filter ``i != -2 or i <= 200`` was always true.  Leaves
        (feature index -2) now get an explicit placeholder instead of
        silently reading ``feature_names[-2]``, and the list still has one
        entry per node so node indexes stay aligned.
      * Leaves are detected by the undefined feature index rather than by a
        threshold equal to -2, which a real split could also have.
      * The output file is closed through a context manager.

    Args:
        tree: Fitted estimator exposing ``tree_``.
        feature_names: Name of every feature column.
        class_names: Two class names; index 0 is used when the first value
            count is greater than or equal to the second.
        filename: Base name of the rules file.
    """
    left = tree.tree_.children_left
    right = tree.tree_.children_right
    threshold = tree.tree_.threshold
    value = tree.tree_.value

    # One entry per node; ``None`` marks a leaf (feature index -2).
    features = [feature_names[i] if i != -2 else None for i in tree.tree_.feature]
    rules_array = []

    def recurse(node):
        if features[node] is not None:
            rules_array.append("IF ( " + features[node] + " <= " + str(threshold[node]) + " ) {")
            if left[node] != -1:
                recurse(left[node])
            rules_array.append("} ELSE {")
            if right[node] != -1:
                recurse(right[node])
            rules_array.append("}")
        elif value[node][0][0] >= value[node][0][1]:
            rules_array.append("return " + class_names[0])
        else:
            rules_array.append("return " + class_names[1])

    recurse(0)
    rules_path = "Rules/Statements/" + filename + ".rules"
    dt.write1dArray(rules_array, rules_path)
    cleaned = jsbeautifier.beautify_file(rules_path)
    with open(rules_path, "w") as rules_file:
        rules_file.write(cleaned)
def getMostCommonKeywords(top_value, file_name, keyword_file, value_file):
    """Count keywords and write the ``top_value`` most frequent ones.

    The last whitespace-separated token of every non-blank line of
    ``file_name`` is treated as a keyword.  Keywords are ranked by
    descending count, and the first ``top_value`` keywords and their counts
    are written to ``keyword_file`` and ``value_file`` respectively.

    Changes over the previous version: the input file is streamed and
    closed instead of being read fully and left open, each line is split
    once instead of twice, and the unused ``common_keywords`` list is
    removed.

    Args:
        top_value: Number of keywords to keep.
        file_name: Path of the input data file.
        keyword_file: Output path for the keywords.
        value_file: Output path for the matching counts.
    """
    keywords = defaultdict(int)
    with open(file_name, "r") as data_file:
        for line in data_file:
            tokens = line.split()
            if tokens:
                keywords[tokens[-1]] += 1
                print(line)

    sorted_dict = sorted(keywords.items(), key=lambda item: -item[1])[:top_value]
    print(sorted_dict)
    keys = [key for key, _ in sorted_dict]
    values = [value for _, value in sorted_dict]
    dt.write1dArray(keys, keyword_file)
    dt.write1dArray(values, value_file)
def __init__(self, low_threshold, high_threshold, filename):
    """Build term clusters from thresholded directions and store them.

    ``self.splitDirections`` is given the direction, score and name files
    for ``filename`` along with both thresholds.  Its four results go to
    ``self.createTermClusters``, whose three results are written to files
    under ``Clusters/`` tagged with ``<high_threshold>,<low_threshold>``.
    """
    split_result = self.splitDirections(
        "Directions/" + filename + ".directions",
        "SVMResults/" + filename + ".scores",
        "SVMResults/" + filename + ".names",
        low_threshold,
        high_threshold)
    hdn, ldn, hd, ld = split_result

    cluster_result = self.createTermClusters(hd, ld, hdn, ldn)
    least_similar_cluster_names, cluster_name_dict, least_similar_clusters = cluster_result

    tag = str(high_threshold) + "," + str(low_threshold)
    output_prefix = "Clusters/" + filename
    dt.write1dArray(
        least_similar_cluster_names,
        output_prefix + "LeastSimilarHIGH" + tag + ".names")
    dt.write2dArray(
        least_similar_clusters,
        output_prefix + "LeastSimilarHIGH" + tag + ".clusters")
    dt.writeArrayDict(
        cluster_name_dict,
        output_prefix + "MostSimilarCLUSTER" + tag + ".names")
def __init__(self, name_distinction="", class_names=None, vector_path=None, class_path=None, class_by_class=True, input_size=200, training_data=10000, amount_of_scores=400, low_kappa=0.1, high_kappa=0.5, rankSVM=False, amount_to_cut_at=100, largest_cut=21470000):
    """Run every SVM and save the kappa scores, names and directions.

    Film vectors and labels are loaded, and the labels are filtered through
    ``self.getSampledData`` (using ``amount_to_cut_at`` and ``largest_cut``).
    Both are split at ``training_data`` before ``self.runAllSVMs`` is
    called.  Results are written under ``SVMResults/`` and ``directions/``
    using ``name_distinction`` as the file stem.

    This code relies on Python 2 ``zip`` returning a list.
    """
    print("getting movie data")
    film_vectors = dt.importVectors(vector_path)
    film_labels = dt.importLabels(class_path)

    print("getting file names")
    # The last ten characters of class_path are dropped to get the folder.
    label_files = dt.getFns(class_path[:-10])
    print("%s %s" % (len(film_labels), len(film_labels[0])))

    print("getting training and test data")
    x_train = np.asarray(film_vectors[:training_data])
    x_test = np.asarray(film_vectors[training_data:])

    # Transpose for sampling, then transpose back before splitting.
    transposed_labels = zip(*film_labels)
    label_files, transposed_labels = self.getSampledData(
        label_files, transposed_labels, amount_to_cut_at, largest_cut)
    sampled_labels = zip(*transposed_labels)

    # Each split is transposed again before being handed to the SVMs.
    y_train = np.asarray(zip(*sampled_labels[:training_data]))
    y_test = np.asarray(zip(*sampled_labels[training_data:]))
    print("%s %s %s" % (len(y_train), len(y_test), training_data))

    print("getting kappa scores")
    kappa_scores, directions = self.runAllSVMs(y_test, y_train, x_train, x_test, label_files)
    dt.write1dArray(kappa_scores, "SVMResults/" + name_distinction + ".scores")
    dt.write1dArray(label_files, "SVMResults/" + name_distinction + ".names")
    dt.write2dArray(directions, "directions/" + name_distinction + ".directions")
def getMovieDataFromIMDB(movie_strings):
    """Collect the IMDB data lines that belong to the given films.

    Every entry of ``movie_strings`` is split into a name (all but the last
    six characters) and a year (the last five characters, stripped).  A data
    line matches when ``similar`` accepts the upper-cased names and the
    years are equal.  For each film the first matching line and the lines
    that follow it with the same name are collected; the scan stops at the
    first different name after a match.

    Changes over the previous version: the data file is read and split once
    instead of being reopened and re-split for every film, the output file
    is closed even when an error occurs, and the no-op ``num``/``old_num``
    bookkeeping is removed.

    Args:
        movie_strings: Film entries of the form "<name> <year>".

    Returns:
        The matching data lines, in the order they were found.

    Side effects:
        Writes the matches to ``filmdata/Found_Missing.txt``, writes the
        unmatched names to ``filmdata/failed_movies_final_push.txt`` through
        ``dt.write1dArray``, and prints the failures.
    """
    with open("filmdata/IMDB_movie_data.txttrimmed") as data_file:
        data_lines = data_file.readlines()

    # (raw line, film name, trailing token holding the year) per data line.
    records = []
    for line in data_lines:
        tokens = re.split(r'\t+', line)[0].split()
        records.append((line, " ".join(tokens[:-1]), tokens[-1]))

    movie_data = []
    failed_movies = []
    with open("filmdata/Found_Missing.txt", "w") as write_line_file:
        for movie_string in movie_strings:
            name = movie_string[:-6]
            year_text = movie_string[-5:].strip()
            wanted = name.upper().strip()
            found = False
            last_name = ""
            for line, movie_name, year_token in records:
                # Lines of one film are contiguous: stop at the first other
                # name once something has matched.
                if found and last_name != movie_name:
                    break
                movie_year = int(re.findall(r'\d+', year_token)[0])
                if similar(wanted, movie_name.upper().strip()) and int(year_text) == movie_year:
                    movie_data.append(line)
                    write_line_file.write(line)
                    found = True
                    last_name = movie_name
            if not found:
                failed_movies.append(name)
                print("FAILED: %s" % (name,))
        print("Total failed %s" % (len(failed_movies),))
    dt.write1dArray(failed_movies, "filmdata/failed_movies_final_push.txt")
    return movie_data
def get_code(self, tree, feature_names, class_names, filename):
    """Convert a fitted decision tree into IF/ELSE rule text on disk.

    Starting at the root, a split node produces
    ``IF ( <feature> <= <threshold> ) {``, its left subtree, ``} ELSE {``,
    its right subtree, and ``}``.  A leaf produces ``return <class>`` with
    ``class_names[0]`` when its first value count is greater than or equal
    to its second.  The lines are saved to
    ``Rules/Statements/<filename>.rules``, beautified with ``jsbeautifier``,
    and the beautified text replaces the file's contents.

    Fixes over the previous version:
      * ``line = "return", name`` created a tuple; leaves now append a
        string.
      * ``i != -2 or i <= 200`` could never be false.  Leaf nodes (feature
        index -2) are now given a ``None`` placeholder rather than the name
        at ``feature_names[-2]``, and the list still has one entry per node.
      * A node counts as a leaf when its feature index is undefined, not
        when its threshold happens to equal -2.
      * The rules file is written inside a ``with`` block.

    Args:
        tree: Fitted estimator exposing ``tree_``.
        feature_names: Name of every feature column.
        class_names: Two class names, indexed as described above.
        filename: Base name of the rules file.
    """
    tree_ = tree.tree_
    left = tree_.children_left
    right = tree_.children_right
    threshold = tree_.threshold
    value = tree_.value

    # Aligned with node indexes; ``None`` marks a leaf.
    features = []
    for feature_index in tree_.feature:
        features.append(None if feature_index == -2 else feature_names[feature_index])

    rules_array = []

    def recurse(node):
        feature = features[node]
        if feature is None:
            counts = value[node][0]
            chosen = class_names[0] if counts[0] >= counts[1] else class_names[1]
            rules_array.append("return " + chosen)
            return
        rules_array.append("IF ( " + feature + " <= " + str(threshold[node]) + " ) {")
        if left[node] != -1:
            recurse(left[node])
        rules_array.append("} ELSE {")
        if right[node] != -1:
            recurse(right[node])
        rules_array.append("}")

    recurse(0)
    rules_path = "Rules/Statements/" + filename + ".rules"
    dt.write1dArray(rules_array, rules_path)
    cleaned = jsbeautifier.beautify_file(rules_path)
    with open(rules_path, "w") as rules_file:
        rules_file.write(cleaned)
def makeConsistentKeywords(file_name, new_file_name):
    """Normalise "<title> (<year>)\\t<keyword>" lines to tab-separated fields.

    Lines containing ``{`` or ``?`` are skipped.  The first tab-separated
    field is split on " (".  If the second piece starts with "1" or "2" it
    is taken as the year; otherwise the first two pieces are concatenated
    into the name and the third piece provides the year.  Punctuation is
    removed from name, year and keyword (Python 2 ``str.translate`` form),
    and all whitespace is removed from the keyword.  Each output line is
    "<name>\\t<year>\\t<keyword>"; the lines are printed and then written to
    ``new_file_name`` through ``dt.write1dArray``.
    """
    rows = []
    with open(file_name) as source:
        for raw in source:
            if "{" in raw or "?" in raw:
                continue
            fields = re.split(r'\t+', raw)
            pieces = fields[0].split(" (")
            if pieces[1].startswith(("1", "2")):
                year = pieces[1][:4]
                name = pieces[0]
            else:
                # The title itself contained " (", so the year is one
                # piece further along.
                year = pieces[2][:4]
                name = pieces[0] + pieces[1]
            name = name.translate(None, string.punctuation)
            year = year.translate(None, string.punctuation)
            keyword = re.sub(r'\s+', '', fields[1]).translate(None, string.punctuation)
            row = "\t".join([name, year, keyword])
            rows.append(row)
            print(row)
    dt.write1dArray(rows, new_file_name)
def getVectorsIO(ordered_IDs, unique_phrases, start=11212, stop=25842):
    """Build per-phrase occurrence vectors by re-reading the token files.

    For each phrase index in ``[start, stop)`` every
    ``filmdata/Tokens/<id>.film`` file is read (its first line is skipped)
    and searched for the phrase.  A count vector and a 0/1 vector, each with
    one column per entry of ``ordered_IDs``, are written per phrase.  Unlike
    ``getVectors`` nothing is cached, trading run time for memory.

    Fixes over the previous version:
      * The token was compared with the integer index ``p`` rather than the
        phrase text, so no entry could ever match.
      * Counts are stored as ``int``, consistent with ``getVectors``.
      * IDs are converted with ``str`` before stripping, as ``getVectors``
        does.
      * Token files are closed through a context manager.
      * The phrase range is a parameter (defaults unchanged) and is clamped
        to the number of phrases.

    Args:
        ordered_IDs: Film IDs; converted to stripped strings in place.
        unique_phrases: Phrases; the processed ones are stripped in place.
        start: First phrase index to process.
        stop: One past the last phrase index to process.

    Returns:
        ``(vectors_maintained, vectors)``: the count matrix and the binary
        matrix, both indexed ``[phrase][film]``.
    """
    film_count = len(ordered_IDs)
    vectors = [[0] * film_count for _ in unique_phrases]
    vectors_maintained = [[0] * film_count for _ in unique_phrases]

    for i in range(film_count):
        ordered_IDs[i] = str(ordered_IDs[i]).strip()

    for p in range(start, min(stop, len(unique_phrases))):
        phrase = unique_phrases[p].strip()
        unique_phrases[p] = phrase
        for i, film_id in enumerate(ordered_IDs):
            if film_id == "-1":
                continue
            with open("filmdata/Tokens/" + film_id + ".film", "r") as token_file:
                token_lines = token_file.readlines()[1:]
            for line in token_lines:
                split_line = line.split()
                if split_line[0] == phrase:
                    vectors_maintained[p][i] = int(split_line[1])
                    vectors[p][i] = 1
        print(phrase)
        dt.write1dArray(vectors_maintained[p], "filmdata/classesPhrases/nonbinary/class-" + phrase)
        dt.write1dArray(vectors[p], "filmdata/classesPhrases/class-" + phrase)
    return vectors_maintained, vectors
def getKNeighbors(vector_path="filmdata/films200.mds/films200.mds", class_path="filmdata/classesGenres/class-All", n_neighbors=1, algorithm="kd_tree", leaf_size=30, training_data=10000, name="normal200"):
    """Score a k-nearest-neighbours classifier on the film vectors.

    Loads vectors and labels, splits them with ``dt.splitData``, and fits a
    ``KNeighborsClassifier`` with the given settings.  It then computes
    macro-averaged F1 and accuracy on the test split, stores both in
    ``KNNScores/<name>.score``, and prints them.
    """
    data = np.asarray(dt.importVectors(vector_path))
    targets = np.asarray(dt.importLabels(class_path))
    split = dt.splitData(training_data, data, targets)
    x_train, y_train, x_test, y_test = split

    classifier_options = {
        "n_neighbors": n_neighbors,
        "algorithm": algorithm,
        "leaf_size": leaf_size,
    }
    model = neighbors.KNeighborsClassifier(**classifier_options)
    model.fit(x_train, y_train.ravel())
    y_pred = model.predict(x_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    dt.write1dArray([f1, accuracy], "KNNScores/" + name + ".score")
    print("F1 %s Accuracy %s" % (f1, accuracy))
def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
    """Correlate direction rankings with PPMI rankings, phrase by phrase.

    Line ``k`` of ``direction_fn`` holds the direction values of phrase
    ``k``.  When ``phrases_to_check_fn`` is given, only phrases whose name
    (without its first six characters) appears in that file are scored:
    ranks are restricted to films with a non-zero PPMI value, and Spearman,
    Kendall and the mean non-zero PPMI are recorded.  When it is empty,
    every phrase is scored with Spearman on the full rankings.

    Fixes over the previous version:
      * ``indexes_to_get is not []`` was always true, so the "score
        everything" branch could never run.
      * ``rhok`` and ``pvaluek`` were never computed, raising ``NameError``
        on the first scored phrase; ``kendalltau`` now provides them.
      * The per-line scan over ``indexes_to_get`` is a set lookup.

    NOTE(review): the original indentation was lost; the ``else`` branch is
    assumed to pair with the "phrases to check" test. Confirm.

    Args:
        direction_fn: File with one whitespace-separated direction per line.
        ppmi_fn: File with the PPMI matrix (transposed after loading).
        phrases_fn: File with the phrase names.
        phrases_to_check_fn: File with the phrases to score, or "" for all.
        fn: Stem of the output files under ``RuleType/``.

    Raises:
        ZeroDivisionError: If a selected phrase has no non-zero PPMI value.
    """
    # Imported locally so the block does not depend on a file-level import.
    from scipy.stats import kendalltau

    ppmi = np.asarray(dt.importLabels(ppmi_fn))
    phrases = dt.importString(phrases_fn)
    check_selected = phrases_to_check_fn != ""
    indexes_to_get = set()
    if check_selected:
        phrases_to_check = set(dt.importString(phrases_to_check_fn))
        for p in range(len(phrases)):
            if phrases[p][6:] in phrases_to_check:
                indexes_to_get.add(p)
    ppmi = ppmi.transpose()
    print("%s %s" % (len(ppmi), len(ppmi[0])))

    scores = []
    pvalues = []
    scores_kendall = []
    pvalues_kendall = []
    averages = []
    with open(direction_fn) as f:
        for counter, line in enumerate(f):
            if check_selected and counter not in indexes_to_get:
                continue
            direction = [float(d) for d in line.split()]
            direction_rank = np.argsort(direction)
            ppmi_rank = np.argsort(ppmi[counter])
            if check_selected:
                total = 0
                amt = 0
                new_direction = []
                new_ppmi = []
                # Only films with a non-zero PPMI value take part.
                for d in range(len(ppmi[counter])):
                    if ppmi[counter][d] != 0:
                        total += ppmi[counter][d]
                        amt += 1
                        new_direction.append(direction_rank[d])
                        new_ppmi.append(ppmi_rank[d])
                average = total / amt
                rho, pvalue = spearmanr(new_ppmi, new_direction)
                rhok, pvaluek = kendalltau(new_ppmi, new_direction)
                scores.append(rho)
                pvalues.append(pvalue)
                scores_kendall.append(rhok)
                pvalues_kendall.append(pvaluek)
                averages.append(average)
                print("%s %s %s %s" % (phrases[counter] + ":", rho, pvalue, average))
            else:
                rho, pvalue = spearmanr(direction_rank, ppmi_rank)
                scores.append(rho)
                pvalues.append(pvalue)
                print("%s %s %s" % (phrases[counter] + ":", rho, pvalue))
    dt.write1dArray(scores, "RuleType/s" + fn + ".score")
    dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
    dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
    dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
    dt.write1dArray(phrases, "RuleType/" + fn + ".names")
    dt.write1dArray(averages, "RuleType/" + fn + ".averages")
# Disabled one-off step:
# reMapPPMI(dt.importString("filmdata/filmIds.txt"), dt.getFns("filmdata/vectors/tokens/"))

# File stems used by the disabled nearest-neighbour experiment below.
filenames = [
    "AUTOENCODER0.5tanhtanhmse15tanh[1000]4SDA1",
    "AUTOENCODER0.5tanhtanhmse60tanh[200]4SDA2",
    "AUTOENCODER0.5tanhtanhmse30tanh[1000]4SDA3",
    "AUTOENCODER0.5tanhtanhmse60tanh[200]4SDA4",
]

# Disabled experiment: nearest films to film 155 in every space listed above.
# path = "newdata/spaces/"
# id = 155
# for f in filenames:
#     movie_vectors = dt.getMovieVectors(input_size=200, vector_path=path+f+".mds")
#     nearest_movies, nearest_distances = getKNearestMovies(movie_vectors, movie_vectors[id], 30)
#     dt.write1dArray(nearest_movies, "KDNearest/" + f + str(id)+".knmovies")
#     dt.write1dArray(nearest_distances, "KDNearest/" + f + str(id)+".kndistances")

# Disabled one-off steps: normalise the matched films and re-match the missing ones.
# makeConsistent("filmdata/KeywordData/Matched_Films.txt", "filmdata/KeywordData/Matched_Films_Normalised.txt")
# getMatchedLines("filmdata/KeywordData/All_Films_Norm_Spaces.txt", dt.importString("filmdata/KeywordData/Missing_Films_Normalised.txt"))

# Disabled one-off step: list the films missing from the matched keyword data.
# movie_strings = dt.importString("filmdata/filmNames.txt")
# missing_items = getMissing("filmdata/IMDB Keywords Movie Data/Matched_Films.txt", movie_strings)
# dt.write1dArray(missing_items, "filmdata/missing_films.txt")

# outputPhrases()
# outputKeywords()
def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
    """Score how well each direction ranks films compared with PPMI.

    Line ``k`` of ``direction_fn`` holds the direction values of phrase
    ``k``.  When ``phrases_to_check_fn`` is given, only phrases whose name
    (without its first six characters) appears in that file are scored;
    otherwise every phrase is.  For each scored phrase the ranks of the
    films with a non-zero PPMI value are compared with Spearman and
    Kendall, and Gini and NDCG scores are computed from min-max scaled
    values.  The MAP score is currently fixed at 0.

    Changes over the previous version: the local that shadowed the builtin
    ``map`` is renamed, the never-used accumulators (``agini1``,
    ``angini1``, ``angini``) are removed, and the per-line scan over the
    selected indexes is a set lookup.

    Args:
        direction_fn: File with one whitespace-separated direction per line.
        ppmi_fn: File with the PPMI matrix (transposed after loading).
        phrases_fn: File with the phrase names.
        phrases_to_check_fn: File with the phrases to score, or "" for all.
        fn: Stem of the output files under ``RuleType/``.

    Raises:
        ZeroDivisionError: If a scored phrase has no non-zero PPMI value.
    """
    ppmi = np.asarray(dt.importLabels(ppmi_fn))
    phrases = dt.importString(phrases_fn)
    check_selected = phrases_to_check_fn != ""
    indexes_to_get = set()
    if check_selected:
        phrases_to_check = set(dt.importString(phrases_to_check_fn))
        for p in range(len(phrases)):
            if phrases[p][6:] in phrases_to_check:
                indexes_to_get.add(p)
    ppmi = ppmi.transpose()
    print("%s %s" % (len(ppmi), len(ppmi[0])))

    scores = []
    pvalues = []
    scores_kendall = []
    pvalues_kendall = []
    agini = []
    amap = []
    andcg = []
    averages = []
    with open(direction_fn) as f:
        for counter, line in enumerate(f):
            if check_selected and counter not in indexes_to_get:
                continue
            direction = [float(d) for d in line.split()]
            direction_rank = np.argsort(direction)
            ppmi_rank = np.argsort(ppmi[counter])

            total = 0
            amt = 0
            new_direction = []
            new_ppmi = []
            # Only films with a non-zero PPMI value take part.
            for d in range(len(ppmi[counter])):
                if ppmi[counter][d] != 0:
                    total += ppmi[counter][d]
                    amt += 1
                    new_direction.append(direction_rank[d])
                    new_ppmi.append(ppmi_rank[d])
            average = total / amt

            min_max_scaler = preprocessing.MinMaxScaler()
            normalized_ppmi = min_max_scaler.fit_transform(ppmi[counter])
            normalized_dir = min_max_scaler.fit_transform(direction)
            ginis = gini(normalized_ppmi, normalized_dir)
            ranked_ppmi = dt.sortByArray(new_ppmi, new_direction)
            nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi)
            ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi))
            # Average precision is disabled; a constant keeps the output
            # file aligned with the others.
            map_score = 0

            rho, pvalue = spearmanr(new_ppmi, new_direction)
            rhok, pvaluek = kendalltau(new_ppmi, new_direction)
            scores.append(rho)
            pvalues.append(pvalue)
            scores_kendall.append(rhok)
            pvalues_kendall.append(pvaluek)
            andcg.append(ndcgs)
            agini.append(ginis)
            amap.append(map_score)
            averages.append(average)
            print("%s %s %s" % (phrases[counter] + ":", map_score, ginis))
    dt.write1dArray(scores, "RuleType/s" + fn + ".score")
    dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
    dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
    dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
    dt.write1dArray(phrases, "RuleType/" + fn + ".names")
    dt.write1dArray(averages, "RuleType/" + fn + ".averages")
    dt.write1dArray(agini, "RuleType/gn" + fn + ".score")
    dt.write1dArray(andcg, "RuleType/ndcg" + fn + ".score")
    dt.write1dArray(amap, "RuleType/map" + fn + ".score")
def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
    """Correlate each direction with its phrase's PPMI values and save the scores.

    Line ``i`` of ``direction_fn`` is paired with row ``i`` of the transposed
    PPMI matrix.

    * When at least one phrase from ``phrases_to_check_fn`` is found in
      ``phrases_fn``, only those lines are scored: Spearman and Kendall
      correlations are computed over the positions whose PPMI value is
      non-zero, together with the mean of those non-zero values.
    * Otherwise every line is scored with a Spearman correlation over the
      full argsort orderings; the Kendall and average outputs stay empty.

    Args:
        direction_fn: Path of a text file with one direction per line, given
            as whitespace-separated floats.
        ppmi_fn: Handed to ``dt.importLabels``; the loaded matrix is
            transposed before use.
        phrases_fn: Handed to ``dt.importString``. The first six characters
            of each entry are ignored when matching against the check list.
        phrases_to_check_fn: ``""`` for no restriction; otherwise handed to
            ``dt.importString`` to obtain the phrases to score.
        fn: Name fragment inserted into every output path.

    Raises:
        ZeroDivisionError: If a selected PPMI row has no non-zero entry.
    """
    ppmi = np.asarray(dt.importLabels(ppmi_fn))
    phrases = dt.importString(phrases_fn)

    indexes_to_get = set()
    if phrases_to_check_fn != "":
        wanted = set(dt.importString(phrases_to_check_fn))
        # phrase[6:] drops a six-character prefix before comparing
        # (presumably a tag such as "class-" -- TODO confirm).
        indexes_to_get = set(p for p, phrase in enumerate(phrases)
                             if phrase[6:] in wanted)

    ppmi = ppmi.transpose()
    # Single pre-formatted argument: same output as the Python 2 print
    # statement `print a, b`, and still valid syntax on Python 3.
    print("%s %s" % (len(ppmi), len(ppmi[0])))

    scores = []
    pvalues = []
    scores_kendall = []
    pvalues_kendall = []
    averages = []

    # Truthiness, not `is not []`: an identity test against a fresh list
    # literal is always True, which made the score-everything branch
    # unreachable.
    only_selected = bool(indexes_to_get)

    with open(direction_fn) as f:
        for counter, line in enumerate(f):
            if only_selected:
                if counter not in indexes_to_get:
                    continue

                direction = [float(value) for value in line.split()]
                direction_rank = np.argsort(direction)
                row = ppmi[counter]
                ppmi_rank = np.argsort(row)

                # Keep only the positions where the PPMI value is non-zero
                # and accumulate their mean at the same time.
                total = 0
                amt = 0
                new_direction = []
                new_ppmi = []
                for d, value in enumerate(row):
                    if value != 0:
                        total += value
                        amt += 1
                        new_direction.append(direction_rank[d])
                        new_ppmi.append(ppmi_rank[d])
                average = total / amt

                rho, pvalue = spearmanr(new_ppmi, new_direction)
                # The Kendall values were appended without ever being
                # computed, which raised NameError on the first selected row.
                rhok, pvaluek = kendalltau(new_ppmi, new_direction)

                scores.append(rho)
                pvalues.append(pvalue)
                scores_kendall.append(rhok)
                pvalues_kendall.append(pvaluek)
                averages.append(average)
                print("%s %s %s %s" % (phrases[counter] + ":", rho, pvalue, average))
            else:
                direction = [float(value) for value in line.split()]
                direction_rank = np.argsort(direction)
                ppmi_rank = np.argsort(ppmi[counter])
                rho, pvalue = spearmanr(direction_rank, ppmi_rank)
                scores.append(rho)
                pvalues.append(pvalue)
                print("%s %s %s" % (phrases[counter] + ":", rho, pvalue))

    dt.write1dArray(scores, "RuleType/s" + fn + ".score")
    dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
    dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
    dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
    # NOTE(review): every phrase is written here, even when only a subset was
    # scored, so this file may not line up row-for-row with the score files.
    dt.write1dArray(phrases, "RuleType/" + fn + ".names")
    dt.write1dArray(averages, "RuleType/" + fn + ".averages")
def removeGaps(file_name, new_file_name):
    """Copy ``file_name`` to ``new_file_name`` with every line stripped.

    Leading and trailing whitespace (including the newline) is removed from
    each line. Lines that are blank after stripping are kept as empty
    strings; no line is dropped. The result is handed to ``dt.write1dArray``.

    Args:
        file_name: Path of the text file to read.
        new_file_name: Destination passed to ``dt.write1dArray``.
    """
    with open(file_name) as source:
        stripped_lines = [raw_line.strip() for raw_line in source]
    dt.write1dArray(stripped_lines, new_file_name)
def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn, fn):
    """Score each direction against its phrase's PPMI values and save the results.

    Line ``i`` of ``direction_fn`` is paired with row ``i`` of the transposed
    PPMI matrix. For every scored line the Spearman and Kendall correlations
    (restricted to the entries whose PPMI value is non-zero), a gini score, an
    NDCG score and the mean non-zero PPMI value are collected and written
    under ``RuleType/``.

    Args:
        direction_fn: Path of a text file with one direction per line, given
            as whitespace-separated floats.
        ppmi_fn: Handed to ``dt.importLabels``; the loaded matrix is
            transposed before use.
        phrases_fn: Handed to ``dt.importString``. The first six characters
            of each entry are ignored when matching against the check list.
        phrases_to_check_fn: ``""`` to score every line; otherwise handed to
            ``dt.importString`` and only lines whose phrase appears in that
            list are scored.
        fn: Name fragment inserted into every output path.

    Raises:
        ZeroDivisionError: If a scored PPMI row has no non-zero entry.
    """
    ppmi = np.asarray(dt.importLabels(ppmi_fn))
    phrases = dt.importString(phrases_fn)

    # An empty file name means "score everything"; otherwise only the line
    # numbers collected in `rows_to_score` are scored.
    check_subset = phrases_to_check_fn != ""
    rows_to_score = set()
    if check_subset:
        phrases_to_check = set(dt.importString(phrases_to_check_fn))
        # phrase[6:] drops a six-character prefix before comparing
        # (presumably a tag such as "class-" -- TODO confirm).
        rows_to_score = set(index for index, phrase in enumerate(phrases)
                            if phrase[6:] in phrases_to_check)

    ppmi = ppmi.transpose()
    # Single pre-formatted argument: same output as the Python 2 print
    # statement `print a, b`, and still valid syntax on Python 3.
    print("%s %s" % (len(ppmi), len(ppmi[0])))

    scores = []
    pvalues = []
    scores_kendall = []
    pvalues_kendall = []
    agini = []
    amap = []
    andcg = []
    averages = []

    with open(direction_fn) as f:
        for counter, line in enumerate(f):
            if check_subset and counter not in rows_to_score:
                continue

            direction = [float(token) for token in line.split()]
            direction_rank = np.argsort(direction)
            ppmi_row = ppmi[counter]
            ppmi_rank = np.argsort(ppmi_row)

            # Keep only the positions where the PPMI value is non-zero and
            # accumulate their mean at the same time.
            total = 0
            amt = 0
            new_direction = []
            new_ppmi = []
            for d, ppmi_value in enumerate(ppmi_row):
                if ppmi_value != 0:
                    total += ppmi_value
                    amt += 1
                    new_direction.append(direction_rank[d])
                    new_ppmi.append(ppmi_rank[d])
            average = total / amt

            # NOTE(review): the scaler is fed 1-D inputs; confirm the
            # installed scikit-learn version accepts that.
            min_max_scaler = preprocessing.MinMaxScaler()
            normalized_ppmi = min_max_scaler.fit_transform(ppmi_row)
            normalized_dir = min_max_scaler.fit_transform(direction)
            ginis = gini(normalized_ppmi, normalized_dir)

            ranked_ppmi = dt.sortByArray(new_ppmi, new_direction)
            nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi)
            ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi))

            # Mean average precision is disabled; a constant 0 is recorded so
            # the map output keeps one entry per scored line.
            map_score = 0

            rho, pvalue = spearmanr(new_ppmi, new_direction)
            rhok, pvaluek = kendalltau(new_ppmi, new_direction)

            scores.append(rho)
            pvalues.append(pvalue)
            scores_kendall.append(rhok)
            pvalues_kendall.append(pvaluek)
            andcg.append(ndcgs)
            agini.append(ginis)
            amap.append(map_score)
            averages.append(average)
            print("%s %s %s" % (phrases[counter] + ":", map_score, ginis))

    dt.write1dArray(scores, "RuleType/s" + fn + ".score")
    dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
    dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
    dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
    # NOTE(review): every phrase is written here, even when only a subset was
    # scored, so this file may not line up row-for-row with the score files.
    dt.write1dArray(phrases, "RuleType/" + fn + ".names")
    dt.write1dArray(averages, "RuleType/" + fn + ".averages")
    dt.write1dArray(agini, "RuleType/gn" + fn + ".score")
    dt.write1dArray(andcg, "RuleType/ndcg" + fn + ".score")
    dt.write1dArray(amap, "RuleType/map" + fn + ".score")