from collections import defaultdict
import pickle

import numpy as np
import scipy.sparse as sp

import data as dt  # the project's data-utility helper module; exact import path assumed


def match_entities(entity_fn, t_entity_fn, entities_fn, classification):
    """Write the rows of entities_fn whose names also appear in t_entity_fn."""
    names = dt.import1dArray(entity_fn)
    t_names = dt.import1dArray(t_entity_fn)
    entities = dt.import2dArray(entities_fn)
    amount_found = 0
    # Normalize both name lists so punctuation differences don't block matches
    for n in range(len(names)):
        names[n] = dt.removeEverythingFromString(names[n])
    for n in range(len(t_names)):
        t_names[n] = dt.removeEverythingFromString(t_names[n])
    matched_ids = []
    for n in range(len(t_names)):
        for ni in range(len(names)):
            if t_names[n] == names[ni]:
                print(t_names[n])
                matched_ids.append(ni)
                amount_found += 1
                break
    matched_entities = []
    for e in matched_ids:
        matched_entities.append(entities[e])
    print("Amount found", amount_found)
    dt.write2dArray(matched_entities, entities_fn[:-4] + "-" + classification + ".txt")
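
# Illustrative usage sketch for match_entities. The paths below are
# hypothetical placeholders for the project's layout, not confirmed files:
#
#   match_entities("../data/movies/entity_names.txt",
#                  "../data/movies/classify/genres/available_entities.txt",
#                  "../data/movies/bow/frequency/phrases/class-all.txt",
#                  "genres")
#
# This would keep only the rows of class-all.txt whose entity names appear in
# the genre-classified subset, writing them to "...class-all-genres.txt".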
def getTop250Movies(entity_names):
    """Find which entities appear in the IMDb top 250 and save their indexes."""
    # Lines 28..277 of the raw ratings list contain the top-250 table
    imdb = dt.import1dArray("../data/raw/imdb/ratings/ratings.list")[28:278]
    for e in range(len(entity_names)):
        # Drop the trailing year token, then normalize (mutates the caller's list)
        entity_names[e] = "".join(entity_names[e].split()[:-1])
        entity_names[e] = dt.removeEverythingFromString(entity_names[e])
    top_en = []
    for line in imdb:
        # Each entry looks like "<rank>. <title> (<year>)"; keep just the title
        name = line.split(".")[1][1:]
        name = " ".join(name.split()[:-1])
        top_en.append(dt.removeEverythingFromString(name))
    matched_index = []
    for e in range(len(entity_names)):
        for x in range(len(top_en)):
            if entity_names[e] == top_en[x]:
                matched_index.append(e)
                print(entity_names[e])
                break
    dt.write1dArray(matched_index, "../data/movies/top_imdb_indexes.txt")
def writeClassesFromNames(folder_name, file_names, output_folder):
    """Build binary class vectors from per-class name lists stored in a folder."""
    names = dt.getFolder(folder_name)  # one list of member names per class
    all_names = defaultdict(int)
    entity_names = dt.import1dArray(file_names)
    # Normalize every class-member name and count how often each appears
    for t in range(len(names)):
        for n in range(len(names[t])):
            names[t][n] = dt.removeEverythingFromString(names[t][n])
            all_names[names[t][n]] += 1
    available_class_names = []
    available_indexes = []
    # Keep only the entities that appear in at least one class list
    for n in range(len(entity_names)):
        original_name = entity_names[n]
        name = dt.removeEverythingFromString(original_name)
        if all_names[name] > 0:
            available_class_names.append(original_name)
            available_indexes.append(n)
            print(name, "exists")
        else:
            print(name, "FAIL")
    dt.write1dArray(available_indexes, output_folder + "available_indexes.txt")
    dt.write1dArray(available_class_names, output_folder + "available_entities.txt")
    print("Wrote available indexes and entities")
    class_all = []
    for c in range(len(names)):
        binary_class = []
        for n in range(len(available_class_names)):
            available_class_names[n] = dt.removeEverythingFromString(available_class_names[n])
            if available_class_names[n] in names[c]:
                binary_class.append(1)
            else:
                binary_class.append(0)
        dt.write1dArray(binary_class, output_folder + "class-" + str(c))
        class_all.append(binary_class)
    dt.write2dArray(np.asarray(class_all).transpose(), output_folder + "class-all")
    print("Wrote class-all")
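
# Illustrative usage sketch for writeClassesFromNames. The folder and file
# names are hypothetical placeholders: the folder is expected to hold one name
# list per class, and file_names points at the master entity-name list.
#
#   writeClassesFromNames("../data/raw/imdb/genres/",
#                         "../data/movies/entity_names.txt",
#                         "../data/movies/classify/genres/")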
def importCertificates(cert_fn, entity_name_fn):
    """Match IMDb certificate entries against the entity list and write binary
    age-rating classes (UK certificates; the USA variants are kept commented)."""
    all_lines = dt.import1dArray(cert_fn)[14:]  # skip the list header
    en = dt.import1dArray(entity_name_fn)
    original_en = dt.import1dArray(entity_name_fn)
    en_name = []
    en_year = []
    # Split each "<title> <year>" entry into a normalized name and a year
    for e in range(len(en)):
        split = en[e].split()
        en_year.append(split[-1])
        en_name.append(dt.removeEverythingFromString("".join(split[:-1])))
    # Initialize the ratings dict (swap in the USA block to use US certificates)
    """
    ratings = {
        "USA:G": [],
        "USA:PG": [],
        "USA:PG-13": [],
        "USA:R": []
    }
    """
    ratings = {
        "UK:PG": [],
        "UK:12": [],
        "UK:12A": [],
        "UK:15": [],
        "UK:18": [],
    }
    all_ratings = defaultdict(list)
    recently_found_name = ""
    recently_found_year = ""
    recently_found_found = False
    counter = 0
    temp_fn = "../data/temp/uk_cert_dict.pickle"
    if dt.fileExists(temp_fn) is False:
        for line in all_lines:
            counter += 1
            if counter % 1000 == 0:
                print(counter)
            line = line.split("\t")
            split_ny = line[0].split("{")[0]
            split_ny = split_ny.split()
            # Remove any leftover episode markers
            for i in range(len(split_ny) - 1, -1, -1):
                if "{" in split_ny[i]:
                    del split_ny[i]
            entity_year_bracketed = split_ny[-1]
            # Video, TV and video-game entries carry an extra tag after the year
            if "(V)" in entity_year_bracketed or "(TV)" in entity_year_bracketed \
                    or "(VG)" in entity_year_bracketed:
                entity_year_bracketed = split_ny[-2]
            try:
                entity_year = dt.keepNumbers(entity_year_bracketed)[0]
                entity_name = dt.removeEverythingFromString("".join(split_ny[:-1]))
                found = False
                # Consecutive lines often repeat the same title (one line per
                # certificate); reuse the previous lookup instead of rescanning
                if recently_found_name == entity_name and recently_found_year == entity_year:
                    found = recently_found_found
                else:
                    for n in range(len(en_name)):
                        if entity_name == en_name[n] and entity_year == en_year[n]:
                            print("found", entity_name, entity_year)
                            found = True
                            break
                if found:
                    # The certificate is the last tab field unless it carries a note
                    if "(" not in line[-1]:
                        entity_rating = line[-1]
                    else:
                        entity_rating = line[-2]
                    all_ratings[entity_rating].append(entity_name)
                    if entity_rating in ratings:
                        ratings[entity_rating].append(entity_name)
                        print("rating correct", entity_name, entity_year, entity_rating)
            except IndexError:
                print("IndexError")
                print(line)
                print(split_ny)
                print(entity_year_bracketed)
                continue
            recently_found_name = entity_name
            recently_found_year = entity_year
            recently_found_found = found
        # Serialize the results so re-runs can skip the expensive scan
        with open(temp_fn, 'wb') as handle:
            pickle.dump(ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open("../data/temp/uk_cert_dict_all.pickle", 'wb') as handle:
            pickle.dump(all_ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Deserialize
    with open(temp_fn, 'rb') as handle:
        ratings = pickle.load(handle)
    if dt.fileExists("../data/temp/uk_cert_dict_all.pickle"):
        with open("../data/temp/uk_cert_dict_all.pickle", 'rb') as handle:
            all_ratings = pickle.load(handle)
    top_size = 0
    for key, value in all_ratings.items():
        top_size += len(value)
    print(top_size)
    top_size = 0
    new_ratings = defaultdict(list)
    real_name_dict_fn = "../data/temp/uk_real_name_dict.dict"
    if dt.fileExists(real_name_dict_fn) is False:
        # Map the normalized names back to the original entity names
        for key, value in all_ratings.items():
            for r in ratings:
                if r == key:
                    top_size += len(value)
                    for v in range(len(value)):
                        found = False
                        for n in range(len(en_name)):
                            if value[v] == en_name[n]:
                                found = True
                                value[v] = original_en[n]
                                break
                        if found:
                            new_ratings[key].append(value[v])
                    break
        with open(real_name_dict_fn, 'wb') as handle:
            pickle.dump(new_ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(real_name_dict_fn, 'rb') as handle:
            new_ratings = pickle.load(handle)
    # Set up the final class dict, merging 12 and 12A into one class
    """
    final_dict = {
        "USA-G": [],
        "USA-PG-PG13": [],
        "USA-R": [],
    }
    """
    final_dict = {
        "UK-PG": [],
        "UK-12-12A": [],
        "UK-15": [],
        "UK-18": []
    }
    final_dict["UK-PG"].extend(all_ratings["UK:PG"])
    final_dict["UK-12-12A"].extend(all_ratings["UK:12"])
    final_dict["UK-12-12A"].extend(all_ratings["UK:12A"])
    final_dict["UK-15"].extend(all_ratings["UK:15"])
    final_dict["UK-18"].extend(all_ratings["UK:18"])
    """
    final_dict["USA-G"].extend(all_ratings["USA:G"])
    final_dict["USA-PG-PG13"].extend(all_ratings["USA:PG"])
    final_dict["USA-PG-PG13"].extend(all_ratings["USA:PG13"])
    final_dict["USA-R"].extend(all_ratings["USA:R"])
    """
    # The same classes again, but filled with the original (display) names
    """
    final_name_dict = {
        "USA-G": [],
        "USA-PG-PG13": [],
        "USA-R": [],
    }
    """
    final_name_dict = {
        "UK-PG": [],
        "UK-12-12A": [],
        "UK-15": [],
        "UK-18": [],
    }
    final_name_dict["UK-PG"].extend(new_ratings["UK:PG"])
    final_name_dict["UK-12-12A"].extend(new_ratings["UK:12"])
    final_name_dict["UK-12-12A"].extend(new_ratings["UK:12A"])
    final_name_dict["UK-15"].extend(new_ratings["UK:15"])
    final_name_dict["UK-18"].extend(new_ratings["UK:18"])
    """
    final_name_dict["USA-G"].extend(new_ratings["USA:G"])
    final_name_dict["USA-PG-PG13"].extend(new_ratings["USA:PG"])
    final_name_dict["USA-PG-PG13"].extend(new_ratings["USA:PG13"])
    final_name_dict["USA-R"].extend(new_ratings["USA:R"])
    """
    # Create a unique list of the entities found
    entities_found = []
    for key, items in new_ratings.items():
        for i in items:
            entities_found.append(i)
    entities_found = np.unique(entities_found)
    print(len(entities_found))
    # Re-normalize the original names so they can be compared with the
    # normalized names stored in final_dict
    jacked_up_entities_found = []
    for n in entities_found:
        new_n = n.split()[:-1]
        jacked_up_entities_found.append(dt.removeEverythingFromString(" ".join(new_n)))
    # One binary row per rating class
    classes = [[0] * len(entities_found) for _ in range(len(final_dict))]
    counter = 0
    class_names = []
    for key, items in final_dict.items():
        for i in items:
            for e in range(len(jacked_up_entities_found)):
                if i == jacked_up_entities_found[e]:
                    classes[counter][e] = 1
        class_names.append(key)
        counter += 1
    classes = np.asarray(classes).transpose()
    # Drop entities that ended up in no class at all
    indexes_to_delete = []
    for c in range(len(classes)):
        found = False
        for i in classes[c]:
            if i == 1:
                found = True
                break
        if not found:
            indexes_to_delete.append(c)
    classes = np.delete(classes, indexes_to_delete, axis=0)
    entities_found = np.delete(entities_found, indexes_to_delete)
    classes = classes.transpose()
    for c in range(len(classes)):
        dt.write1dArray(classes[c], "../data/movies/classify/uk-ratings/class-" + class_names[c])
    classes = classes.transpose()
    dt.write2dArray(classes, "../data/movies/classify/uk-ratings/class-all")
    dt.write1dArray(entities_found, "../data/movies/classify/uk-ratings/available_entities.txt")
    dt.write1dArray(class_names, "../data/movies/classify/uk-ratings/names.txt")
    print("Wrote UK ratings classes")
def getVectors(input_folder, file_names_fn, extension, output_folder, only_words_in_x_entities,
               words_without_x_entities, cut_first_line=False, get_all=False, additional_name="",
               make_individual=True, classification="", use_all_files="", minimum_words=0,
               data_type="", sparse_matrix=False, word_count_amt=0):
    """Build frequency and binary bag-of-words matrices from per-entity files."""
    # Treat an empty value the same as None: fall back to the names file
    if not use_all_files:
        file_names = dt.import1dArray(file_names_fn)
    else:
        file_names = dt.getFns(use_all_files)
    phrase_dict = defaultdict(int)
    failed_indexes = []
    failed_filenames = []
    working_filenames = []
    # First, get all possible phrase names and build a document-frequency dict from the files
    for f in range(len(file_names)):
        try:
            full_name = input_folder + file_names[f] + "." + extension
            phrase_list = dt.import2dArray(full_name, "s")
            if cut_first_line:
                phrase_list = phrase_list[1:]
            word_count = 0
            for p in phrase_list:
                word_count += int(p[1])
            if word_count > word_count_amt:
                for p in phrase_list:
                    if p[0] != "all":
                        phrase_dict[p[0]] += 1
                    else:
                        print("found class all")
                working_filenames.append(file_names[f])
            else:
                print("Failed, word count below threshold", file_names[f], f, word_count)
                failed_filenames.append(file_names[f])
                failed_indexes.append(f)
        except FileNotFoundError:
            print("Failed to find", file_names[f], f)
            failed_filenames.append(file_names[f])
            failed_indexes.append(f)
    print(failed_indexes)
    print(failed_filenames)
    phrase_sets = []
    # Convert to array so we can sort it
    phrase_list = []
    entity_names = dt.import1dArray(file_names_fn)
    matching_filenames = []
    failed_fns = []
    if data_type == "wines":
        # Wine files are sometimes prefixed with "zz"; strip it before matching
        for e in entity_names:
            found = False
            for f in working_filenames:
                if "zz" in f:
                    new_f = f[2:]
                else:
                    new_f = f
                if dt.removeEverythingFromString(e) == dt.removeEverythingFromString(new_f):
                    matching_filenames.append(f)
                    found = True
                    break
            if not found:
                failed_fns.append(e)
        working_filenames = np.unique(np.asarray(matching_filenames))
        test_dupes = np.unique(np.asarray(working_filenames))
        print(len(test_dupes))
    # Keep only phrases that occur in at least only_words_in_x_entities entities
    for key, value in phrase_dict.items():
        if value >= only_words_in_x_entities:
            phrase_list.append(key)
    all_phrases = []
    for key, value in phrase_dict.items():
        all_phrases.append(key)
    phrase_sets.append(phrase_list)
    counter = 0
    for phrase_list in phrase_sets:
        if not get_all and counter > 0:
            break
        all_phrase_fn = output_folder + "frequency/phrases/class-all-" + str(only_words_in_x_entities) + \
                        "-" + str(words_without_x_entities) + "-" + classification
        phrase_name_fn = output_folder + "names/" + str(only_words_in_x_entities) + "-" + \
                         str(words_without_x_entities) + "-" + classification + ".txt"
        phrase_list = sorted(phrase_list)
        print("Found", len(phrase_list), "phrases")
        print(phrase_list[:20])
        print("Failed", len(failed_filenames), "files")
        print(failed_filenames[:20])
        # Create a dictionary to obtain the index of a phrase that's being checked
        phrase_index_dict = defaultdict()
        for p in range(len(phrase_list)):
            phrase_index_dict[phrase_list[p]] = p
        # Create an empty 2d array to store a matrix of entities and phrases
        all_phrases_complete = []
        for f in working_filenames:
            all_phrases_complete.append([0] * len(phrase_list))
        all_phrases_complete = np.asarray(all_phrases_complete)
        print("Each entity is length", len(all_phrases_complete[0]))
        print("The overall matrix is", len(all_phrases_complete))
        if sparse_matrix:
            all_phrases_complete = sp.csr_matrix(all_phrases_complete)
        # Then, populate the overall bag of words for each entity
        # (all other phrases are already set to 0)
        completed_index = []
        if data_type == "wines":
            print("wines")
            """
            merge_indexes = []
            for f in range(len(working_filenames)):
                print(working_filenames[f])
                for i in range(len(working_filenames)):
                    if i == f:
                        continue
                    for ci in completed_index:
                        if i == ci:
                            continue
                    if "~" in working_filenames[i]:
                        if working_filenames[f] == working_filenames[i][:-1] or \
                                working_filenames[f] == working_filenames[i][2:-1]:
                            completed_index.append(i)
                            merge_indexes.append((f, i))
            """
        for f in range(len(working_filenames)):
            n_phrase_list = dt.import2dArray(input_folder + working_filenames[f] + "." + extension, "s")
            if cut_first_line:
                n_phrase_list = n_phrase_list[1:]
            for p in n_phrase_list:
                phrase = p[0]
                try:
                    phrase_index = phrase_index_dict[phrase]
                    if not sparse_matrix:
                        all_phrases_complete[f][phrase_index] = int(p[1])
                    else:
                        all_phrases_complete[f, phrase_index] = int(p[1])
                except KeyError:
                    # Phrase was filtered out of the vocabulary; skip it
                    continue
        """
        cols_to_delete = []
        if data_type == "wines":
            for mt in merge_indexes:
                for v in range(len(all_phrases_complete)):
                    all_phrases_complete[v][mt[0]] += all_phrases_complete[v][mt[1]]
                cols_to_delete.append(mt[1])
            all_phrases_complete = np.delete(all_phrases_complete, cols_to_delete, 1)
            working_filenames = np.delete(working_filenames, cols_to_delete)
        """
        # Trim the matrix to the entities included in the classification
        if classification not in ("all", "mixed", "genres", "ratings", "types"):
            classification_entities = dt.import1dArray(
                "../data/" + data_type + "/classify/" + classification + "/available_entities.txt")
            all_phrases_complete = dt.match_entities(all_phrases_complete, classification_entities, file_names)
        elif classification == "all":
            print("Writing the full entity list for the 'all' classification")
            dt.write1dArray(working_filenames,
                            "../data/" + data_type + "/classify/" + classification + "/available_entities.txt")
        if not sparse_matrix:
            all_phrases_complete = np.asarray(all_phrases_complete).transpose()
        else:
            all_phrases_complete = all_phrases_complete.transpose()
        indexes_to_delete = []
        if sparse_matrix:
            # Debug print of all nonzero entries of the sparse matrix
            cx = sp.coo_matrix(all_phrases_complete)
            for i, j, v in zip(cx.row, cx.col, cx.data):
                print("(%d, %d), %s" % (i, j, v))
            # The filtering below assumes a dense array
            all_phrases_complete = all_phrases_complete.toarray()
        # Delete words that appear in almost every entity, i.e. words missing
        # from fewer than words_without_x_entities entities
        for a in range(len(all_phrases_complete)):
            if np.count_nonzero(all_phrases_complete[a]) > len(all_phrases_complete[a]) - words_without_x_entities:
                print("Recorded a word " + str(phrase_list[a]) + " with too little difference")
                indexes_to_delete.append(a)
        indexes_to_delete.sort()
        indexes_to_delete.reverse()
        for i in indexes_to_delete:
            all_phrases_complete = np.delete(all_phrases_complete, i, 0)
            print("Deleted a word " + str(phrase_list[i]) + " with too little difference")
            phrase_list = np.delete(phrase_list, i, 0)
        dt.write1dArray(phrase_list, phrase_name_fn)
        if make_individual:
            for p in range(len(all_phrases_complete)):
                dt.write1dArray(all_phrases_complete[p],
                                output_folder + "frequency/phrases/class-" + phrase_list[p] + "-" +
                                str(only_words_in_x_entities) + "-" + str(words_without_x_entities) +
                                "-" + classification)
        dt.write2dArray(all_phrases_complete, all_phrase_fn)
        print("Created class-all")
        # Binarize: any nonzero frequency becomes 1
        all_phrases_complete = np.asarray(all_phrases_complete).transpose()
        for a in range(len(all_phrases_complete)):
            for v in range(len(all_phrases_complete[a])):
                if all_phrases_complete[a][v] > 1:
                    all_phrases_complete[a][v] = 1
        all_phrases_complete = np.asarray(all_phrases_complete).transpose()
        if make_individual:
            for p in range(len(all_phrases_complete)):
                dt.write1dArray(all_phrases_complete[p],
                                output_folder + "binary/phrases/class-" + phrase_list[p] + "-" +
                                str(only_words_in_x_entities) + "-" + str(words_without_x_entities) +
                                "-" + classification)
        all_phrase_fn = output_folder + "binary/phrases/class-all-" + str(only_words_in_x_entities) + \
                        "-" + str(words_without_x_entities) + "-" + classification
        dt.write2dArray(all_phrases_complete, all_phrase_fn)
        print("Created class-all binary")
        counter += 1
# Module-level fragment: merge the UK and US entity lists and recover the
# original formatting from the master list (uk_entities, us_entities and
# all_entities are assumed to be defined earlier in the script).
uk_us_ents = []
for e in uk_entities:
    uk_us_ents.append(e)
for e in us_entities:
    uk_us_ents.append(e)
entities_unique = np.unique(uk_us_ents)
correct_format = []
removed_punct = []
for j in all_entities:
    removed_punct.append(dt.removeEverythingFromString(j))
for i in entities_unique:
    i = dt.removeEverythingFromString(i)
    for j in range(len(all_entities)):
        if i == removed_punct[j]:
            correct_format.append(all_entities[j])
            break
# Seven binary class rows, presumably the four UK and three US rating classes
new_class_all = [[0] * len(entities_unique) for _ in range(7)]
clean_ent_unique = []
clean_uk_ent = []
clean_us_ent = []