Code example #1
def match_entities(entity_fn, t_entity_fn, entities_fn, classification):
    names = dt.import1dArray(entity_fn)
    t_names = dt.import1dArray(t_entity_fn)
    entities = dt.import2dArray(entities_fn)
    # Normalize both name lists so they can be compared directly
    for n in range(len(names)):
        names[n] = dt.removeEverythingFromString(names[n])
    for n in range(len(t_names)):
        t_names[n] = dt.removeEverythingFromString(t_names[n])
    matched_ids = []
    for n in range(len(t_names)):
        for ni in range(len(names)):
            matched_name = t_names[n]
            all_name = names[ni]
            if matched_name == all_name:
                print(matched_name)
                matched_ids.append(ni)
                break
    matched_entities = []
    for e in matched_ids:
        matched_entities.append(entities[e])

    print("Amount found", amount_found)
    dt.write2dArray(matched_entities, entities_fn[:len(entities_fn)-4] + "-" + classification + ".txt")
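
# Usage sketch (added for illustration, not part of the original source): assumes `dt` is
# the project's data-handling helper module and the paths below are hypothetical.
# match_entities keeps only the rows of `entities_fn` whose normalized names also appear
# in `t_entity_fn`, and writes them to "<entities_fn minus .txt>-<classification>.txt".
match_entities("../data/movies/entity_names.txt",                        # all entity names
               "../data/movies/classify/genres/available_entities.txt",  # target subset
               "../data/movies/bow/class-all.txt",                       # 2d data to filter
               "genres")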
Code example #2
def getTop250Movies(entity_names):
    imdb = dt.import1dArray("../data/raw/imdb/ratings/ratings.list")[28:278]
    # Normalize entity names in place: drop the trailing year token and strip punctuation
    for e in range(len(entity_names)):
        entity_names[e] = "".join(entity_names[e].split()[:-1])
        entity_names[e] = dt.removeEverythingFromString(entity_names[e])
    top_en = []

    # Reduce each top-250 line to a bare, punctuation-free title (year token dropped)
    for string in imdb:
        string = string.split(".")[1][1:]
        string = string.split()[:-1]
        string = " ".join(string)
        string = dt.removeEverythingFromString(string)
        top_en.append(string)
    matched_index = []
    for e in range(len(entity_names)):
        for x in range(len(top_en)):
            if entity_names[e] == top_en[x]:
                matched_index.append(e)
                print(entity_names[e])
                break
    dt.write1dArray(matched_index, "../data/movies/top_imdb_indexes.txt")
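
# Usage sketch (added for illustration, not part of the original source): entity names are
# assumed to be stored one per line as "<title> <year>", which is why the year token is
# stripped above. The [28:278] slice corresponds to the top-250 block of IMDb's ratings.list.
entity_names = dt.import1dArray("../data/movies/entity_names.txt")  # hypothetical path
getTop250Movies(entity_names)  # writes ../data/movies/top_imdb_indexes.txt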
Code example #3
def writeClassesFromNames(folder_name, file_names, output_folder):
    names = dt.getFolder(folder_name)
    all_names = defaultdict(int)
    entity_names = dt.import1dArray(file_names)

    # Count how many class lists each (normalized) name appears in
    for t in range(len(names)):
        for n in range(len(names[t])):
            names[t][n] = dt.removeEverythingFromString(names[t][n])
            all_names[names[t][n]] += 1
    available_class_names = []
    available_indexes = []
    for n in range(len(entity_names)):
        name = entity_names[n]
        original_name = name
        name = dt.removeEverythingFromString(name)
        if all_names[name] > 0:
            available_class_names.append(original_name)
            available_indexes.append(n)
            print(name, "exists")
        else:
            print(name, "FAIL")
    dt.write1dArray(available_indexes, output_folder + "available_indexes.txt")
    dt.write1dArray(available_class_names, output_folder + "available_entities.txt")
    print("Wrote available indexes and entities")
    class_all = []
    for c in range(len(names)):
        binary_class = []
        for n in range(len(available_class_names)):
            available_class_names[n] = dt.removeEverythingFromString(available_class_names[n])
            if available_class_names[n] in names[c]:
                binary_class.append(1)
            else:
                binary_class.append(0)
        dt.write1dArray(binary_class, output_folder + "class-" + str(c))
        class_all.append(binary_class)
    dt.write2dArray(np.asarray(class_all).transpose(), output_folder + "class-all")
    print("Wrote class-all")
Code example #4
def importCertificates(cert_fn, entity_name_fn):
    all_lines = dt.import1dArray(cert_fn)[14:]
    en = dt.import1dArray(entity_name_fn)
    original_en = dt.import1dArray(entity_name_fn)
    en_name = []
    en_year = []
    for e in range(len(en)):
        split = en[e].split()
        en_year.append(split[len(split)-1])
        name = "".join(split[:len(split)-1])
        en_name.append(dt.removeEverythingFromString(name))


    # Initialize ratings dict
    """
    ratings = {
        "USA:G": [],
        "USA:PG": [],
        "USA:PG-13": [],
        "USA:R": []
    }
    """
    ratings = {
        "UK:PG": [],
        "UK:12": [],
        "UK:12A": [],
        "UK:15": [],
        "UK:18": [],
    }

    all_ratings = defaultdict(list)
    recently_found_name = ""
    recently_found_year = ""
    recently_found_found = False
    counter = 0

    temp_fn = "../data/temp/uk_cert_dict.pickle"

    if dt.fileExists(temp_fn) is False:
        for line in all_lines:
            line = line.split("\t")
            split_ny = line[0].split("{")[0]
            split_ny = split_ny.split()
            for i in range(len(split_ny)-1, -1, -1):
                if "{" in split_ny[i]:
                    del split_ny[i]
            entity_year_bracketed = split_ny[len(split_ny)-1]

            if "(V)" in entity_year_bracketed or "(TV)" in entity_year_bracketed or "(VG)" in entity_year_bracketed:
                entity_year_bracketed = split_ny[len(split_ny) - 2]
            try:
                entity_year = dt.keepNumbers(entity_year_bracketed)[0]
                entity_name = dt.removeEverythingFromString("".join(split_ny[:len(split_ny)-1]))
                found = False
                skip = False
                if recently_found_name == entity_name and recently_found_year == entity_year:
                    skip = True
                    found = recently_found_found
                if not skip:
                    if not found:
                        for n in range(len(en_name)):
                            if entity_name == en_name[n] and entity_year == en_year[n]:
                                print("found", entity_name, entity_year)
                                found = True
                                break
                if found:
                    if("(" not in line[len(line)-1]):
                        entity_rating = line[len(line)-1]
                    else:
                        entity_rating = line[len(line)-2]
                    all_ratings[entity_rating].append(entity_name)
                    if entity_rating in ratings:
                        ratings[entity_rating].append(entity_name)
                        print("rating correct", entity_name, entity_year, entity_rating)
            except IndexError:
                print("IndexError")
                print(line)
                print(split_ny)
                print(entity_year_bracketed)
            recently_found_name = entity_name
            recently_found_year = entity_year
            recently_found_found = found
            counter += 1
            if counter % 1000 == 0:
                print(counter)
        # Store data (serialize)
        with open(temp_fn, 'wb') as handle:
            pickle.dump(ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open("../data/temp/uk_cert_dict_all.pickle", 'wb') as handle:
            pickle.dump(all_ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Load data (deserialize)
    with open(temp_fn, 'rb') as handle:
        ratings = pickle.load(handle)
    if dt.fileExists("../data/temp/uk_cert_dict_all.pickle"):
        with open("../data/temp/uk_cert_dict_all.pickle", 'rb') as handle:
            all_ratings = pickle.load(handle)

    top_size = 0
    for key, value in all_ratings.items():
        top_size += len(value)
    print(top_size)
    top_size = 0

    new_ratings = defaultdict(list)
    real_name_dict_fn = "../data/temp/uk_real_name_dict.dict"
    if dt.fileExists(real_name_dict_fn) is False:
        # Match the names back to the original names
        for key, value in all_ratings.items():
            for r in ratings:
                if r == key:
                    top_size += len(value)
                    for v in range(len(value)):
                        found = False
                        for n in range(len(en_name)):
                            if value[v] == en_name[n]:
                                found = True
                                value[v] = original_en[n]
                                break
                        if found:
                            new_ratings[key].append(value[v])
                    break
        with open(real_name_dict_fn, 'wb') as handle:
            pickle.dump(new_ratings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(real_name_dict_fn, 'rb') as handle:
            new_ratings = pickle.load(handle)

    # Get the final dict set up
    """
    final_dict = {
        "USA-G": [],
        "USA-PG-PG13": [],
        "USA-R": [],
    }
    """
    final_dict = {
        "UK-PG": [],
        "UK-12-12A": [],
        "UK-15": [],
        "UK-18": []
    }

    # Append the final dict ratings

    final_dict["UK-PG"].extend(all_ratings["UK:PG"])
    final_dict["UK-12-12A"].extend(all_ratings["UK:12"])
    final_dict["UK-12-12A"].extend(all_ratings["UK:12A"])
    final_dict["UK-15"].extend(all_ratings["UK:15"])
    final_dict["UK-18"].extend(all_ratings["UK:18"])
    """
    final_dict["USA-G"].extend(all_ratings["USA:G"])
    final_dict["USA-PG-PG13"].extend(all_ratings["USA:PG"])
    final_dict["USA-PG-PG13"].extend(all_ratings["USA:PG13"])
    final_dict["USA-R"].extend(all_ratings["USA:R"])
    """
    """
    final_name_dict = {
        "USA-G": [],
        "USA-PG-PG13": [],
        "USA-R": [],

    }
    """
    final_name_dict = {
        "UK-PG": [],
        "UK-12-12A": [],
        "UK-15": [],
        "UK-18": [],
    }

    # Append the final dict good names

    final_name_dict["UK-PG"].extend(new_ratings["UK:PG"])
    final_name_dict["UK-12-12A"].extend(new_ratings["UK:12"])
    final_name_dict["UK-12-12A"].extend(new_ratings["UK:12A"])
    final_name_dict["UK-15"].extend(new_ratings["UK:15"])
    final_name_dict["UK-18"].extend(new_ratings["UK:18"])
    """
    final_name_dict["USA-G"].extend(new_ratings["USA:G"])
    final_name_dict["USA-PG-PG13"].extend(new_ratings["USA:PG"])
    final_name_dict["USA-PG-PG13"].extend(new_ratings["USA:PG13"])
    final_name_dict["USA-R"].extend(new_ratings["USA:R"])
    """

    # Create a unique list of the entities found
    entities_found = []
    for key, items in new_ratings.items():
        for i in items:
            entities_found.append(i)
    entities_found = np.unique(entities_found)
    print(len(entities_found))


    # Get the en_names back...
    jacked_up_entities_found = []
    for n in entities_found:
        new_n = n.split()[:-1]
        jacked_up_entities_found.append(dt.removeEverythingFromString(" ".join(new_n)))

    # One binary class row per rating band in final_dict
    classes = [[0] * len(entities_found) for _ in range(len(final_dict))]
    counter = 0
    class_names = []
    for key, items in final_dict.items():
        for i in items:
            for e in range(len(jacked_up_entities_found)):
                if i == jacked_up_entities_found[e]:
                    classes[counter][e] = 1
        class_names.append(key)
        counter += 1

    classes = np.asarray(classes).transpose()

    indexes_to_delete = []

    for c in range(len(classes)):
        found = False
        for i in classes[c]:
            if i == 1:
                found = True
                break
        if not found:
            indexes_to_delete.append(c)

    classes = np.delete(classes, indexes_to_delete, axis=0)
    entities_found = np.delete(entities_found, indexes_to_delete)

    classes = classes.transpose()

    for c in range(len(classes)):
        dt.write1dArray(classes[c], "../data/movies/classify/uk-ratings/class-" + class_names[c])

    classes = classes.transpose()

    dt.write2dArray(classes, "../data/movies/classify/uk-ratings/class-all")
    dt.write1dArray(entities_found, "../data/movies/classify/uk-ratings/available_entities.txt")
    dt.write1dArray(class_names, "../data/movies/classify/uk-ratings/names.txt")
    print("k")
Code example #5
def getVectors(input_folder, file_names_fn, extension, output_folder, only_words_in_x_entities,
               words_without_x_entities, cut_first_line=False, get_all=False, additional_name="", make_individual=True,
               classification="", use_all_files="", minimum_words=0, data_type="", sparse_matrix=False, word_count_amt=0):
    if not use_all_files:
        file_names = dt.import1dArray(file_names_fn)
    else:
        file_names = dt.getFns(use_all_files)

    phrase_dict = defaultdict(int)
    failed_indexes = []
    failed_filenames = []
    working_filenames = []

    # First, get all possible phrase names and build a dictionary of them from the files

    for f in range(len(file_names)):
        try:
            full_name = input_folder + file_names[f] + "." + extension
            phrase_list = dt.import2dArray(full_name, "s")
            if cut_first_line:
                phrase_list = phrase_list[1:]
            word_count = 0
            for p in phrase_list:
                word_count += int(p[1])
            if word_count > word_count_amt:
                for p in phrase_list:
                    if p[0] != "all":
                        phrase_dict[p[0]] += 1
                    else:
                        print("found class all")
                working_filenames.append(file_names[f])
            else:
                print("Failed, <1k words", file_names[f], f, word_count)
                failed_filenames.append(file_names[f])
                failed_indexes.append(f)
        except FileNotFoundError:
            print("Failed to find", file_names[f], f)
            failed_filenames.append(file_names[f])
            failed_indexes.append(f)
    print(failed_indexes)
    print(failed_filenames)
    phrase_sets = []
    # Convert to array so we can sort it
    phrase_list = []


    entity_names = dt.import1dArray(file_names_fn)
    matching_filenames = []
    failed_fns = []
    if data_type == "wines":
        for e in entity_names:
            found = False
            for f in working_filenames:

                if "zz" in f:
                    new_f = f[2:]
                else:
                    new_f = f
                if dt.removeEverythingFromString(e) == dt.removeEverythingFromString(new_f):
                    matching_filenames.append(f)
                    found = True
                    break
            if not found:
                failed_fns.append(e)

        working_filenames = np.unique(np.asarray(matching_filenames))

    test_dupes = np.unique(np.asarray(working_filenames))
    print(len(test_dupes))

    for key, value in phrase_dict.items():
        if value >= only_words_in_x_entities:
            phrase_list.append(key)
    all_phrases = []
    for key, value in phrase_dict.items():
        all_phrases.append(key)

    phrase_sets.append(phrase_list)
    counter = 0
    for phrase_list in phrase_sets:
        if not get_all and counter > 0:
            break
        all_phrase_fn = output_folder+"frequency/phrases/" + "class-all-" +str(only_words_in_x_entities) + "-"+str(words_without_x_entities)+"-"+ classification
        phrase_name_fn = output_folder + "names/"  +str(only_words_in_x_entities) + "-"+str(words_without_x_entities)+"-"+ classification +".txt"
        phrase_list = sorted(phrase_list)

        print("Found", len(phrase_list), "Phrases")
        print(phrase_list[:20])
        print("Failed", len(failed_filenames), "Files")
        print(failed_filenames[:20])

        phrase_index_dict = {}

        # Create a dictionary to obtain the index of a phrase that's being checked

        for p in range(len(phrase_list)):
            phrase_index_dict[phrase_list[p]] = p

        # Create an empty 2d array to store a matrix of movies and phrases
        all_phrases_complete = []
        for f in working_filenames:
            all_phrases_complete.append([0]*len(phrase_list))

        all_phrases_complete = np.asarray(all_phrases_complete)

        print("Each entity is length", len(all_phrases_complete[0]))
        print("The overall matrix is", len(all_phrases_complete))
        if sparse_matrix:
            all_phrases_complete = sp.csr_matrix(all_phrases_complete)


        # Then, populate the overall bag of words for each entity (with all other phrases already set to 0)

        completed_index = []

        if data_type == "wines":

            print("wines")
            """
            merge_indexes = []
            for f in range(len(working_filenames)):
                print(working_filenames[f])
                for i in range(len(working_filenames)):
                    if i == f:
                        continue
                    for ci in completed_index:
                        if i == ci:
                            continue
                    if "~" in working_filenames[i]:
                        if working_filenames[f] == working_filenames[i][:-1] or working_filenames[f] == working_filenames[i][2:-1]:
                            completed_index.append(i)
                            merge_indexes.append((f, i))
            """

        for f in range(len(working_filenames)):
            n_phrase_list = dt.import2dArray(input_folder + working_filenames[f] + "." + extension, "s")
            if cut_first_line:
                n_phrase_list = n_phrase_list[1:]
            for p in n_phrase_list:
                phrase = p[0]
                try:
                    phrase_index = phrase_index_dict[phrase]
                    if not sparse_matrix:
                        all_phrases_complete[f][phrase_index] = int(p[1])
                    else:
                        all_phrases_complete[f, phrase_index] = int(p[1])

                    #print("Kept", phrase)
                except KeyError:
                    continue
                    #print("Deleted phrase", phrase)
        """

        cols_to_delete = []
        if data_type == "wines":
            for mt in merge_indexes:
                for v in range(len(all_phrases_complete)):
                    all_phrases_complete[v][mt[0]] += all_phrases_complete[v][mt[1]]
                cols_to_delete.append(mt[1])
        all_phrases_complete = np.delete(all_phrases_complete, cols_to_delete, 1)
        working_filenames = np.delete(working_filenames, cols_to_delete)
        """

        # Import the entities specific to this classification and trim the matrix of
        # entities that aren't included in the classification
        if classification not in ("all", "mixed", "genres", "ratings", "types"):
            classification_entities = dt.import1dArray("../data/" + data_type + "/classify/" + classification + "/available_entities.txt")
            all_phrases_complete = dt.match_entities(all_phrases_complete, classification_entities, file_names)
        elif classification == "all":
            print("All~~~~~~~~~~~~~~")
            dt.write1dArray(working_filenames, "../data/"+data_type+"/classify/"+classification+"/available_entities.txt")
        if not sparse_matrix:
            all_phrases_complete = np.asarray(all_phrases_complete).transpose()
        else:
            all_phrases_complete = all_phrases_complete.transpose()

        indexes_to_delete = []
        if sparse_matrix:
            cx = sp.coo_matrix(all_phrases_complete)

            indexes_to_delete = []

            for i, j, v in zip(cx.row, cx.col, cx.data):
                print("(%d, %d), %s" % (i, j, v))
        for a in range(len(all_phrases_complete)):
            if np.count_nonzero(all_phrases_complete[a]) > len(all_phrases_complete[a]) - words_without_x_entities:
                print("Recorded a phrase " + str(phrase_list[a]) + " with too little difference")
                indexes_to_delete.append(a)
        indexes_to_delete.sort()
        indexes_to_delete.reverse()
        for i in indexes_to_delete:
            all_phrases_complete = np.delete(all_phrases_complete, i, 0)
            print("Deleted an entity " + str(phrase_list[i]) + " with too little difference")
            phrase_list = np.delete(phrase_list, i, 0)

        dt.write1dArray(phrase_list, phrase_name_fn)
        if make_individual:
            for p in range(len(all_phrases_complete)):
                dt.write1dArray(all_phrases_complete[p], output_folder+"frequency/phrases/class-" + phrase_list[p] +
                                 "-"+str(only_words_in_x_entities) + "-"+str(words_without_x_entities)+"-"+ classification)



        dt.write2dArray(all_phrases_complete, all_phrase_fn)


        print("Created class-all")
        all_phrases_complete = np.asarray(all_phrases_complete).transpose()
        for a in range(len(all_phrases_complete)):
            for v in range(len(all_phrases_complete[a])):
                if all_phrases_complete[a][v] > 1:
                    all_phrases_complete[a][v] = 1

        all_phrases_complete = np.asarray(all_phrases_complete).transpose()

        if make_individual:
            for p in range(len(all_phrases_complete)):
                dt.write1dArray(all_phrases_complete[p], output_folder+"binary/phrases/class-" + phrase_list[p] +
                                "-"+str(only_words_in_x_entities) + "-"+str(words_without_x_entities)+"-"+ classification)



        all_phrase_fn = output_folder + "binary/phrases/" + "class-all-" + str(
            only_words_in_x_entities) + "-" + str(words_without_x_entities) + "-" + classification
        dt.write2dArray(all_phrases_complete, all_phrase_fn)

        print("Created class-all binary")
        counter += 1
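
# Usage sketch (added for illustration, not part of the original source): builds the
# frequency and binary bag-of-words matrices from one "<name>.<extension>" term-count file
# per entity. Parameter values and paths below are illustrative only.
getVectors(input_folder="../data/movies/bow/stripped/",
           file_names_fn="../data/movies/entity_names.txt",
           extension="txt",
           output_folder="../data/movies/bow/",
           only_words_in_x_entities=100,   # keep phrases that occur in at least 100 entities
           words_without_x_entities=10,    # drop phrases absent from fewer than 10 entities
           classification="all",
           data_type="movies")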
Code example #6
    uk_us_ents = []

    for e in uk_entities:
        uk_us_ents.append(e)

    for e in us_entities:
        uk_us_ents.append(e)

    entities_unique = np.unique(uk_us_ents)

    correct_format = []

    removed_punct = []

    for j in all_entities:
        removed_punct.append(dt.removeEverythingFromString(j))

    for i in entities_unique:
        i = dt.removeEverythingFromString(i)
        for j in range(len(all_entities)):
            if i == removed_punct[j]:
                correct_format.append(all_entities[j])
                break

    new_class_all = [[0] * len(entities_unique) for _ in range(7)]

    clean_ent_unique = []
    clean_uk_ent = []
    clean_us_ent = []