def __init__(self, discrete_labels_fn, ppmi_fn, phrases_fn,
                 phrases_to_check_fn, fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)

        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        counter = 0
        with open(discrete_labels_fn) as f:
            for line in f:
                exists = True
                if phrases_to_check_fn != "":
                    exists = False
                    for i in indexes_to_get:
                        if i == counter:
                            exists = True
                            break
                if exists:
                    discrete_labels = line.split()
                    saveGraph(discrete_labels, ppmi[counter],
                              fn + " " + phrases[counter][6:])
                    print phrases[counter]
                counter += 1
def getMatchedLines(file_name, lines_to_match):
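    # Match each tab-separated "<name>\t<year>" entry in lines_to_match against the
    # lines of file_name, comparing punctuation-stripped, lower-cased titles, and
    # collect every consecutive line belonging to a matched film. Matched and failed
    # entries are written to fixed paths under filmdata/KeywordData/.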
    matched_lines = []
    failed_lines = []
    match_names = []
    match_years = []
    for line in lines_to_match:
        match_names.append(re.split(r'\t+', line)[0])
        match_years.append(re.split(r'\t+', line)[1])
    file = open(file_name, "r")
    lines = file.readlines()
    for i in range(len(lines_to_match)):
        matched = False
        last_movie = ""
        for l in range(len(lines)):
            if matched is True and re.split(r'\t+', lines[l])[0] != last_movie:
                break
            split_line = re.split(r'\t+', lines[l])
            split_line[0] = re.sub(r'\s+', '', split_line[0].translate(None, string.punctuation).lower())
            match_names[i] = re.sub(r'\s+', '', match_names[i].translate(None, string.punctuation).lower())
            if split_line[0] == match_names[i]:
                matched_lines.append(lines[l])
                matched = True
                last_movie = re.split(r'\t+', lines[l])[0]
                print "Found a line for " + last_movie
                continue
        if matched:
            print "Matched", lines_to_match[i]
        else:
            failed_lines.append(lines_to_match[i])
            print "Failed", lines_to_match[i]
    dt.write1dArray(failed_lines, "filmdata/KeywordData/failed_second_match.txt")
    dt.write1dArray(matched_lines, "filmdata/KeywordData/matched_lines_NEW.txt")
def getScoreDifferences(name_word_file1, name_score_file1, name_word_file2, name_score_file2, name):
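    # Read two parallel (word file, score file) pairs, compute the per-index score
    # difference (file 1 minus file 2), and write the words and the sorted
    # differences to filmdata/SVM/.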
    word_file1 = open(name_word_file1, "r")
    score_file1 = open(name_score_file1, "r")
    word_lines1 = word_file1.readlines()
    score_lines1 = score_file1.readlines()
    scores1 = []
    words1 = []
    for s in score_lines1:
        scores1.append(float(s.strip()))
    for w in word_lines1:
        words1.append(w.strip())
    word_file2 = open(name_word_file2, "r")
    score_file2 = open(name_score_file2, "r")
    word_lines2 = word_file2.readlines()
    score_lines2 = score_file2.readlines()
    scores2 = []
    words2 = []
    for s in score_lines2:
        scores2.append(float(s))
    for w in word_lines2:
        words2.append(w.strip())
    differences_list = []
    for i in range(len(score_lines1)):
        differences_list.append(scores1[i] - scores2[i])
    most_different_words = [x for (y,x) in sorted(zip(differences_list,words1))]
    differences_list = sorted(differences_list)
    dt.write1dArray(most_different_words, "filmdata/SVM/most_different_words_" + name + ".txt")
    dt.write1dArray(differences_list, "filmdata/SVM/most_different_values_" + name + ".txt")
def getVectors(ordered_IDs, unique_phrases):
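    # Build per-phrase film vectors from the token files in filmdata/Tokens/
    # (each line after the header appears to hold "<phrase> <count>"). For every
    # phrase in the range below, a binary occurrence vector and a count-preserving
    # vector over all films are written to filmdata/classesPhrases/.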
    vectors = [[0 for x in range(len(ordered_IDs))] for x in range(len(unique_phrases))]
    vectors_maintained = [[0 for x in range(len(ordered_IDs))] for x in range(len(unique_phrases))]
    multi_dictionary = {}
    dict_mapping = {}
    print "Mapping to memory."
    for i in range(len(ordered_IDs)):
        ordered_IDs[i] = str(ordered_IDs[i])
    for i in range(len(ordered_IDs)):
        ordered_IDs[i] = ordered_IDs[i].strip()
        dict_mapping[ordered_IDs[i]] = i
        if ordered_IDs[i] != "-1":
            file = open("filmdata/Tokens/" + ordered_IDs[i] + ".film", "r")
            lines = file.readlines()[1:]
            for line in lines:
                split_line = line.split()
                multi_dictionary[(ordered_IDs[i], split_line[0])] = int(split_line[1])
            file.close()
        else:
            # No token file exists for a missing ID ("-1"); leave this film's entries at zero.
            pass
    for up in range(len(unique_phrases)):
        unique_phrases[up] = unique_phrases[up].strip()

    print len("Iterating over memory.")
    for p in range(13177, 25842, 1):
        for key, value in multi_dictionary.iteritems():
            if key[1] == unique_phrases[p]:
                vectors_maintained[p][dict_mapping[key[0].strip()]] = value
                vectors[p][dict_mapping[key[0]]] = 1
        print unique_phrases[p]
        dt.write1dArray(vectors_maintained[p], "filmdata/classesPhrases/nonbinary/class-" + unique_phrases[p])
        dt.write1dArray(vectors[p], "filmdata/classesPhrases/class-" + unique_phrases[p])

    return vectors_maintained, vectors
def findMissingKeywords(file_name, common_keywords):
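    # Walk a tab-separated keyword listing, normalise each film title, and record
    # the filmdata/filmNames.txt index of every title that is matched (the
    # common_keywords argument is unused here). The indexes are written to
    # filmdata/MISSING_FROM_MOVIEDATA.txt.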
    print "?"
    file = open(file_name, "r")
    lines = file.readlines()
    last_film = ""
    movie_strings = dt.importString("filmdata/filmNames.txt")
    standard_strings = []
    indexes = []
    for m in movie_strings:
        m = m[:-5]
        standard_strings.append(m.translate(None, string.punctuation).replace(" ", "").strip().upper())
    for line in lines:
        film_vectors = []
        line = line.strip()
        if len(line) > 2:
            line_split = re.split(r'\t+', line)
            line_split[0] = line_split[0].translate(None, string.punctuation).replace(" ", "").strip().upper()
            file_save = ""
            for m in range(len(standard_strings)):
                if standard_strings[m] == line_split[0]:
                    print "matched", m, standard_strings[m], line_split[0]
                    file_save = str(m)
                    break
            if file_save != "":
                if last_film.strip() != line_split[0].strip() and last_film is not None:
                    print "Succeeded", line_split[0]
                    for m in range(len(standard_strings)):
                        if standard_strings[m] == last_film:
                            indexes.append(m)
                            break
                last_film = line_split[0]
            else:
                print "Failed", line_split[0],
    dt.write1dArray(indexes, "filmdata/MISSING_FROM_MOVIEDATA.txt")
def getIDs(movie_strings):
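    # Map each film name (with its trailing year suffix removed) to an ID from
    # films-ids.txt using the similar() check; names with no match are recorded
    # separately and given the ID -1.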
    ordered_IDs = []
    movie_names = []
    for name in movie_strings:
        movie_names.append(name[:-5])
    id_mappings = open("filmdata/KeywordData/Movie_Most_Common_Keyword_Mapping/films-ids.txt", "r")
    id_mappings_lines = id_mappings.readlines()
    found_name = False
    failed_names = []
    x = 0
    for name in movie_names:
        for line in id_mappings_lines:
            mapping_id = line.split()[0]
            mapping_name = re.split(r'\t+', line)[2]
            if similar(name.upper().strip(), mapping_name.upper().strip()):
                ordered_IDs.append(mapping_id)
                found_name = True
                break
        if found_name is True:
            found_name = False
        else:
            failed_names.append(name)
            ordered_IDs.append(-1)
        x += 1
        print x
    dt.write1dArray(failed_names, "filmdata/KeywordData/NAMES_THAT_FAILED_IDS.txt")
    dt.write1dArray(ordered_IDs, "filmdata/KeywordData/IDsByOriginalOrdering.txt")
def outputTopByVotes(amount_of_votes):
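    # Take the amount_of_votes entries with the highest vote counts from IMDB's
    # ratings.list, reduce each to "<title> <year>" (dropping "{...}" episode parts
    # and extra bracketed qualifiers), and write them to
    # filmdata/top50000moviesbyvotes.txt.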
    file = open("filmdata/ratings.list/ratings.list", "r")
    lines = file.readlines()
    top_movies = []
    top_ratings = []
    for line in lines:
        top_ratings.append(int(line.split()[1]))
    top_ratings = np.asarray(top_ratings)
    indices = np.argpartition(top_ratings, -amount_of_votes)[-amount_of_votes:]
    for i in indices:
        just_movie = lines[i].split()[3:]
        just_movie = " ".join(just_movie)
        just_movie = just_movie.split('{')
        just_movie = just_movie[0]
        just_movie = just_movie.split('(')
        try:
            if not re.findall(r'\d+', just_movie[2])[0]:
                del just_movie[2]
            else:
                just_movie[0] = just_movie[0] + "(" + just_movie[1]
                del just_movie[1]
        except IndexError:
            print
        try:
            year = re.findall(r'\d+', just_movie[1])[0]
        except IndexError:
            print "FALED", just_movie
        if just_movie[0].endswith(' '):
            just_movie[0] = just_movie[0][:-1]
        if just_movie[0].startswith('"') and just_movie[0].endswith('"'):
            just_movie[0] = just_movie[0][1:-1]
        just_movie = just_movie[0] + " " + str(year)
        print just_movie
        top_movies.append(just_movie)
    dt.write1dArray(top_movies, "filmdata/top50000moviesbyvotes.txt")
def getVectorsKeywords(movie_strings, keywords):
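    # Build one binary vector per keyword over all films, using the per-film
    # keyword files (named by film index) in
    # filmdata/KeywordData/Movie_Most_Common_Keyword_Mapping.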
    multi_dictionary = {}
    dict_mapping = {}
    movie_names = []
    file_names = dt.getAllFileNames("filmdata/KeywordData/Movie_Most_Common_Keyword_Mapping")
    for i in movie_strings:
        movie_names.append(i.strip()[:-5])
        print i
    print "Mapping to memory."
    for i in file_names:
        try:
            file = open("filmdata/KeywordData/Movie_Most_Common_Keyword_Mapping/" +i, "r")
            lines = file.readlines()
            dict_mapping[movie_strings[int(i)]] = i
            for line in lines:
                line = line.strip()
                multi_dictionary[(movie_strings[int(i)], line)] = 1
            file.close()
        except IOError:
            print movie_names[int(i)]

    for up in range(len(keywords)):
        keywords[up] = keywords[up].strip()

    print len("Iterating over memory.")
    for p in range(len(keywords)):
        vector = [0 for x in range(len(movie_strings))]
        print len(vector)
        for key, value in multi_dictionary.iteritems():
            if key[1] == keywords[p]:
                #print int(dict_mapping[key[0]])
                vector[int(dict_mapping[key[0]])] = 1
        print keywords[p]
        dt.write1dArray(vector, "filmdata/classesKeywords/NewData/class-" + keywords[p])
def outputKeywords():
    movie_strings = dt.importString("filmdata/filmNames.txt")
    movie_data = getMovieDataFromIMDB(movie_strings)
    commonality = 0
    common_keywords = getMostCommonKeywords(0, "filmdata/IMDB_movie_data.txt")
    dt.write1dArray(common_keywords, "filmdata/common_keywords_15k_commanility_" + str(commonality))
    vectors = getKeywordVectors(common_keywords, movie_strings, "")
    dt.write2dArray(vectors, "filmdata/classesKeywords/class-extra-all-commonality-" + str(commonality))
def trimMovieData(file_string):
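    # Keep only lines that look like top-level film entries (no "{...}" episode
    # markers, no (V)/(TV)/(VG) tags, not indented) and write them to
    # file_string + "trimmed".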
    new_movie_data = []
    movie_data_file = open(file_string)
    with movie_data_file as myFile:
        for num, line in enumerate(myFile, 1):
            if "{" not in line and "(V)" not in line and "(TV)" not in line and "(VG)" not in line and len(line) > 2 and line.startswith(" ") is False and line.startswith("\t") is False and line.startswith("\n") is False:
                new_movie_data.append(line[:-1])
    print "dun"
    dt.write1dArray(new_movie_data, file_string + "trimmed")
def getMissingIndexes(index_list, length):
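    # Write every index in range(length) that does not appear in index_list.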
    full_index = range(length)
    for i in index_list:
        i = int(i)
        full_index[i] = -1
    missing_indexes = []
    for i in full_index:
        if i > -1:
            missing_indexes.append(i)
    dt.write1dArray(missing_indexes, "filmdata/missing_indexes_keywords.txt")
def getIMDBKeywordsForMovieNames(movie_names):
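    # Collect every keywords.list line that belongs to one of the given movies,
    # matching by similar() title and exact year; the slice offset below presumably
    # skips the keywords.list header section.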
    stripped_movie_names = []
    for movie in movie_names:
        stripped_movie_names.append(movie.replace('\n', ''))
    stripped_movie_names = sorted(stripped_movie_names)
    split_names = []
    split_years = []
    for stripped_movie_name in stripped_movie_names:
        split = stripped_movie_name.split()
        split_year = split[len(split)-1]
        split_years.append(split_year)
        split_names.append(stripped_movie_name[:-len(split_year)-1])

    file = open("filmdata\keywords.list\keywords.list", "r")
    lines = file.readlines()
    keywords_list = lines[79748:]
    matched_lines = []
    x = 0
    last_line = keywords_list[0]
    matched = False
    while x < 50000:
        for line in keywords_list:
            split_line = line.rsplit('(', 2)
            movie_name = split_line[0].rstrip()

            if movie_name.startswith('"') and movie_name.endswith('"'):
                movie_name = movie_name[1:-1]
            try:
                movie_year = str(re.findall(r'\d+', split_line[1])[0])
            except IndexError:
                movie_year = "NULL"
            if not movie_name:
                movie_name = "'NULL"
            formatted_line = movie_name.rstrip() + " " + str(movie_year).rstrip()

            if matched is True and formatted_line == last_line:
                matched_lines.append(line)
                print split_names[x], line
            elif matched is False and similar(movie_name.strip().upper(), split_names[x].strip().upper()) and movie_year == split_years[x]:
                matched = True
                matched_lines.append(line)
                print split_names[x], line
            elif matched is True and formatted_line != last_line:
                matched = False
                x = x + 1
            last_line = formatted_line
        print "cycled through"



    print "Found:", x
    dt.write1dArray(matched_lines, "filmdata/imdb_movie_keywords.txt")
def getXLeastSimilarIndex(term, terms_to_match, terms_to_ignore, amt):
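    # Return the indexes of the amt terms in terms_to_match least similar to term,
    # skipping any index listed in terms_to_ignore.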
    least_similar_term_indexes = []
    for a in range(amt):
        lowest_term = 99999999
        term_index = 0
        for t in range(len(terms_to_match)):
            if dt.checkIfInArray(terms_to_ignore, t) is False:
                s = getSimilarity(term, terms_to_match[t])
                if s < lowest_term and dt.checkIfInArray(least_similar_term_indexes, t) is False:
                    lowest_term = s
                    term_index = t
        least_similar_term_indexes.append(term_index)
    return least_similar_term_indexes
def getXMostSimilarIndex(term, terms_to_match, terms_to_ignore, amt):
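    # Return the indexes of the amt terms in terms_to_match most similar to term,
    # skipping any index listed in terms_to_ignore.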
    most_similar_term_indexes = []
    for a in range(amt):
        highest_term = 0
        term_index = 0
        for t in range(len(terms_to_match)):
            if dt.checkIfInArray(terms_to_ignore, t) is False:
                s = getSimilarity(term, terms_to_match[t])
                if s > highest_term and dt.checkIfInArray(most_similar_term_indexes, t) is False:
                    highest_term = s
                    term_index = t
        most_similar_term_indexes.append(term_index)
    return most_similar_term_indexes
    def __init__(self, directions_fn, vectors_fn, cluster_names_fn,
                 vector_names_fn, fn, percent, percentage_increment,
                 by_vector):

        directions = dt.importVectors(directions_fn)
        vectors = dt.importVectors(vectors_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(vector_names_fn)

        rankings = self.getRankings(directions, vectors, cluster_names,
                                    vector_names)
        rankings = np.array(rankings)
        #labels = self.createLabels(rankings, percent)
        #labels = np.asarray(labels)
        discrete_labels = self.createDiscreteLabels(rankings,
                                                    percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
        if by_vector:
            #labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
            rankings = rankings.transpose()
        #dt.write2dArray(labels, "Rankings/" + fn + "P" + str(percent) +".labels")
        dt.write2dArray(rankings, "Rankings/" + fn + ".space")
        dt.write2dArray(
            discrete_labels,
            "Rankings/" + fn + "P" + str(percentage_increment) + ".discrete")
        array = []
        short_array = []
        """ Disabled names for quick view now
def getXMostSimilarIndex(term, terms_to_match, terms_to_ignore, amt):
    most_similar_term_indexes = []
    for a in range(amt):
        highest_term = 0
        term_index = 0
        for t in range(len(terms_to_match)):
            if dt.checkIfInArray(terms_to_ignore, t) is False:
                s = getSimilarity(term, terms_to_match[t])
                if s > highest_term and dt.checkIfInArray(
                        most_similar_term_indexes, t) is False:
                    highest_term = s
                    term_index = t
        most_similar_term_indexes.append(term_index)
    return most_similar_term_indexes
def getXLeastSimilarIndex(term, terms_to_match, terms_to_ignore, amt):
    least_similar_term_indexes = []
    for a in range(amt):
        lowest_term = 99999999
        term_index = 0
        for t in range(len(terms_to_match)):
            if dt.checkIfInArray(terms_to_ignore, t) is False:
                s = getSimilarity(term, terms_to_match[t])
                if s < lowest_term and dt.checkIfInArray(
                        least_similar_term_indexes, t) is False:
                    lowest_term = s
                    term_index = t
        least_similar_term_indexes.append(term_index)
    return least_similar_term_indexes
def getUnformattedTopByVotes(amount_of_votes):
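    # Same vote-count selection as outputTopByVotes, but the title field is written
    # out unmodified.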
    file = open("filmdata/ratings.list/ratings.list", "r")
    lines = file.readlines()
    top_movies = []
    top_ratings = []
    for line in lines:
        top_ratings.append(int(line.split()[1]))
    top_ratings = np.asarray(top_ratings)
    indices = np.argpartition(top_ratings, -amount_of_votes)[-amount_of_votes:]
    for i in indices:
        just_movie = lines[i].split()[3:]
        just_movie = " ".join(just_movie)
        print just_movie
        top_movies.append(just_movie)
    dt.write1dArray(top_movies, "filmdata/imdb_formatted_top50000.txt")
def reMapPPMI(ordered_IDs, file_names):
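    # Copy each film's token/PPMI file to filmdata/NewTokens/<position>.ppmi, where
    # <position> is the film's index in ordered_IDs; missing IDs (-1) get an .error
    # placeholder file instead.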
    print "Mapping to memory."
    for i in range(len(ordered_IDs)):
        ordered_IDs[i] = str(ordered_IDs[i])
    for i in range(len(ordered_IDs)):
        for f in range(len(file_names)):
            id = file_names[f].split(".")[0]
            if int(ordered_IDs[i]) == int(id) and int(ordered_IDs[i]) != -1:
                print ordered_IDs[i], id
                file = open("filmdata/vectors/Tokens/" + file_names[f])
                lines = file.readlines()
                dt.write1dArray(lines, "filmdata/NewTokens/"+str(i)+".ppmi")
                file.close()
            elif int(ordered_IDs[i]) == -1:
                dt.write1dArray([[""]], "filmdata/NewTokens/"+str(i)+".error")
def makeConsistent(file_name, new_file_name):
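    # Split each line into a name and a trailing four-character year, strip
    # punctuation from both, and write them back out tab-separated.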
    new_file = []
    with open(file_name) as my_file:
        for num, line in enumerate(my_file, 1):
            line = line.strip()
            name = line[:-4]
            year = line[len(line)-4:]

            name = name.translate(None, string.punctuation)
            year = year.translate(None, string.punctuation)
            new_line = "\t".join([name, year])
            new_file.append(new_line)
            print new_line

    dt.write1dArray(new_file, new_file_name)
def makeKeywordPPMIVectors(file_name, common_keywords):
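    # For every keyword line whose film title matches filmdata/filmNames.txt,
    # append the keyword to that film's file (named by its index) under
    # filmdata/KeywordData/Movie_Most_Common_Keyword_Mapping.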
    print "?"
    file = open(file_name, "r")
    lines = file.readlines()
    last_film = ""
    movie_strings = dt.importString("filmdata/filmNames.txt")
    standard_strings = []
    for m in movie_strings:
        m = m[:-5]
        standard_strings.append(m.translate(None, string.punctuation).replace(" ", "").strip().upper())
    for line in lines:
        film_vectors = []
        line = line.strip()
        if len(line) > 2:
            line_split = re.split(r'\t+', line)
            line_split[0] = line_split[0].translate(None, string.punctuation).replace(" ", "").strip().upper()
            file_save = ""
            for m in range(len(standard_strings)):
                if standard_strings[m] == line_split[0]:
                    file_save = str(m)
                    break
            if file_save != "":
                file = open("filmdata\KeywordData\Movie_Most_Common_Keyword_Mapping\\" + file_save, "a")
                for keyword in common_keywords:
                    if line_split[2] == keyword.strip():
                        film_vectors.append("line")
                        file.write(keyword)
                        break
                if last_film.strip() != line_split[0].strip() and last_film is not None:
                    print "Succeeded", line_split[0]
                    file.close()

                last_film = line_split[0]
            else:
                print "Failed", line_split[0]
def getKNeighbors(vector_path="filmdata/films200.mds/films200.mds", class_path="filmdata/classesGenres/class-All",
                  n_neighbors=1, algorithm="kd_tree", leaf_size=30,
                  training_data=10000, name="normal200"):
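    # Fit a scikit-learn k-nearest-neighbour classifier on the first training_data
    # film vectors and report macro F1 and accuracy on the remainder.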
    movie_vectors = np.asarray(dt.importVectors(vector_path))
    movie_labels = np.asarray(dt.importLabels(class_path))

    x_train, y_train, x_test, y_test = dt.splitData(training_data, movie_vectors, movie_labels)

    classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size)
    classifier.fit(x_train, y_train.ravel())
    y_pred = classifier.predict(x_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    dt.write1dArray([f1, accuracy], "KNNScores/" + name + ".score")
    print "F1 " + str(f1), "Accuracy", accuracy
def writeMissing(folder_name):
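    # Record which of the indexes 0..14999 have no correspondingly named file in
    # folder_name.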
    print "?"
    file_names = dt.getAllFileNames(folder_name)
    standard = range(15000)
    missing = []
    for i in standard:
        found = False
        for f in file_names:
            if int(f) == int(i):
                found = True
                break
        if found:
            print "found", i
        else:
            missing.append(i)
            print "no found", i
    dt.write1dArray(missing, "filmdata/MISSING_KEYWORD_ITEMS.txt")
    def get_code(self, tree, feature_names, class_names, filename):
        left = tree.tree_.children_left
        right = tree.tree_.children_right
        threshold = tree.tree_.threshold
        value = tree.tree_.value

        #print tree.tree_.feature, len(tree.tree_.feature
        # )
        features = []
        for i in tree.tree_.feature:
            if i != -2 or i <= 200:
                features.append(feature_names[i])
        rules_array = []

        def recurse(left, right, threshold, features, node):
            if (threshold[node] != -2):
                line = "IF ( " + features[node] + " <= " + str(
                    threshold[node]) + " ) {"
                rules_array.append(line)
                if left[node] != -1:
                    recurse(left, right, threshold, features, left[node])
                line = "} ELSE {"
                rules_array.append(line)
                if right[node] != -1:
                    recurse(left, right, threshold, features, right[node])
                line = "}"
                rules_array.append(line)
            else:
                if value[node][0][0] >= value[node][0][1]:
                    line = "return", class_names[0]
                    rules_array.append(line)
                else:
                    line = "return", class_names[1]
                    rules_array.append(line)

        recurse(left, right, threshold, features, 0)
        dt.write1dArray(rules_array, "Rules/Statements/" + filename + ".rules")
        cleaned = jsbeautifier.beautify_file("Rules/Statements/" + filename +
                                             ".rules")
        file = open("Rules/Statements/" + filename + ".rules", "w")
        file.write(cleaned)
        file.close()
def getMostCommonKeywords(top_value, file_name, keyword_file, value_file):
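    # Count how often each keyword (the last whitespace-separated token of a line)
    # occurs, then write the top_value keywords and their counts to the given files.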
    common_keywords = []
    file = open(file_name, "r")
    lines = file.readlines()
    keywords = defaultdict(int)
    for line in lines:
        if len(line.split()) > 0:
            line_split = line.split()
            keyword = line_split[len(line_split)-1]
            keywords[keyword] += 1
        print line
    sorted_dict = sorted(keywords.iteritems(), key=lambda x:-x[1])[:top_value]
    print sorted_dict
    keys = []
    values = []
    for key, value in sorted_dict:
        keys.append(key)
        values.append(value)
    dt.write1dArray(keys, keyword_file)
    dt.write1dArray(values, value_file)
def getKNearestMovies(data, x, k):
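    # Query a scipy KDTree built over data for the k nearest neighbours of x and
    # return their film names and distances, skipping the first hit (the query
    # point itself).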
    movie_names = dt.importString("filmdata/filmNames.txt")
    kd_tree = spatial.KDTree(data)
    kd_query = kd_tree.query(x=x, k=k)
    nearest_distances = kd_query[0][1:]
    k_nearest = kd_query[1][1:]
    nearest_movies = []
    for k in k_nearest:
        nearest_movies.append(movie_names[k].strip())
    print nearest_movies
    return nearest_movies, nearest_distances
    def __init__(self,
                 epochs=1, learn_rate=0.01, loss="mse", batch_size=1, decay=1e-06,
                 hidden_activation="tanh", layer_init="glorot_uniform", output_activation="tanh",  hidden_layer_size=100,
                 file_name="unspecified_filename", vector_path=None, reg=0,
                 optimizer_name="rmsprop", class_names=None, noise=0, output_weights=None):

        # Initialize the model

        self.model = Sequential()

        # Import the numpy vectors
        try:
            movie_vectors = np.asarray(np.load(vector_path))
        except OSError:
            # If it fails, assume that it's in a standard format for vectors and then save it in numpy format
            movie_vectors = dt.importVectors(vector_path)
            movie_vectors = np.asarray(movie_vectors)
            np.save(file_name, movie_vectors)

        # Set the input and the output to be the same size, as this is an auto-encoder

        input_size = len(movie_vectors[0])
        output_size = len(movie_vectors[0])

        if noise > 0: # If using a noisy autoencoder, add GaussianNoise layers to the start of the encoder
            self.model.add(GaussianNoise(noise, input_shape=(input_size,)))
            self.model.add(Dense(output_dim=hidden_layer_size,  input_dim=input_size, init=layer_init, activation=hidden_activation,W_regularizer=l2(reg)))
        else:
            # Otherwise just add the hidden layer
            self.model.add(Dense(output_dim=hidden_layer_size,  input_dim=input_size, init=layer_init, activation=hidden_activation,W_regularizer=l2(reg)))

        # If using custom weights on the hidden layer to the output layer, apply those custom weights. Otherwise just add output layer.
        if output_weights is None:
            self.model.add(Dense(output_dim=output_size, init=layer_init, activation=output_activation))
        else:
            self.model.add(Dense(output_dim=len(output_weights[0]), init=layer_init, activation=output_activation, weights=output_weights))

        # Compile the model and fit it to the data
        if optimizer_name == "sgd":
            optimizer = SGD(lr=learn_rate, decay=decay)
        elif optimizer_name == "rmsprop":
            optimizer = RMSprop(lr=learn_rate)
        self.model.compile(loss=loss, optimizer=optimizer)
        self.model.fit(movie_vectors, movie_vectors, nb_epoch=epochs, batch_size=batch_size, verbose=1)

        # Create a truncated model that has no output layer that has the same weights as the previous model and use it to obtain the hidden layer representation
        truncated_model = Sequential()
        total_file_name = "newdata/spaces/" + file_name +".mds"
        truncated_model.add(GaussianNoise(noise, input_shape=(input_size,)))
        truncated_model.add(Dense(output_dim=hidden_layer_size, input_dim=input_size, init=layer_init, activation=hidden_activation, W_regularizer=l2(reg)))
        truncated_model.compile(loss=loss, optimizer=optimizer)
        self.end_space = truncated_model.predict(movie_vectors)

        np.save(total_file_name, self.end_space)
    def splitDirections(self, directions_fn, scores_fn, names_fn,
                        low_threshold, high_threshold):
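        # Split the directions into a high-scoring set (score >= high_threshold) and
        # a mid-scoring set (high_threshold > score >= low_threshold), sort each by
        # score, and return their names (with the 6-character file prefix stripped)
        # alongside the direction vectors.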
        directions = dt.importVectors(directions_fn)
        scores = dt.importString(scores_fn)
        names = dt.importString(names_fn)

        for s in range(len(scores)):
            scores[s] = float(scores[s].strip())

        high_direction_indexes = []
        high_direction_scores = []
        low_direction_indexes = []
        low_direction_scores = []

        for s in range(len(scores)):
            if scores[s] >= high_threshold:
                high_direction_indexes.append(s)
                high_direction_scores.append(scores[s])
            elif scores[s] >= low_threshold:
                low_direction_indexes.append(s)
                low_direction_scores.append(scores[s])

        sorted_h_indexes = dt.sortByArray(high_direction_indexes,
                                          high_direction_scores)
        sorted_l_indexes = dt.sortByArray(low_direction_indexes,
                                          low_direction_scores)
        sorted_h_indexes.reverse()
        sorted_l_indexes.reverse()
        high_direction_names = []
        low_direction_names = []
        high_directions = []
        low_directions = []
        for s in sorted_h_indexes:
            high_directions.append(directions[s])
            high_direction_names.append(names[s][6:])
        for s in sorted_l_indexes:
            low_directions.append(directions[s])
            low_direction_names.append(names[s][6:])

        return high_direction_names, low_direction_names, high_directions, low_directions
    def __init__(self, cluster_vectors_fn, cluster_labels_fn, movie_names_fn,
                 label_names_fn, cluster_names_fn, filename, training_data,
                 cluster_to_classify, max_depth):

        vectors = dt.importVectors(cluster_vectors_fn)
        labels = dt.importLabels(cluster_labels_fn)
        cluster_names = dt.importString(cluster_names_fn)
        vector_names = dt.importString(movie_names_fn)
        label_names = dt.importString(label_names_fn)
        scores_array = []
        for l in range(len(labels[0])):
            new_labels = [0] * 15000
            for x in range(len(labels)):
                new_labels[x] = labels[x][l]
            x_train = np.asarray(vectors[:training_data])
            x_test = np.asarray(vectors[training_data:])
            y_train = np.asarray(new_labels[:training_data])
            y_test = np.asarray(new_labels[training_data:])

            self.clf = tree.DecisionTreeClassifier(max_depth=max_depth)
            self.clf = self.clf.fit(x_train, y_train)

            y_pred = self.clf.predict(x_test)
            f1 = f1_score(y_test, y_pred, average='binary')
            accuracy = accuracy_score(y_test, y_pred)
            scores = [[label_names[l], "f1", f1, "accuracy", accuracy]]
            print scores[0]
            scores_array.append(scores)

            class_names = [label_names[l], "NOT " + label_names[l]]
            tree.export_graphviz(self.clf,
                                 feature_names=cluster_names,
                                 class_names=class_names,
                                 out_file='Rules/' + label_names[l] +
                                 filename + '.dot',
                                 max_depth=10)
            """
            rewrite_dot_file = dt.importString('Rules/'+filename+label_names[l]+'.dot')
            new_dot_file = []
            for s in rewrite_dot_file:
                new_string = s
                if "->" not in s and "digraph" not in s and "node" not in s and "(...)" not in s and "}" not in s:
                    index = s.index("value")
                    new_string = s[:index] + '"] ;'
                new_dot_file.append(new_string)
            dt.write1dArray(new_dot_file, 'Rules/Cleaned'+filename+label_names[l]+'.dot')
            """
            graph = pydot.graph_from_dot_file('Rules/' + label_names[l] +
                                              filename + '.dot')
            graph.write_png('Rules/Images/' + label_names[l] + filename +
                            ".png")
            self.get_code(self.clf, cluster_names, class_names,
                          label_names[l] + filename)
        dt.write1dArray(scores_array, 'Rules/Scores/' + filename + '.scores')
def getMovieDataFromIMDB(movie_strings):
    movie_data = []
    write_line_file = open("filmdata/Found_Missing.txt", "w")
    failed_movies = []
    names = []
    years = []
    for movie_string in movie_strings:
        names.append(movie_string[:-6])
        years.append(movie_string[-5:].strip())

    for n in range(len(names)):
        found = False
        last_name = ""
        old_num = 0
        with open("filmdata/IMDB_movie_data.txttrimmed") as myFile:
            for num, line in enumerate(myFile, 1):
                num = old_num
                split_line = re.split(r'\t+', line)
                movie_string = split_line[0]
                movie_name = movie_string.split()
                del movie_name[len(movie_name)-1]
                movie_name = " ".join(movie_name)
                if found is True and last_name != movie_name:
                    break
                movie_year = int(re.findall(r'\d+', movie_string.split()[len(movie_string.split())-1])[0])
                if similar(names[n].upper().strip(), movie_name.upper().strip()) and int(years[n]) == int(movie_year):
                    movie_data.append(line)
                    write_line_file.write(line)
                    found = True
                    old_num = num
                last_name = movie_name
        if found is False:
            failed_movies.append(names[n])
            print "FAILED:", names[n]
    print "Total failed", len(failed_movies)
    write_line_file.close()
    dt.write1dArray(failed_movies, "filmdata/failed_movies_final_push.txt")
    return movie_data
    def get_code(self, tree, feature_names, class_names, filename):
        left      = tree.tree_.children_left
        right     = tree.tree_.children_right
        threshold = tree.tree_.threshold
        value = tree.tree_.value

        #print tree.tree_.feature, len(tree.tree_.feature
        # )
        features = []
        for i in tree.tree_.feature:
            if i != -2 or i <= 200:
                features.append(feature_names[i])
        rules_array = []
        def recurse(left, right, threshold, features, node):
            if threshold[node] != -2:
                line = "IF ( " + features[node] + " <= " + str(threshold[node]) + " ) {"
                rules_array.append(line)
                if left[node] != -1:
                    recurse(left, right, threshold, features, left[node])
                line = "} ELSE {"
                rules_array.append(line)
                if right[node] != -1:
                    recurse(left, right, threshold, features, right[node])
                line = "}"
                rules_array.append(line)
            else:
                if value[node][0][0] >= value[node][0][1]:
                    line = "return", class_names[0]
                    rules_array.append(line)
                else:
                    line = "return", class_names[1]
                    rules_array.append(line)
        recurse(left, right, threshold, features, 0)
        dt.write1dArray(rules_array, "Rules/Statements/"+filename+".rules")
        cleaned = jsbeautifier.beautify_file("Rules/Statements/"+filename+".rules")
        file = open("Rules/Statements/"+filename+".rules", "w")
        file.write(cleaned)
        file.close()
def getNextClusterTerm(cluster_terms, terms_to_match, terms_to_ignore, amt):
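    # Return the index of the term in terms_to_match whose strongest similarity to
    # any existing cluster term is lowest, i.e. the candidate least similar to all
    # current clusters; indexes in terms_to_ignore are skipped and amt is unused.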
    min_value = 999999999999999
    min_index = 0
    for t in range(len(terms_to_match)):
        max_value = 0
        if dt.checkIfInArray(terms_to_ignore, t) is False:
            for c in range(len(cluster_terms)):
                s = getSimilarity(cluster_terms[c], terms_to_match[t])
                if s > max_value:
                    max_value = s
            if max_value < min_value:
                min_value = max_value
                min_index = t
    return min_index
def makeConsistentKeywords(file_name, new_file_name):
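    # Normalise "<title> (year)\t<keyword>" lines into tab-separated
    # "name\tyear\tkeyword" with punctuation removed, skipping TV-episode ("{") and
    # unknown-year ("?") entries.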
    new_file = []
    with open(file_name) as my_file:
        for num, line in enumerate(my_file, 1):
            if "{" in line or "?" in line:
                continue
            split_line = re.split(r'\t+', line)
            split_on_bracket = split_line[0].split(" (")
            if split_on_bracket[1].startswith("1") == False and split_on_bracket[1].startswith("2") == False:
                year = split_on_bracket[2][:4]
                name = "".join([split_on_bracket[0], split_on_bracket[1]])
            else:
                year = split_on_bracket[1][:4]
                name = split_on_bracket[0]
            name = name.translate(None, string.punctuation)
            year = year.translate(None, string.punctuation)
            keyword = re.sub(r'\s+', '', split_line[1]).translate(None, string.punctuation)
            name_and_year = "\t".join([name, year])
            new_line = "\t".join([name_and_year, keyword])
            new_file.append(new_line)
            print new_line

    dt.write1dArray(new_file, new_file_name)
def getVectorsIO(ordered_IDs, unique_phrases):
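    # Variant of getVectors that re-reads every token file for each phrase instead
    # of caching the token counts in a dictionary first.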
    vectors = [[0 for x in range(len(ordered_IDs))] for x in range(len(unique_phrases))]
    vectors_maintained = [[0 for x in range(len(ordered_IDs))] for x in range(len(unique_phrases))]

    for p in range(11212, 25842, 1):
        unique_phrases[p] = unique_phrases[p].strip()
        for i in range(len(ordered_IDs)):
            ordered_IDs[i] = ordered_IDs[i].strip()
            if ordered_IDs[i] != "-1":
                file = open("filmdata/Tokens/" + ordered_IDs[i] + ".film", "r")
                lines = file.readlines()[1:]
                for line in lines:
                    split_line = line.split()
                    split_line[1] = split_line[1].strip()
                    if split_line[0] == unique_phrases[p]:
                        vectors_maintained[p][i] = split_line[1]
                        vectors[p][i] = 1
                file.close()
        print unique_phrases[p]
        dt.write1dArray(vectors_maintained[p], "filmdata/classesPhrases/nonbinary/class-" + unique_phrases[p])
        dt.write1dArray(vectors[p], "filmdata/classesPhrases/class-" + unique_phrases[p])

    return vectors_maintained, vectors
    def splitDirections(self, directions_fn, scores_fn, names_fn, low_threshold, high_threshold):
        directions = dt.importVectors(directions_fn)
        scores = dt.importString(scores_fn)
        names = dt.importString(names_fn)

        for s in range(len(scores)):
            scores[s] = float(scores[s].strip())

        high_direction_indexes = []
        high_direction_scores = []
        low_direction_indexes = []
        low_direction_scores = []

        for s in range(len(scores)):
            if scores[s] >= high_threshold:
                high_direction_indexes.append(s)
                high_direction_scores.append(scores[s])
            elif scores[s] >= low_threshold:
                low_direction_indexes.append(s)
                low_direction_scores.append(scores[s])

        sorted_h_indexes = dt.sortByArray(high_direction_indexes, high_direction_scores)
        sorted_l_indexes = dt.sortByArray(low_direction_indexes, low_direction_scores)
        sorted_h_indexes.reverse()
        sorted_l_indexes.reverse()
        high_direction_names = []
        low_direction_names = []
        high_directions = []
        low_directions = []
        for s in sorted_h_indexes:
            high_directions.append(directions[s])
            high_direction_names.append(names[s][6:])
        for s in sorted_l_indexes:
            low_directions.append(directions[s])
            low_direction_names.append(names[s][6:])

        return high_direction_names, low_direction_names, high_directions, low_directions
def getKNeighbors(vector_path="filmdata/films200.mds/films200.mds",
                  class_path="filmdata/classesGenres/class-All",
                  n_neighbors=1,
                  algorithm="kd_tree",
                  leaf_size=30,
                  training_data=10000,
                  name="normal200"):
    movie_vectors = np.asarray(dt.importVectors(vector_path))
    movie_labels = np.asarray(dt.importLabels(class_path))

    x_train, y_train, x_test, y_test = dt.splitData(training_data,
                                                    movie_vectors,
                                                    movie_labels)

    classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
                                                algorithm=algorithm,
                                                leaf_size=leaf_size)
    classifier.fit(x_train, y_train.ravel())
    y_pred = classifier.predict(x_test)

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    dt.write1dArray([f1, accuracy], "KNNScores/" + name + ".score")
    print "F1 " + str(f1), "Accuracy", accuracy
    def __init__(self, name_distinction="", class_names=None, vector_path=None, class_path=None, class_by_class=True, input_size=200,
                 training_data=10000, amount_of_scores=400,  low_kappa=0.1, high_kappa=0.5, rankSVM=False, amount_to_cut_at=100, largest_cut=21470000):
        print "getting movie data"

        movie_vectors = dt.importVectors(vector_path)
        movie_labels = dt.importLabels(class_path)
        print "getting file names"

        file_names = dt.getFns(class_path[:-10])

        print len(movie_labels), len(movie_labels[0])

        print "getting training and test data"

        x_train = np.asarray(movie_vectors[:training_data])
        x_test = np.asarray(movie_vectors[training_data:])

        movie_labels = zip(*movie_labels)
        file_names, movie_labels = self.getSampledData(file_names, movie_labels, amount_to_cut_at, largest_cut)
        movie_labels = zip(*movie_labels)

        y_train = movie_labels[:training_data]
        y_test = movie_labels[training_data:]
        y_train = np.asarray(zip(*y_train))
        y_test = np.asarray(zip(*y_test))



        print len(y_train), len(y_test), training_data

        print "getting kappa scores"

        kappa_scores, directions = self.runAllSVMs(y_test, y_train, x_train, x_test, file_names)

        dt.write1dArray(kappa_scores, "SVMResults/"+name_distinction+".scores")
        dt.write1dArray(file_names, "SVMResults/"+name_distinction+".names")

        dt.write2dArray(directions, "directions/"+name_distinction+".directions")
    def __init__(self, low_threshold, high_threshold, filename):

        hdn, ldn, hd, ld = self.splitDirections(
            "Directions/" + filename + ".directions",
            "SVMResults/" + filename + ".scores",
            "SVMResults/" + filename + ".names", low_threshold, high_threshold)

        least_similar_cluster_names, cluster_name_dict, least_similar_clusters = self.createTermClusters(
            hd, ld, hdn, ldn)

        dt.write1dArray(
            least_similar_cluster_names,
            "Clusters/" + filename + "LeastSimilarHIGH" + str(high_threshold) +
            "," + str(low_threshold) + ".names")
        dt.write2dArray(
            least_similar_clusters,
            "Clusters/" + filename + "LeastSimilarHIGH" + str(high_threshold) +
            "," + str(low_threshold) + ".clusters")
        dt.writeArrayDict(
            cluster_name_dict, "Clusters/" + filename + "MostSimilarCLUSTER" +
            str(high_threshold) + "," + str(low_threshold) + ".names")
    def __init__(self,
                 epochs=1,
                 learn_rate=0.01,
                 loss="mse",
                 batch_size=1,
                 decay=1e-06,
                 hidden_activation="tanh",
                 layer_init="glorot_uniform",
                 output_activation="tanh",
                 hidden_layer_size=100,
                 file_name="unspecified_filename",
                 vector_path=None,
                 reg=0,
                 optimizer_name="rmsprop",
                 class_names=None,
                 noise=0,
                 output_weights=None):

        # Initialize the model

        self.model = Sequential()

        # Import the numpy vectors
        try:
            movie_vectors = np.asarray(np.load(vector_path))
        except OSError:
            # If it fails, assume that it's in a standard format for vectors and then save it in numpy format
            movie_vectors = dt.importVectors(vector_path)
            movie_vectors = np.asarray(movie_vectors)
            np.save(file_name, movie_vectors)

        # Set the input and the output to be the same size, as this is an auto-encoder

        input_size = len(movie_vectors[0])
        output_size = len(movie_vectors[0])

        if noise > 0:  # If using a noisy autoencoder, add GaussianNoise layers to the start of the encoder
            self.model.add(GaussianNoise(noise, input_shape=(input_size, )))
            self.model.add(
                Dense(output_dim=hidden_layer_size,
                      input_dim=input_size,
                      init=layer_init,
                      activation=hidden_activation,
                      W_regularizer=l2(reg)))
        else:
            # Otherwise just add the hidden layer
            self.model.add(
                Dense(output_dim=hidden_layer_size,
                      input_dim=input_size,
                      init=layer_init,
                      activation=hidden_activation,
                      W_regularizer=l2(reg)))

        # If using custom weights on the hidden layer to the output layer, apply those custom weights. Otherwise just add output layer.
        if output_weights is None:
            self.model.add(
                Dense(output_dim=output_size,
                      init=layer_init,
                      activation=output_activation))
        else:
            self.model.add(
                Dense(output_dim=len(output_weights[0]),
                      init=layer_init,
                      activation=output_activation,
                      weights=output_weights))

        # Compile the model and fit it to the data
        if optimizer_name == "sgd":
            optimizer = SGD(lr=learn_rate, decay=decay)
        elif optimizer_name == "rmsprop":
            optimizer = RMSprop(lr=learn_rate)
        self.model.compile(loss=loss, optimizer=optimizer)
        self.model.fit(movie_vectors,
                       movie_vectors,
                       nb_epoch=epochs,
                       batch_size=batch_size,
                       verbose=1)

        # Create a truncated model that has no output layer that has the same weights as the previous model and use it to obtain the hidden layer representation
        truncated_model = Sequential()
        total_file_name = "newdata/spaces/" + file_name + ".mds"
        truncated_model.add(GaussianNoise(noise, input_shape=(input_size, )))
        truncated_model.add(
            Dense(output_dim=hidden_layer_size,
                  input_dim=input_size,
                  init=layer_init,
                  activation=hidden_activation,
                  W_regularizer=l2(reg)))
        truncated_model.compile(loss=loss, optimizer=optimizer)
        self.end_space = truncated_model.predict(movie_vectors)

        np.save(total_file_name, self.end_space)
    def createTermClusters(self, hv_directions, lv_directions, hv_names,
                           lv_names):
        least_similar_clusters = []
        least_similar_cluster_ids = []
        least_similar_cluster_names = []
        directions_to_add = []
        names_to_add = []

        print "Overall amount of HV directions: ", len(hv_directions)

        # Create high-valued clusters
        amt_of_clusters = len(hv_directions[0]) * 2
        for i in range(len(hv_directions)):
            if i == 0:
                least_similar_cluster_ids.append(i)
                least_similar_clusters.append(hv_directions[i])
                least_similar_cluster_names.append(hv_names[i])
                print "Least Similar Term", hv_names[i]
            elif i >= amt_of_clusters:
                directions_to_add.append(hv_directions[i])
                names_to_add.append(hv_names[i])
                print "Added", hv_names[
                    i], "To the remaining directions to add"
            else:
                ti = getNextClusterTerm(least_similar_clusters, hv_directions,
                                        least_similar_cluster_ids, 1)
                least_similar_cluster_ids.append(ti)
                least_similar_clusters.append(hv_directions[ti])
                least_similar_cluster_names.append(hv_names[ti])
                print str(i + 1) + "/" + str(
                    amt_of_clusters), "Least Similar Term", hv_names[ti]

        # Add remaining high value directions to the low value direction list
        directions_to_add.reverse()
        names_to_add.reverse()
        for i in range(len(directions_to_add)):
            lv_directions.insert(0, directions_to_add[i])
            lv_names.insert(0, names_to_add[i])

        # Initialize dictionaries for printing / visualizing
        cluster_name_dict = OrderedDict()
        for c in least_similar_cluster_names:
            cluster_name_dict[c] = []

        # For every low value direction, find the high value direction its most similar to and append it to the directions
        every_cluster_direction = []
        for i in least_similar_clusters:
            every_cluster_direction.append([i])

        # Reversing so that the top names and directions are first
        lv_names.reverse()
        lv_directions.reverse()

        # Finding the most similar directions to each cluster_centre
        # Creating a dictionary of {cluster_centre: [cluster_direction(1), ..., cluster_direction(n)]} pairs
        for d in range(len(lv_directions)):
            i = getXMostSimilarIndex(lv_directions[d], least_similar_clusters,
                                     [], 1)[0]
            every_cluster_direction[i].append(lv_directions[d])
            print str(d + 1) + "/" + str(
                len(lv_directions)), "Most Similar to", lv_names[
                    d], "Is", least_similar_cluster_names[i]
            cluster_name_dict[least_similar_cluster_names[i]].append(
                lv_names[d])

        # Mean of all directions = cluster direction
        cluster_directions = []
        for l in range(len(least_similar_clusters)):
            cluster_directions.append(
                dt.mean_of_array(every_cluster_direction[l]))
        """
        # Get the 10 most similar and least similar directions to save later
        most_similar = []
        least_similar = []
        most_similar_indexes = []
        least_similar_indexes = []
        indexes_to_find = []
        for k in sorted(cluster_amt_dict, key=cluster_amt_dict.get, reverse=True):
            name_to_get_most_similar = k
            index_to_find = dt.getIndexInArray(hv_names, name_to_get_most_similar)
            amt = 10
            indexes_to_find.append(index_to_find)
            most_similar_index = getXMostSimilarIndex(hv_directions[index_to_find], hv_directions, [index_to_find], amt)
            least_similar_index = getXLeastSimilarIndex(hv_directions[index_to_find], hv_directions, [index_to_find], amt)
            most_similar_indexes.append(most_similar_index)
            least_similar_indexes.append(least_similar_index)
        for m in range(len(most_similar_indexes)):
            line_to_append = []
            for v in range(len(most_similar_indexes[m])):
                line_to_append.append(hv_names[most_similar_indexes[m][v]])
            most_similar.append([cluster_dict_names[m][0], line_to_append])
        for l in range(len(least_similar_indexes)):
            line_to_append = []
            for v in range(len(least_similar_indexes[l])):
                line_to_append.append(hv_names[least_similar_indexes[l][v]])
            least_similar.append([cluster_dict_names[l][0], line_to_append])
        """

        return least_similar_cluster_names, cluster_name_dict, cluster_directions
    def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn,
                 fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)

        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        scores = []
        pvalues = []
        scores_kendall = []
        pvalues_kendall = []
        counter = 0
        averages = []
        with open(direction_fn) as f:
            for line in f:
                if indexes_to_get != []:
                    for i in indexes_to_get:
                        if i == counter:
                            total = 0
                            amt = 0
                            direction = line.split()
                            for d in range(len(direction)):
                                direction[d] = float(direction[d])
                            new_direction = []
                            new_ppmi = []
                            direction_rank = np.argsort(direction)
                            ppmi_rank = np.argsort(ppmi[counter])
                            for d in range(len(ppmi[counter])):
                                if ppmi[counter][d] != 0:
                                    total += ppmi[counter][d]
                                    amt += 1
                                    new_direction.append(direction_rank[d])
                                    new_ppmi.append(ppmi_rank[d])
                            average = total / amt
                            rho, pvalue = spearmanr(new_ppmi, new_direction)
                            rhok, pvaluek = kendalltau(new_ppmi, new_direction)
                            scores.append(rho)
                            pvalues.append(pvalue)
                            scores_kendall.append(rhok)
                            pvalues_kendall.append(pvaluek)
                            averages.append(average)
                            print phrases[counter] + ":", rho, pvalue, average
                else:
                    direction = line.split()
                    for d in range(len(direction)):
                        direction[d] = float(direction[d])
                    direction_rank = np.argsort(direction)
                    ppmi_rank = np.argsort(ppmi[counter])
                    rho, pvalue = spearmanr(direction_rank, ppmi_rank)
                    scores.append(rho)
                    pvalues.append(pvalue)
                    print phrases[counter] + ":", rho, pvalue
                counter += 1
        dt.write1dArray(scores, "RuleType/s" + fn + ".score")
        dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
        dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
        dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
        dt.write1dArray(phrases, "RuleType/" + fn + ".names")
        dt.write1dArray(averages, "RuleType/" + fn + ".averages")
    def __init__(self, direction_fn, ppmi_fn, phrases_fn, phrases_to_check_fn,
                 fn):
        ppmi = dt.importLabels(ppmi_fn)
        ppmi = np.asarray(ppmi)
        phrases = dt.importString(phrases_fn)

        indexes_to_get = []
        if phrases_to_check_fn != "":
            phrases_to_check = dt.importString(phrases_to_check_fn)
            for pc in range(len(phrases_to_check)):
                for p in range(len(phrases)):
                    if phrases_to_check[pc] == phrases[p][6:]:
                        indexes_to_get.append(p)
        indexes_to_get.sort()
        ppmi = ppmi.transpose()
        print len(ppmi), len(ppmi[0])
        scores = []
        pvalues = []
        scores_kendall = []
        pvalues_kendall = []
        agini = []
        agini1 = []
        angini1 = []
        angini = []
        amap = []
        andcg = []
        counter = 0
        averages = []
        with open(direction_fn) as f:
            for line in f:
                exists = True
                if phrases_to_check_fn != "":
                    exists = False
                    for i in indexes_to_get:
                        if i == counter:
                            exists = True
                            break
                if exists:
                    total = 0
                    amt = 0
                    direction = line.split()
                    for d in range(len(direction)):
                        direction[d] = float(direction[d])
                    new_direction = []
                    new_ppmi = []
                    direction_rank = np.argsort(direction)
                    ppmi_rank = np.argsort(ppmi[counter])
                    for d in range(len(ppmi[counter])):
                        if ppmi[counter][d] != 0:
                            total += ppmi[counter][d]
                            amt += 1
                            new_direction.append(direction_rank[d])
                            new_ppmi.append(ppmi_rank[d])
                    average = total / amt

                    min_max_scaler = preprocessing.MinMaxScaler()
                    normalized_ppmi = min_max_scaler.fit_transform(
                        ppmi[counter])
                    normalized_dir = min_max_scaler.fit_transform(direction)

                    ginis = gini(normalized_ppmi, normalized_dir)

                    ranked_ppmi = dt.sortByArray(new_ppmi, new_direction)
                    nr_ppmi = min_max_scaler.fit_transform(ranked_ppmi)
                    ndcgs = ndcg_at_k(nr_ppmi, len(nr_ppmi))

                    #binarizer = preprocessing.Binarizer()
                    #binary_ppmi = binarizer.transform(normalized_ppmi)
                    #normalized_dir = np.ndarray.tolist(normalized_dir)
                    map = 0  #average_precision_score(normalized_ppmi, normalized_dir)

                    rho, pvalue = spearmanr(new_ppmi, new_direction)
                    rhok, pvaluek = kendalltau(new_ppmi, new_direction)

                    scores.append(rho)
                    pvalues.append(pvalue)
                    scores_kendall.append(rhok)
                    pvalues_kendall.append(pvaluek)
                    andcg.append(ndcgs)
                    agini.append(ginis)
                    amap.append(map)
                    averages.append(average)
                    print phrases[counter] + ":", map, ginis

                counter += 1
        dt.write1dArray(scores, "RuleType/s" + fn + ".score")
        dt.write1dArray(pvalues, "RuleType/s" + fn + ".pvalue")
        dt.write1dArray(scores_kendall, "RuleType/k" + fn + ".score")
        dt.write1dArray(pvalues_kendall, "RuleType/k" + fn + ".pvalue")
        dt.write1dArray(phrases, "RuleType/" + fn + ".names")
        dt.write1dArray(averages, "RuleType/" + fn + ".averages")
        dt.write1dArray(agini, "RuleType/gn" + fn + ".score")
        dt.write1dArray(andcg, "RuleType/ndcg" + fn + ".score")
        dt.write1dArray(amap, "RuleType/map" + fn + ".score")