import os
import time

import numpy as np
from sklearn import svm
from sklearn.metrics import cohen_kappa_score
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

# The array I/O helpers (import1dArray, import2dArray, write1dArray,
# write2dArray, fileExists, getIndexOfCommonElements) are assumed to live in
# this project's own util.io module; adjust the import to match your layout.
from util.io import (import1dArray, import2dArray, write1dArray, write2dArray,
                     fileExists, getIndexOfCommonElements)


def getScoreDifferences(name_word_file1, name_score_file1, name_word_file2,
                        name_score_file2, name, data_type):

    scores1 = import1dArray(name_score_file1, "f")
    scores2 = import1dArray(name_score_file2, "f")

    words1 = import1dArray(name_word_file1, "s")
    words2 = import1dArray(name_word_file2, "s")

    differences_list = []
    if len(words1) > len(words2):
        same_element_index = getIndexOfCommonElements(words2, words1)
        scores1 = np.asarray(scores1)[same_element_index]
        words1 = np.asarray(words1)[same_element_index]
    else:
        same_element_index = getIndexOfCommonElements(words1, words2)
        scores2 = np.asarray(scores2)[same_element_index]
        words2 = np.asarray(words2)[same_element_index]

    for i in range(len(scores1)):
        differences_list.append(scores1[i] - scores2[i])
    most_different_words = [
        x for (y, x) in sorted(zip(differences_list, words1))
    ]
    differences_list = sorted(differences_list)
    write1dArray(
        most_different_words, "../data/" + data_type +
        "/SVM/difference/most_different_words_" + name + ".txt")
    write1dArray(
        differences_list, "../data/" + data_type +
        "/SVM/difference/most_different_values_" + name + ".txt")
def getDifference(array1, array2):
    # Print every line of the first file that is missing from the second.
    file1 = import1dArray(array1)
    file2 = set(import1dArray(array2))
    for line1 in file1:
        if line1 not in file2:
            print(line1)
def writeClassAll(class_fn, full_phrases_fn, phrases_used_fn, file_name):
    # Keep only the class-matrix rows whose phrase appears in phrases_used.
    full_phrases = import1dArray(full_phrases_fn)
    # Note: depending on the file, class_fn may need transposing first, e.g.
    # ppmi = np.asarray(import2dArray(class_fn)).transpose()
    ppmi = import2dArray(class_fn)
    phrases_used = set(import1dArray(phrases_used_fn))
    new_ppmi = []
    for p in range(len(full_phrases)):
        if full_phrases[p] in phrases_used:
            new_ppmi.append(ppmi[p])
    write2dArray(new_ppmi, file_name)
def getScores(names, full_scores, full_names, file_name, data_type):
    # Pull the score for each requested name out of the full score list,
    # keeping the first match per name.
    full_scores = import1dArray(full_scores)
    full_names = import1dArray(full_names)
    names = import1dArray(names)
    final_scores = []
    for j in range(len(names)):
        for i in range(len(full_names)):
            if names[j] == full_names[i]:
                final_scores.append(full_scores[i])
                break
    output_fn = "../data/" + data_type + "/bow/scores/" + file_name + ".txt"
    write1dArray(final_scores, output_fn)
    return output_fn
def compileSVMResults(file_name, chunk_amt, data_type):
    # Merge per-chunk SVM direction/kappa files into single compiled files.
    if not fileExists("../data/" + data_type + "/svm/directions/" + file_name +
                      ".txt"):
        print("Compiling SVM results")
        directions = []
        for c in range(chunk_amt):
            directions.append("../data/" + data_type + "/svm/directions/" +
                              file_name + " CID" + str(c) + " CAMT" +
                              str(chunk_amt) + ".txt")
        kappa = []
        for c in range(chunk_amt):
            kappa.append("../data/" + data_type + "/svm/kappa/" + file_name +
                         " CID" + str(c) + " CAMT" + str(chunk_amt) + ".txt")
        # Poll until every chunk has been written by the worker processes,
        # then wait a little longer so the last file is fully flushed.
        for f in directions:
            while not fileExists(f):
                time.sleep(10)
        time.sleep(10)
        di = []
        for d in directions:
            di.extend(import2dArray(d))
        ka = []
        for k in kappa:
            ka.extend(import1dArray(k))
        write2dArray(
            di,
            "../data/" + data_type + "/svm/directions/" + file_name + ".txt")
        write1dArray(
            ka, "../data/" + data_type + "/svm/kappa/" + file_name + ".txt")
    else:
        print("Skipping compile")
def convertToPPMI(freq_arrays_fn, term_names_fn):
    # Convert a term x entity frequency matrix to PPMI:
    #   ppmi(t, e) = max(0, log(p(t, e) / (p(t) * p(e))))
    freq_arrays = np.asarray(import2dArray(freq_arrays_fn, "i"))
    term_names = import1dArray(term_names_fn)
    ppmi_arrays = []
    overall = float(np.sum(freq_arrays))
    # Cache the marginal probability of each entity (column) as it is needed.
    entity_array = [0.0] * freq_arrays.shape[1]
    # For each term
    for t in range(len(freq_arrays)):
        ppmi_array = []
        term = sum(freq_arrays[t, :])
        term_p = term / overall
        for e in range(len(freq_arrays[t])):
            ppmi = 0.0
            freq = freq_arrays[t][e]
            if freq != 0:
                freq_p = freq / overall
                if entity_array[e] == 0:
                    entity = sum(freq_arrays[:, e])
                    entity_array[e] = entity / overall
                proba = freq_p / (entity_array[e] * term_p)
                ppmi = np.amax([0.0, np.log(proba)])
            ppmi_array.append(ppmi)
        ppmi_arrays.append(ppmi_array)
        write1dArray(ppmi_array,
                     "../data/movies/bow/ppmi/class-" + term_names[t])
    write2dArray(ppmi_arrays, "../data/movies/bow/ppmi/class-all")
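# Toy sanity check of the PPMI formula above (hypothetical numbers, not from
# the dataset): for freq = [[2, 0], [1, 1]], overall = 4, p(t0) = 2/4,
# p(e0) = 3/4 and p(t0, e0) = 2/4, so
#   ppmi(t0, e0) = max(0, log(0.5 / (0.5 * 0.75))) = log(4/3) ~ 0.288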
def getWordVectors(vector_save_fn, words_fn, wvn, wv_amt, svm_dir_fn=None):
    # Look up a GloVe vector for each word, optionally concatenating the
    # word's SVM direction onto it; unknown words get zero vectors.
    if not os.path.exists(vector_save_fn):
        glove_file = datapath('/home/tom/Downloads/glove.6B/glove.6B.' +
                              str(wv_amt) + 'd.txt')
        tmp_file = get_tmpfile(
            "/home/tom/Downloads/glove.6B/test_word2vec.txt")
        glove2word2vec(glove_file, tmp_file)
        # Only load the SVM directions when a filename was actually given.
        svm_dir = import2dArray(svm_dir_fn) if svm_dir_fn is not None else None
        all_vectors = KeyedVectors.load_word2vec_format(tmp_file)
        vectors = []

        words = import1dArray(words_fn)
        for w in range(len(words)):
            try:
                if svm_dir_fn is None:
                    vectors.append(all_vectors.get_vector(words[w]))
                else:
                    vectors.append(
                        np.concatenate(
                            [all_vectors.get_vector(words[w]), svm_dir[w]]))
            except KeyError:
                if svm_dir_fn is None:
                    vectors.append(np.zeros(wv_amt))
                else:
                    vectors.append(np.zeros(wv_amt + len(svm_dir[0])))

        write2dArray(vectors, vector_save_fn)

        write1dArray(words, wvn)
    else:
        print("Already got word vectors", vector_save_fn)
def countClassFrequences(data_type, class_name):
    # Count how many entities have a nonzero label for each class.
    class_all = import2dArray("../data/" + data_type + "/classify/" +
                              class_name + "/class-all")
    class_names = import1dArray("../data/" + data_type + "/classify/" +
                                class_name + "/names.txt")
    counts = []
    class_all = np.asarray(class_all).transpose()
    for i in range(len(class_all)):
        count = len(np.nonzero(class_all[i])[0])
        print(class_names[i], count)
        counts.append(count)
    return counts
def obtainKappaOnClusteredDirection(names, ranks):
    # For each discrete rank, obtain the Kappa score compared to the word occ
    kappas = np.empty(len(names))
    for n in range(len(names)):
        clf = svm.LinearSVC()
        ppmi = np.asarray(
            import1dArray("../data/movies/bow/binary/phrases/" + names[n],
                          "i"))
        clf.fit(ranks, ppmi)
        y_pred = clf.predict(ranks)
        score = cohen_kappa_score(ppmi, y_pred)
        kappas[n] = score
    return kappas
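# Shape assumptions for obtainKappaOnClusteredDirection (inferred, not stated
# in the source): "ranks" is an (n_entities, n_directions) matrix used as the
# classifier input, and each names[n] file holds the binary occurrence vector
# for one word, which the LinearSVC's predictions are scored against with
# Cohen's kappa.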
def getNonZero(class_names_fn, file_name):
    # Print each class name alongside its number of nonzero entries.
    class_names = import1dArray(class_names_fn, "s")
    class_all = np.asarray(import2dArray(file_name)).transpose()
    for c in range(len(class_all)):
        print(class_names[c], np.count_nonzero(class_all[c]))
def writeIndividualClasses(overall_class_fn, names_fn, output_filename):
    overall_class = import2dArray(overall_class_fn, "f")
    names = import1dArray(names_fn)
    for n in range(len(names)):
        write1dArray(overall_class[n], output_filename + "class-" + names[n])
        print(names[n])
def remove_indexes(indexes, array_fn):
    # Delete the given indexes from a 1d array file and rewrite it in place.
    array = np.asarray(import1dArray(array_fn))
    array = np.delete(array, indexes, axis=0)
    write1dArray(array, array_fn)
    print("wrote", array_fn)


"""

"""
if __name__ == '__main__':
    """
    #countClassFrequences("reuters", "topics")
    class_fn = "../data/movies/classify/keywords/class-all"
    class_name_fn = "../data/movies/classify/keywords/names.txt"
    classes = import2dArray(class_fn)
    class_names = import1dArray(class_name_fn)
    classes, class_names = removeInfrequent(classes, class_names)
    """
    words = import1dArray("../data/placetypes/bow/names/5-1-all.txt", "s")
    word_dict = {}
    for i in range(len(words)):
        word_dict[i] = words[i]

    averageWordVectorsFreq(
        word_dict,
        "../data/placetypes/bow/frequency/phrases/class-all-5-1-all", 200,
        "placetypes")
    averageWordVectorsFreq(
        word_dict,
        "../data/placetypes/bow/frequency/phrases/class-all-5-1-all", 100,
        "placetypes")
    averageWordVectorsFreq(
        word_dict,
        "../data/placetypes/bow/frequency/phrases/class-all-5-1-all", 50,
        "placetypes")

# Example 14
from util import io


def parameter_list_to_dict_str(parameter_list_string):
    # Turn "name value" parameter lines into the source lines of a Python
    # dict literal, skipping comment lines that start with "#".
    dict_str = ["param_dict = {"]
    for line in parameter_list_string:
        if line[:1] == "#":
            continue
        split = line.split()
        if len(split) == 0:
            continue
        dict_str.append("\t'" + split[0] + "': " + split[0] + ",")
    dict_str.append("}")
    return dict_str
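# Worked example (the parameter names here are made up):
# parameter_list_to_dict_str(["# optimiser", "learning_rate 0.1", "epochs 50"])
# returns:
#   ["param_dict = {",
#    "\t'learning_rate': learning_rate,",
#    "\t'epochs': epochs,",
#    "}"]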


if __name__ == '__main__':
    parameter_list_string = io.import1dArray(
        "../../data/parameter_list_string.txt")
    parameter_dict = parameter_list_to_dict_str(parameter_list_string)
    io.write1dArray(parameter_dict, "../../data/parameter_dict.txt")