Example #1
def selectCutOffByWordVector(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    wv, wvn = dt.getWordVectors()
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c])-1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # NOTE: this selection loop is a placeholder and never fills cutoff_words
        for cl in range(len(clusters)):
            for wa in range(len(clusters[cl])):
                for w in range(len(clusters[cl][wa])):
                    pass


    dt.write2dArray(cutoff_words, "../data/movies/rules/cutoff/"+file_name+"WVN.txt")
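The placeholder loop above presumably picks, for each cluster, the member word whose embedding is most similar to the rest. A minimal sketch of such a medoid-style pick with plain numpy, assuming a hypothetical word_vectors dict mapping each word to a vector (not necessarily the format returned by dt.getWordVectors):

import numpy as np

def pick_representative_word(cluster_words, word_vectors):
    # Score each word by its average cosine similarity to the other cluster members
    # and return the best-scoring one.
    vecs = np.asarray([word_vectors[w] for w in cluster_words], dtype="float64")
    vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
    sims = vecs @ vecs.T
    avg_sim = (sims.sum(axis=1) - 1.0) / max(len(cluster_words) - 1, 1)
    return cluster_words[int(np.argmax(avg_sim))]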
Example #2
def getAllPhraseRankings(directions_fn=None, vectors_fn=None, property_names_fn=None, vector_names_fn=None, fn="no filename",
                         percentage_increment=1, scores_fn=None, top_amt=0, discrete=False, data_type="movies",
                         rewrite_files=False):
    rankings_fn_all = "../data/" + data_type + "/rank/numeric/" + fn + "ALL.txt"

    all_fns = [rankings_fn_all]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getAllPhraseRankings")
        return
    else:
        print("Running task", "getAllPhraseRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    property_names = dt.import1dArray(property_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    if top_amt != 0:
        scores = dt.import1dArray(scores_fn, "f")
        directions = dt.sortByReverseArray(directions, scores)[:top_amt]
        property_names = dt.sortByReverseArray(property_names, scores)[:top_amt]

    rankings = getRankings(directions, vectors, property_names, vector_names)
    if discrete:
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    for a in range(len(rankings)):
        rankings[a] = np.around(rankings[a], decimals=4)
    #dt.write1dArray(property_names, "../data/movies/bow/names/top5kof17k.txt")

    dt.write2dArray(rankings, rankings_fn_all)
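getRankings is not shown in this snippet; a common choice is to score every entity along every direction with a plain dot product. A minimal sketch under that assumption (the real helper also receives the name lists):

import numpy as np

def get_rankings_sketch(directions, vectors):
    # One row per direction, one column per entity: the entity's score along that direction.
    return np.asarray(directions, dtype="float64") @ np.asarray(vectors, dtype="float64").T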
Example #3
def fixCutoffFormatting(cutoff_fn, file_name):
    cutoff = dt.import1dArray(cutoff_fn)
    for c in range(len(cutoff)):
        cutoff[c] = cutoff[c].split()
        for i in range(len(cutoff[c])):
            cutoff[c][i] = int(dt.stripPunctuation(cutoff[c][i]))
    dt.write2dArray(cutoff, "../data/movies/rules/cutoff/" +file_name+ ".txt")
def saveClusters(directions_fn,
                 scores_fn,
                 names_fn,
                 filename,
                 amt_of_dirs,
                 data_type,
                 cluster_amt,
                 rewrite_files=False,
                 algorithm="meanshift_k"):

    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"

    all_fns = [dict_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)

    p_dir = dt.import2dArray(directions_fn)
    p_names = dt.import1dArray(names_fn, "s")
    p_scores = dt.import1dArray(scores_fn, "f")

    ids = np.argsort(p_scores)

    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]
    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)
    unique, counts = np.unique(labels, return_counts=True)

    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))

    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")

    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
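saveClusters calls meanShift and kMeans helpers that are defined elsewhere; a minimal sketch of what they might wrap, using scikit-learn (an assumption, not the repository's implementation):

from sklearn.cluster import KMeans, MeanShift

def meanShift(directions):
    # Cluster the direction vectors without fixing the number of clusters in advance.
    return MeanShift().fit_predict(directions)

def kMeans(directions, cluster_amt):
    # Cluster the direction vectors into a fixed number of clusters.
    return KMeans(n_clusters=cluster_amt, random_state=0).fit_predict(directions)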
def main(data_type, clf, highest_amt, lowest_amt, depth, rewrite_files):

    min = lowest_amt
    max = highest_amt
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dmround"
    mds_fn = "../data/"+data_type+"/mds/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "d" + str(depth)
    svd_fn = "../data/"+data_type+"/svd/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    pca_fn = "../data/"+data_type+"/pca/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "round"

    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf
    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]):
        print("all files exist")
        exit()
    if dt.fileExists(dm_fn) is False:
        newsgroups_train = fetch_20newsgroups(subset='train', shuffle=False)
        newsgroups_test = fetch_20newsgroups(subset='test', shuffle=False)

        vectors = np.concatenate((newsgroups_train.data, newsgroups_test.data),
                                 axis=0)
        newsgroups_test = None
        newsgroups_train = None
        # Get sparse tf rep
        tf_vectorizer = CountVectorizer(max_df=highest_amt,
                                        min_df=lowest_amt,
                                        stop_words='english')
        print("completed vectorizer")
        tf = tf_vectorizer.fit_transform(vectors)
        vectors = None
        # Get sparse PPMI rep from sparse tf rep
        sparse_ppmi = convertPPMISparse(tf)
        print("done ppmi sparse")
        # Get sparse Dsim matrix from sparse PPMI rep
        dm = getDissimilarityMatrixSparse(sparse_ppmi)
        dt.write2dArray(dm, dm_fn)
    else:
        dm = dt.import2dArray(dm_fn)
    print("starting mds")
    # Use as input to mds
    mds = createMDS(dm, depth)
    # save MDS
    dt.write2dArray(mds, mds_fn)
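createMDS is assumed to run metric multidimensional scaling on the precomputed dissimilarity matrix, with depth as the target dimensionality. A sketch with scikit-learn (the repository's implementation may use a different MDS routine):

from sklearn.manifold import MDS

def createMDS_sketch(dm, depth):
    # Metric MDS on a precomputed dissimilarity matrix; returns one depth-dimensional
    # vector per entity.
    mds = MDS(n_components=depth, dissimilarity="precomputed", random_state=0)
    return mds.fit_transform(dm)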
Example #6
def selectCutOffByExplanation(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    dupe_cutoff = copy.deepcopy(cutoff)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    explanations = []
    explanation_cutoffs = []
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c])-1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # For each cluster
        explained_cutoff = []
        explained_cutoff_value = []
        for cl in range(len(clusters)):
            if len(clusters[cl]) == 0:
                print ("Skipped")
                continue
            cluster_explanation, winning_index = webapi.getHighestScore(clusters[cl])
            explained_cutoff.append(cluster_explanation+",")

            dict_index = 0
            for h in range(len(cluster_dict_arrays[cl])):
                if cluster_dict_arrays[cl][h] == clusters[cl][winning_index]:
                    dict_index = h
            explained_cutoff_value.append(dupe_cutoff[cl][dict_index])
        explanations.append(explained_cutoff)
        explanation_cutoffs.append(explained_cutoff_value)
    dt.write2dArray(explanations, "../data/movies/rules/final_names/"+file_name+"WVN.txt")
    dt.write2dArray(explanation_cutoffs, "../data/movies/rules/final_cutoff/"+file_name+".txt")
Example #7
def makePPMI(names_fn, scores_fn, amt, data_type, ppmi_fn, name_fn):
    scores = np.asarray(dt.import1dArray(scores_fn, "f"))
    names = np.asarray(dt.import1dArray(names_fn))

    names = names[np.flipud(np.argsort(scores))][:amt]
    if dt.allFnsAlreadyExist([ppmi_fn, name_fn]) is False:
        ppmi_file = []
        for name in names:
            ppmi_file.append(
                dt.import1dArray("../data/" + data_type + "/bow/ppmi/" +
                                 "class-" + name + "-100-10-all"))
        dt.write2dArray(ppmi_file, ppmi_fn)
        dt.write1dArray(names, name_fn)
    else:
        print("already_made PPMI of this size")
def getDissimilarityMatrixSparse(tf):
    tflen = tf.shape[0]
    dm = np.empty([tflen, tflen], dtype="float64")
    pithing = 2 / pi
    norms = np.empty(tflen, dtype="float64")

    #Calculate norms
    for ei in range(tflen):
        norms[ei] = spl.norm(tf[ei])
        print("norm", ei)

    dot_product = np.zeros([tflen, tflen], dtype="float64")

    use_old_dp = True
    if use_old_dp:
        dot_product = dt.import2dArray("dotproduct.temp")
    else:
        #Calculate dot products
        for ei in range(tflen):
            for ej in range(tflen):
                if dot_product[ej][ei] != 0:
                    dot_product[ei][ej] = dot_product[ej][ei]
                    continue
                dot_product[ei][ej] = tf[ei].dot(tf[ej].T)[0, 0]
            print("dp", ei)
        dt.write2dArray(dot_product, "dotproduct.temp")

    norm_multiplied = np.empty([tflen, tflen], dtype="float64")

    # Calculate products of norms
    for ei in range(tflen):
        for ej in range(tflen):
            norm_multiplied[ei][ej] = norms[ei] * norms[ej]
        print("dp", ei)

    norm_multiplied = dt.shortenFloatsNoFn(norm_multiplied)
    dot_product = dt.shortenFloatsNoFn(dot_product)

    #Get angular differences
    for ei in range(tflen):
        for ej in range(tflen):
            ang = pithing * np.arccos(
                dot_product[ei][ej] / norm_multiplied[ei][ej])
            dm[ei][ej] = ang
        print(ei)
    return dm
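The function above builds the same normalised angular dissimilarity entry by entry. For reference, the whole matrix can be computed in one pass from cosine similarities; a sketch using scikit-learn, which accepts scipy sparse input:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def angular_dissimilarity_sketch(tf):
    # d(i, j) = (2 / pi) * arccos(cos_sim(i, j)), clipped so arccos never sees
    # values slightly outside [-1, 1] due to floating-point error.
    cos = np.clip(cosine_similarity(tf), -1.0, 1.0)
    return (2.0 / np.pi) * np.arccos(cos)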
Example #9
def getCutOff(cluster_dict_fn,  rankings_fn, file_name):

    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    rankings = dt.importDiscreteVectors(rankings_fn)

    for r in rankings:
        for a in range(len(r)):
            r[a] = int(r[a][:-1])

    cutoff_clusters = []
    counter = 0
    for key, value in cluster_dict.items():
        value.insert(0, key)
        cutoffs = []
        for v in value:
            max_score = 0
            cutoff = 0
            for i in range(1, 101):
                y_pred = []
                for ve in range(len(rankings[counter])):
                    rank = rankings[counter][ve]
                    if rank > i:
                        y_pred.append(0)
                    else:
                        y_pred.append(1)
                y_test = dt.import2dArray("../data/movies/bow/frequency/phrases/class-"+v, "s")
                score = cohen_kappa_score(y_test, y_pred)
                print(v, int(i), "Score", score)
                if score > max_score:
                    max_score = score
                    cutoff = i
            cutoffs.append(cutoff)
            print("Cutoff for", v, "On", key, "Was", str(cutoff))
        cutoff_clusters.append(cutoffs)
        counter+=1
    dt.write2dArray(cutoff_clusters, "../data/movies/rules/cutoff/"+file_name+".txt")
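The inner loop above sweeps candidate rank cutoffs and keeps the one whose induced binary labels agree best with the class file under Cohen's kappa. The same search written against a single ranking and label vector (a sketch, with hypothetical argument names):

import numpy as np
from sklearn.metrics import cohen_kappa_score

def best_rank_cutoff(ranks, y_true, max_rank=100):
    # Label an entity 1 if its rank is at or below the candidate cutoff, 0 otherwise,
    # and keep the cutoff that maximises kappa against the true labels.
    ranks = np.asarray(ranks)
    best_cutoff, best_kappa = 0, 0.0
    for i in range(1, max_rank + 1):
        y_pred = (ranks <= i).astype(int)
        kappa = cohen_kappa_score(y_true, y_pred)
        if kappa > best_kappa:
            best_kappa, best_cutoff = kappa, i
    return best_cutoff, best_kappa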
Example #10
def getAllRankings(directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, percent, percentage_increment, by_vector, fn, discrete=True, data_type="movies",
                 rewrite_files=False):

    labels_fn = "../data/"+data_type+"/rank/labels/" + fn + ".txt"
    rankings_fn = "../data/"+data_type+"/rank/numeric/" + fn + ".txt"
    discrete_labels_fn = "../data/"+data_type+"/rank/discrete/" + fn + ".txt"

    all_fns = [rankings_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        for f in all_fns:
            print(f, "Already exists")
        print("Skipping task", "getAllRankings")
        return
    else:
        print("Running task", "getAllRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    cluster_names = dt.import1dArray(cluster_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    rankings = getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.asarray(rankings)
    if discrete:
        labels = createLabels(rankings, percent)
        labels = np.asarray(labels)
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    if by_vector:
        rankings = rankings.transpose()
        if discrete:
            labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
    if discrete:
        dt.write2dArray(labels, labels_fn)

    dt.write2dArray(rankings, rankings_fn)
    if discrete:
        dt.write2dArray(discrete_labels, discrete_labels_fn)
Example #11
    def __init__(self,
                 vector_path,
                 class_path,
                 property_names_fn,
                 file_name,
                 svm_type,
                 training_size=10000,
                 lowest_count=200,
                 highest_count=21470000,
                 get_kappa=True,
                 get_f1=True,
                 single_class=True,
                 data_type="movies",
                 getting_directions=True,
                 threads=1,
                 chunk_amt=0,
                 chunk_id=0,
                 rewrite_files=False,
                 classification="all",
                 loc="../data/",
                 logistic_regression=False,
                 sparse_array_fn=None,
                 only_these_fn=None):

        self.get_kappa = True
        self.get_f1 = get_f1
        self.data_type = data_type
        self.classification = classification
        self.lowest_amt = lowest_count
        self.higher_amt = highest_count

        if chunk_amt > 0:
            file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(
                chunk_amt)

        directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
        ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
        kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
        acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"
        TP_fn = loc + data_type + "/svm/stats/TP " + file_name + ".txt"
        FP_fn = loc + data_type + "/svm/stats/FP " + file_name + ".txt"
        TN_fn = loc + data_type + "/svm/stats/TN " + file_name + ".txt"
        FN_fn = loc + data_type + "/svm/stats/FN " + file_name + ".txt"

        all_fns = [directions_fn, kappa_fn]
        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "getSVMResults")
            return
        else:
            print("Running task", "getSVMResults")

        y_train = 0
        y_test = 0
        vectors = np.asarray(dt.import2dArray(vector_path))
        print("imported vectors")
        if not getting_directions:
            classes = np.asarray(dt.import2dArray(class_path))
            print("imported classes")

        property_names = dt.import1dArray(property_names_fn)
        print("imported propery names")
        if chunk_amt > 0:
            if chunk_id == chunk_amt - 1:
                chunk = int(len(property_names) / chunk_amt)
                multiply = chunk_amt - 1
                property_names = property_names[chunk * multiply:]
            else:
                property_names = dt.chunks(
                    property_names, int(
                        (len(property_names) / chunk_amt)))[chunk_id]

        if sparse_array_fn is not None:
            sparse_array = dt.import2dArray(sparse_array_fn)
        else:
            sparse_array = None

        if sparse_array is not None:
            for s in range(len(sparse_array)):
                if len(np.nonzero(sparse_array[s])[0]) <= 1:
                    print("WILL FAIL", s, len(np.nonzero(sparse_array[s])[0]))
                else:
                    print(len(np.nonzero(sparse_array[s])[0]))

        if not getting_directions:
            x_train, x_test, y_train, y_test = train_test_split(vectors,
                                                                classes,
                                                                test_size=0.3,
                                                                random_state=0)
        else:
            x_train = vectors
            x_test = vectors

        if get_f1:
            y_train = y_train.transpose()
            y_test = y_test.transpose()
            print("transpoosed")
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        if only_these_fn is not None:
            only_these = dt.import1dArray(only_these_fn, "s")
            inds = []
            for s in range(len(property_names)):
                for o in only_these:
                    if property_names[s] == o:
                        inds.append(s)
                        break
            sparse_array = sparse_array[inds]
            property_names = property_names[inds]

        if self.get_f1 is False:
            print("running svms")
            kappa_scores, directions, f1_scores, property_names, accs, TPs, FPs, TNs, FNs = self.runAllSVMs(
                y_test, y_train, property_names, file_name, svm_type,
                getting_directions, threads, logistic_regression, sparse_array)

            dt.write1dArray(kappa_scores, kappa_fn)
            dt.write2dArray(directions, directions_fn)
            dt.write1dArray(f1_scores, ktau_scores_fn)
            dt.write1dArray(accs, acc_fn)
            dt.write1dArray(TPs, TP_fn)
            dt.write1dArray(FPs, FP_fn)
            dt.write1dArray(TNs, TN_fn)
            dt.write1dArray(FNs, FN_fn)
            dt.write1dArray(property_names,
                            property_names_fn + file_name + ".txt")
        else:
            final_f1 = []
            final_acc = []
            for y in range(len(y_train)):
                f1, acc = self.runClassifySVM(y_test[y], y_train[y])
                print(f1, acc)
                final_f1.append(f1)
                final_acc.append(acc)
            dt.write1dArray(final_f1, ktau_scores_fn)
            dt.write1dArray(final_acc, acc_fn)
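runAllSVMs is not shown here; the core idea is to fit one linear SVM per property and treat its weight vector as that property's direction in the entity space. A minimal sketch of a single direction with scikit-learn (an assumption about the helper, not its actual code):

import numpy as np
from sklearn.svm import LinearSVC

def svm_direction(x, y):
    # Fit a linear separator for one property and return its unit-normalised normal
    # vector, which serves as the property's direction.
    clf = LinearSVC(C=1.0)
    clf.fit(x, y)
    w = clf.coef_[0]
    return w / np.linalg.norm(w)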
Example #12
def getClusters(directions_fn,
                scores_fn,
                names_fn,
                is_gini,
                amt_high_directions,
                amt_low_directions,
                filename,
                amt_of_clusters,
                high_threshold,
                low_threshold,
                data_type,
                rewrite_files=False,
                half_kappa_half_ndcg="",
                dont_cluster=0):

    cluster_names_fn = "../data/" + data_type + "/cluster/first_terms/" + filename + ".txt"
    clusters_fn = "../data/" + data_type + "/cluster/first_term_clusters/" + filename + ".txt"
    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"

    all_fns = [cluster_names_fn, clusters_fn, dict_fn, cluster_directions_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", getClusters.__name__)
        return
    else:
        print("Running task", getClusters.__name__)

    hdn, ldn, hd, ld = splitDirections(directions_fn, scores_fn, names_fn,
                                       is_gini, amt_high_directions,
                                       amt_low_directions, high_threshold,
                                       low_threshold, half_kappa_half_ndcg)

    if amt_low_directions != amt_of_clusters:
        cluster_directions, least_similar_cluster_names, cluster_name_dict, least_similar_clusters = createTermClusters(
            hd, ld, hdn, ldn, amt_of_clusters, dont_cluster)
    else:
        least_similar_clusters = hd
        cluster_directions = hd
        least_similar_cluster_names = hdn
        cluster_name_dict = OrderedDict()
        for n in hdn:
            cluster_name_dict[n] = ""

    #word_vector_names = nameClustersMedoid(cluster_name_dict)
    additional_text = ""
    #if is_gini:
    #    additional_text = "gini"
    """
    directions = np.asarray(dt.import2dArray(directions_fn))
    names = np.asarray(dt.import1dArray(names_fn))

    least_similar_cluster_names.extend(hdn)
    least_similar_cluster_names.extend(ldn)
    least_similar_clusters.extend(hd)
    least_similar_clusters.extend(ld)
    cluster_center_directions.extend(ld)
    cluster_center_directions.extend(directions)
    """
    dt.write1dArray(least_similar_cluster_names, cluster_names_fn)
    dt.write2dArray(least_similar_clusters, clusters_fn)
    dt.writeArrayDict(cluster_name_dict, dict_fn)
    #dt.write1dArray(word_vector_names, word_vector_names_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
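splitDirections is defined elsewhere; judging from its arguments it separates the best-scoring directions from a secondary pool before clustering. A hypothetical sketch of that split (gini/threshold handling omitted):

import numpy as np

def split_directions_sketch(directions, scores, names, amt_high, amt_low):
    # Sort by score (descending), keep the top block as "high" directions and the
    # following block as "low" directions; returns (hdn, ldn, hd, ld) like the original.
    order = np.argsort(scores)[::-1]
    high = order[:amt_high]
    low = order[amt_high:amt_high + amt_low]
    names = np.asarray(names)
    directions = np.asarray(directions)
    return names[high], names[low], directions[high], directions[low]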
Example #13
    def __init__(self,
                 class_path=None,
                 get_scores=False,
                 randomize_finetune_weights=False,
                 dropout_noise=None,
                 amount_of_hidden=0,
                 epochs=1,
                 learn_rate=0.01,
                 loss="mse",
                 batch_size=1,
                 past_model_bias_fn=None,
                 identity_swap=False,
                 reg=0.0,
                 amount_of_finetune=[],
                 output_size=25,
                 hidden_activation="tanh",
                 layer_init="glorot_uniform",
                 output_activation="tanh",
                 deep_size=None,
                 corrupt_finetune_weights=False,
                 split_to_use=-1,
                 hidden_layer_size=100,
                 file_name="unspecified_filename",
                 vector_path=None,
                 is_identity=False,
                 finetune_size=0,
                 data_type="movies",
                 optimizer_name="rmsprop",
                 noise=0.0,
                 fine_tune_weights_fn=None,
                 past_model_weights_fn=None,
                 from_ae=True,
                 save_outputs=False,
                 label_names_fn="",
                 rewrite_files=False,
                 cv_splits=1,
                 cutoff_start=0.2,
                 development=False,
                 class_weight=None,
                 csv_fn=None,
                 tune_vals=False,
                 get_nnet_vectors_path=None,
                 classification_name="all",
                 limit_entities=False,
                 limited_label_fn="",
                 vector_names_fn="",
                 identity_activation="linear",
                 loc="../data/",
                 lock_weights_and_redo=False):

        weights_fn = loc + data_type + "/nnet/weights/" + file_name + "L0.txt"
        bias_fn = loc + data_type + "/nnet/bias/" + file_name + "L0.txt"
        rank_fn = loc + data_type + "/nnet/clusters/" + file_name + ".txt"

        all_fns = [weights_fn, bias_fn, rank_fn]
        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "nnet")
            return
        else:

            print("Running task", "nnet")

        self.class_path = class_path
        self.learn_rate = learn_rate
        self.epochs = epochs
        self.loss = loss
        self.batch_size = batch_size
        self.hidden_activation = hidden_activation
        self.layer_init = layer_init
        self.output_activation = output_activation
        self.hidden_layer_size = hidden_layer_size
        self.file_name = file_name
        self.vector_path = vector_path
        self.dropout_noise = dropout_noise
        self.finetune_size = finetune_size
        self.get_scores = get_scores
        self.reg = reg
        self.amount_of_finetune = amount_of_finetune
        self.amount_of_hidden = amount_of_hidden
        self.output_size = output_size
        self.identity_swap = identity_swap
        self.deep_size = deep_size
        self.from_ae = from_ae
        self.is_identity = is_identity
        self.randomize_finetune_weights = randomize_finetune_weights
        self.corrupt_finetune_weights = corrupt_finetune_weights
        self.deep_size = deep_size
        self.fine_tune_weights_fn = fine_tune_weights_fn
        self.identity_activation = identity_activation
        self.lock_weights_and_redo = lock_weights_and_redo

        print(data_type)

        if optimizer_name == "adagrad":
            self.optimizer = Adagrad()
        elif optimizer_name == "sgd":
            self.optimizer = SGD()
        elif optimizer_name == "rmsprop":
            self.optimizer = RMSprop()
        elif optimizer_name == "adam":
            self.optimizer = Adam()
        elif optimizer_name == "adadelta":
            self.optimizer = Adadelta()
        else:
            print("optimizer not found")
            exit()

        entity_vectors = np.asarray(dt.import2dArray(self.vector_path))
        print("Imported vectors", len(entity_vectors), len(entity_vectors[0]))

        if get_nnet_vectors_path is not None:
            nnet_vectors = np.asarray(dt.import2dArray(get_nnet_vectors_path))
            print("Imported vectors", len(entity_vectors),
                  len(entity_vectors[0]))

        entity_classes = np.asarray(dt.import2dArray(self.class_path))
        print("Imported classes", len(entity_classes), len(entity_classes[0]))

        if fine_tune_weights_fn is None:
            vector_names = dt.import1dArray(vector_names_fn)
            limited_labels = dt.import1dArray(limited_label_fn)
            entity_vectors = np.asarray(
                dt.match_entities(entity_vectors, limited_labels,
                                  vector_names))

        if fine_tune_weights_fn is not None:
            if len(entity_vectors) != len(entity_classes):
                entity_classes = entity_classes.transpose()
                print("Transposed classes, now in form", len(entity_classes),
                      len(entity_classes[0]))
                """
                # IF Bow
                if len(entity_vectors[0]) != len(entity_classes[0]):
                    entity_vectors = entity_vectors.transpose()
                    print("Transposed vectors, now in form", len(entity_vectors), len(entity_vectors[0]))
                """
        elif len(entity_vectors) != len(entity_classes):
            entity_vectors = entity_vectors.transpose()
            print("Transposed vectors, now in form", len(entity_vectors),
                  len(entity_vectors[0]))

        self.input_size = len(entity_vectors[0])
        self.output_size = len(entity_classes[0])

        if fine_tune_weights_fn is not None:
            model_builder = self.fineTuneNetwork
            weights = []
            if from_ae:
                self.past_weights = []
                past_model_weights = []
                for p in past_model_weights_fn:
                    past_model_weights.append(
                        np.asarray(dt.import2dArray(p), dtype="float64"))
                past_model_bias = []
                for p in past_model_bias_fn:
                    past_model_bias.append(
                        np.asarray(dt.import1dArray(p, "f"), dtype="float64"))

                for p in range(len(past_model_weights)):
                    past_model_weights[p] = np.around(past_model_weights[p],
                                                      decimals=6)
                    past_model_bias[p] = np.around(past_model_bias[p],
                                                   decimals=6)

                for p in range(len(past_model_weights)):
                    self.past_weights.append([])
                    self.past_weights[p].append(past_model_weights[p])
                    self.past_weights[p].append(past_model_bias[p])
            for f in fine_tune_weights_fn:
                weights.extend(dt.import2dArray(f))

            r = np.asarray(weights, dtype="float64")

            for a in range(len(r)):
                r[a] = np.around(r[a], decimals=6)

            for a in range(len(entity_classes)):
                entity_classes[a] = np.around(entity_classes[a], decimals=6)

            self.fine_tune_weights = []
            self.fine_tune_weights.append(r.transpose())
            self.fine_tune_weights.append(
                np.zeros(shape=len(r), dtype="float64"))
        else:
            model_builder = self.classifierNetwork

        # Converting labels to categorical
        f1_scores = []
        accuracy_scores = []
        f1_averages = []
        accuracy_averages = []

        original_fn = file_name
        x_train, y_train, x_test, y_test, x_dev, y_dev = split_data.splitData(
            entity_vectors, entity_classes, data_type)

        if development:
            x_test = x_dev
            y_test = y_dev

        model = model_builder()

        if get_scores:
            test_pred = model.predict(x_train).transpose()
            print(test_pred)
            highest_vals = [0.5] * len(test_pred)  # Default 0.5
            y_pred = model.predict(x_test).transpose()
            y_test = np.asarray(y_test).transpose()
            for y in range(len(y_pred)):
                y_pred[y][y_pred[y] >= highest_vals[y]] = 1
                y_pred[y][y_pred[y] < highest_vals[y]] = 0
            f1_array = []
            accuracy_array = []
            for y in range(len(y_pred)):
                accuracy_array.append(accuracy_score(y_test[y], y_pred[y]))
                f1_array.append(
                    f1_score(y_test[y], y_pred[y], average="binary"))
                print(f1_array[y])
            y_pred = y_pred.transpose()
            y_test = np.asarray(y_test).transpose()

            micro_average = f1_score(y_test, y_pred, average="micro")

            cv_f1_fn = loc + data_type + "/nnet/scores/F1 " + file_name + ".txt"
            cv_acc_fn = loc + data_type + "/nnet/scores/ACC " + file_name + ".txt"
            dt.write1dArray(f1_array, cv_f1_fn)
            dt.write1dArray(accuracy_array, cv_acc_fn)

            f1_scores.append(f1_array)
            accuracy_scores.append(accuracy_array)
            f1_average = np.average(f1_array)
            accuracy_average = np.average(accuracy_array)
            f1_averages.append(f1_average)
            accuracy_averages.append(accuracy_average)

            print("Average F1 Binary", f1_average, "Acc", accuracy_average)
            print("Micro Average F1", micro_average)

            f1_array.append(f1_average)
            f1_array.append(micro_average)
            accuracy_array.append(accuracy_average)
            accuracy_array.append(0.0)

            scores = [accuracy_array, f1_array]

            csv_fn = loc + data_type + "/nnet/csv/" + csv_fn + ".csv"

            file_names = [file_name + "ACC", file_name + "F1"]
            label_names = dt.import1dArray(label_names_fn)
            if dt.fileExists(csv_fn):
                print("File exists, writing to csv")
                try:
                    dt.write_to_csv(csv_fn, file_names, scores)
                except PermissionError:
                    print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                    dt.write_to_csv(
                        csv_fn[:len(csv_fn) - 4] + str(random.random()) +
                        "FAIL.csv", [file_name], scores)
            else:
                print("File does not exist, recreating csv")
                key = []
                for l in label_names:
                    key.append(l)
                key.append("AVERAGE")
                key.append("MICRO AVERAGE")
                dt.write_csv(csv_fn, file_names, scores, key)

            if save_outputs:
                if limit_entities is False:
                    self.output_clusters = model.predict(nnet_vectors)
                else:
                    self.output_clusters = model.predict(entity_vectors)
                self.output_clusters = self.output_clusters.transpose()
                dt.write2dArray(self.output_clusters, rank_fn)

            for l in range(0, len(model.layers) - 1):
                if dropout_noise is not None and dropout_noise > 0.0:
                    if l % 2 == 1:
                        continue
                print("Writing", l, "layer")
                truncated_model = Sequential()
                for a in range(l + 1):
                    truncated_model.add(model.layers[a])
                truncated_model.compile(loss=self.loss, optimizer="sgd")
                if get_nnet_vectors_path is not None:
                    self.end_space = truncated_model.predict(nnet_vectors)
                else:
                    self.end_space = truncated_model.predict(entity_vectors)
                total_file_name = loc + data_type + "/nnet/spaces/" + file_name
                dt.write2dArray(self.end_space,
                                total_file_name + "L" + str(l) + ".txt")

            for l in range(len(model.layers)):
                try:
                    dt.write2dArray(
                        model.layers[l].get_weights()[0], loc + data_type +
                        "/nnet/weights/" + file_name + "L" + str(l) + ".txt")
                    dt.write1dArray(
                        model.layers[l].get_weights()[1], loc + data_type +
                        "/nnet/bias/" + file_name + "L" + str(l) + ".txt")
                except IndexError:
                    print("Layer ", str(l), "Failed")
    def __init__(self,
                 vector_path,
                 class_path,
                 property_names_fn,
                 file_name,
                 svm_type,
                 training_size=10000,
                 lowest_count=200,
                 highest_count=21470000,
                 get_kappa=True,
                 get_f1=True,
                 single_class=True,
                 data_type="movies",
                 getting_directions=True,
                 threads=1,
                 chunk_amt=0,
                 chunk_id=0,
                 rewrite_files=False,
                 classification="all",
                 loc="../data/"):

        self.get_kappa = True
        self.get_f1 = get_f1
        self.data_type = data_type
        self.classification = classification
        self.lowest_amt = lowest_count
        self.higher_amt = highest_count

        if chunk_amt > 0:
            file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(
                chunk_amt)

        directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
        ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
        kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
        acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"

        all_fns = [directions_fn, kappa_fn]
        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "getSVMResults")
            return
        else:
            print("Running task", "getSVMResults")

        y_train = 0
        y_test = 0
        vectors = np.asarray(dt.import2dArray(vector_path))
        print("imported vectors")
        if not getting_directions:
            classes = np.asarray(dt.import2dArray(class_path))
            print("imported classes")
        property_names = dt.import1dArray(property_names_fn)
        print("imported propery names")
        if chunk_amt > 0:
            if chunk_id == chunk_amt - 1:
                chunk = int(len(property_names) / chunk_amt)
                multiply = chunk_amt - 1
                property_names = property_names[chunk * multiply:]
            else:
                property_names = dt.chunks(
                    property_names, int(
                        (len(property_names) / chunk_amt)))[chunk_id]

        if not getting_directions:
            x_train, x_test, y_train, y_test = train_test_split(vectors,
                                                                classes,
                                                                test_size=0.3,
                                                                random_state=0)
        else:
            x_train = vectors
            x_test = vectors

        if get_f1:
            y_train = y_train.transpose()
            y_test = y_test.transpose()
            print("transpoosed")
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        if self.get_f1 is False:
            print("running svms")
            kappa_scores, directions, ktau_scores, property_names = self.runAllSVMs(
                y_test, y_train, property_names, file_name, svm_type,
                getting_directions, threads)

            dt.write1dArray(kappa_scores, kappa_fn)
            dt.write2dArray(directions, directions_fn)
            dt.write1dArray(ktau_scores, ktau_scores_fn)
            dt.write1dArray(property_names,
                            property_names_fn + file_name + ".txt")
        else:
            final_f1 = []
            final_acc = []
            for y in range(len(y_train)):
                f1, acc = self.runClassifySVM(y_test[y], y_train[y])
                print(f1, acc)
                final_f1.append(f1)
                final_acc.append(acc)
            dt.write1dArray(final_f1, ktau_scores_fn)
            dt.write1dArray(final_acc, acc_fn)
Example #15
    def __init__(self,
                 features_fn,
                 classes_fn,
                 class_names_fn,
                 cluster_names_fn,
                 filename,
                 training_data,
                 max_depth=None,
                 balance=None,
                 criterion="entropy",
                 save_details=False,
                 data_type="movies",
                 cv_splits=5,
                 csv_fn="../data/temp/no_csv_provided.csv",
                 rewrite_files=False,
                 split_to_use=-1,
                 development=False,
                 limit_entities=False,
                 limited_label_fn=None,
                 vector_names_fn=None,
                 clusters_fn="",
                 cluster_duplicates=False,
                 save_results_so_far=False,
                 multi_label=False):

        label_names = dt.import1dArray(class_names_fn)

        filename = filename + str(max_depth)

        all_fns = []
        file_names = ['ACC ' + filename, 'F1 ' + filename]
        acc_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[
            0] + '.scores'
        prediction_fn = '../data/' + data_type + '/rules/tree_output/' + filename + '.scores'
        f1_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[
            1] + '.scores'
        all_top_names_fn = "../data/" + data_type + "/rules/names/" + filename + ".txt"
        all_top_rankings_fn = "../data/" + data_type + "/rules/rankings/" + filename + ".txt"
        all_top_clusters_fn = "../data/" + data_type + "/rules/clusters/" + filename + ".txt"

        fns_name = "../data/" + data_type + "/rules/names/" + filename + label_names[
            0] + ".txt"
        features_name = "../data/" + data_type + "/rules/rankings/" + filename + label_names[
            0] + ".txt"
        dt_clusters_name = "../data/" + data_type + "/rules/clusters/" + filename + label_names[
            0] + ".txt"
        if save_details is False:
            all_fns = [acc_fn, f1_fn, prediction_fn, csv_fn]
        else:
            new_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[
                0] + " " + filename + '.png'
            all_fns = [acc_fn, f1_fn, prediction_fn, csv_fn]

        if max_depth is not None:
            all_fns.append(all_top_names_fn)
            all_fns.append(all_top_rankings_fn)
            all_fns.append(all_top_clusters_fn)

        if save_details:
            orig_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[
                0] + " " + filename + 'orig.txt'
            # all_fns.append(orig_dot_file_fn)
            model_name_fn = "../data/" + data_type + "/rules/tree_model/" + label_names[
                0] + " " + filename + ".model"
            #all_fns.append(model_name_fn)

        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "DecisionTree")
            return
        else:
            print("Running task", "DecisionTree")

        vectors = np.asarray(dt.import2dArray(features_fn))
        if data_type == "sentiment":  # If it's just a binary class...
            labels = np.asarray(dt.import1dArray(classes_fn, "i"))
        else:
            labels = np.asarray(dt.import2dArray(classes_fn, "i"))

            print("vectors", len(vectors), len(vectors[0]))
            print("labels", len(labels), len(labels[0]))

        if data_type == "sentiment" or len(vectors) != len(labels[0]):
            vectors = vectors.transpose()

        print("vectors", len(vectors), len(vectors[0]))
        cluster_names = dt.import2dArray(cluster_names_fn, "s")
        clusters = dt.import2dArray(clusters_fn, "f")
        original_vectors = vectors

        if "ratings" in classes_fn:
            orig_path = "/".join(classes_fn.split("/")[:-1]) + "/"
            match_ids_fn = orig_path + "matched_ids.txt"
            if os.path.exists(match_ids_fn):
                matched_ids = dt.import1dArray(match_ids_fn, "i")
            else:
                vector_names = dt.import1dArray(vector_names_fn)
                limited_labels = dt.import1dArray(limited_label_fn)
                matched_ids = dt.match_entities(vector_names, limited_labels)
                dt.write1dArray(matched_ids, match_ids_fn)
            vectors = vectors[matched_ids]
            print("vectors", len(vectors))
        print("Past limit entities")

        for l in range(len(label_names)):
            if label_names[l][:6] == "class-":
                label_names[l] = label_names[l][6:]

        f1_array = []
        accuracy_array = []
        prec_array = []
        recall_array = []

        if not multi_label and data_type != "sentiment":
            labels = labels.transpose()
            print("labels transposed")
            print("labels", len(labels), len(labels[0]))
        else:
            labels = [labels]

        all_top_clusters = []
        all_top_rankings = []
        all_top_names = []
        all_top_inds = []

        all_y_test = []
        all_predictions = []
        print("At label prediction")
        for l in range(len(labels)):

            # Select training data with cross validation
            ac_y_test = []
            cv_acc = []
            cv_prec = []
            cv_recall = []
            predictions = []
            c = 0
            # If doing cross-validation
            if cv_splits > 1:
                ac_x_train, ac_y_train, ac_x_test, ac_y_test, ac_x_dev, ac_y_dev = split_data.crossValData(
                    cv_splits, vectors, labels[l])
            else:
                x_train, y_train, x_test, y_test, x_dev, y_dev = split_data.splitData(
                    vectors, labels[l], data_type)
                ac_x_train = [x_train]
                ac_y_train = [y_train]
                ac_x_test = [x_test]
                ac_y_test = [y_test]
                ac_x_dev = [x_dev]
                ac_y_dev = [y_dev]
            if development:
                ac_x_test = ac_x_dev
                ac_y_test = ac_y_dev

            for splits in range(len(ac_y_test)):
                model_name_fn = "../data/" + data_type + "/rules/tree_model/" + label_names[
                    l] + " " + filename + ".model"
                """
                if dt.fileExists(model_name_fn) and not rewrite_files:
                    try:
                        clf = joblib.load(model_name_fn)
                    except KeyError:
                        print(model_name_fn) # If a model is disrupted partway through its processing
                else:
                """
                clf = tree.DecisionTreeClassifier(max_depth=max_depth,
                                                  criterion=criterion,
                                                  class_weight=balance)
                clf.fit(ac_x_train[splits], ac_y_train[splits])
                joblib.dump(clf, model_name_fn)
                predictions.append(clf.predict(ac_x_test[splits]))

            ac_y_test = list(ac_y_test)
            predictions = list(predictions)

            for i in range(len(predictions)):

                class_names = ["NOT " + label_names[l], label_names[l]]

                # Export a tree for each label predicted by the clf
                if save_details:
                    orig_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[
                        l] + " " + filename + 'orig.txt'
                    new_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[
                        l] + " " + filename + '.txt'
                    orig_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[
                        l] + " " + filename + 'orig.png'
                    new_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[
                        l] + " " + filename + '.png'
                    orig_temp_graph_png_fn = '../data/' + data_type + '/rules/tree_temp/' + label_names[
                        l] + " " + filename + 'orig.png'
                    new_temp_graph_png_fn = '../data/' + data_type + '/rules/tree_temp/' + label_names[
                        l] + " " + filename + '.png'
                    output_names = []
                    for c in cluster_names:
                        line = ""
                        counter = 0
                        for i in range(len(c)):
                            line = line + c[i] + " "
                            counter += 1
                            if counter == 8:
                                break
                        output_names.append(line)
                    failed = False
                    try:
                        tree.export_graphviz(
                            clf,
                            feature_names=output_names,
                            class_names=class_names,
                            out_file=orig_dot_file_fn,
                            max_depth=max_depth,
                            label='all',
                            filled=True,
                            impurity=True,
                            node_ids=True,
                            proportion=True,
                            rounded=True,
                        )
                    except FileNotFoundError:
                        try:
                            orig_dot_file_fn = "//?/" + orig_dot_file_fn
                            tree.export_graphviz(clf,
                                                 feature_names=output_names,
                                                 class_names=class_names,
                                                 out_file=orig_dot_file_fn,
                                                 max_depth=max_depth,
                                                 label='all',
                                                 filled=True,
                                                 impurity=True,
                                                 node_ids=True,
                                                 proportion=True,
                                                 rounded=True)

                        except FileNotFoundError:
                            failed = True
                            print("doesnt work fam")
                    if failed == False:
                        rewrite_dot_file = dt.import1dArray(orig_dot_file_fn)
                        new_dot_file = []
                        max = 3
                        min = -3
                        """
                        for f in original_vectors:
                            for n in f:
                                if n > max:
                                    max = n
                                if n < min:
                                    min = n
                        """
                        print(max)
                        print(min)
                        boundary = max - min
                        boundary = boundary / 5
                        bound_1 = 0 - boundary * 2
                        bound_2 = 0 - boundary * 1
                        bound_3 = 0
                        bound_4 = 0 + boundary
                        bound_5 = 0 + boundary * 2
                        for s in rewrite_dot_file:
                            if ":" in s:
                                s = s.split("<=")
                                no_num = s[0]
                                num = s[1]
                                num = num.split()
                                end = " ".join(num[:-1])
                                num_split = num[0].split("\\")
                                num = num_split[0]
                                end = end[len(num):]
                                num = float(num)
                                replacement = ""
                                if num <= bound_2:
                                    replacement = "VERY LOW"
                                elif num <= bound_3:
                                    replacement = "VERY LOW - LOW"
                                elif num <= bound_4:
                                    replacement = "VERY LOW - AVERAGE"
                                elif num <= bound_5:
                                    replacement = "VERY LOW - HIGH"
                                elif num >= bound_5:
                                    replacement = "VERY HIGH"
                                new_string_a = [no_num, replacement, end]
                                new_string = " ".join(new_string_a)
                                new_dot_file.append(new_string)
                                if "]" in new_string:
                                    if '"' not in new_string[len(new_string) -
                                                             10:]:
                                        for c in range(len(new_string)):
                                            if new_string[c + 1] == "]":
                                                new_string = new_string[:
                                                                        c] + '"' + new_string[
                                                                            c:]
                                                break
                            else:
                                new_dot_file.append(s)
                            """
                            new_string = s
                            if "->" not in s and "digraph" not in s and "node" not in s and "(...)" not in s and "}" not in s:
                                index = s.index("value")
                                new_string = s[:index] + '"] ;'
                            new_dot_file.append(new_string)
                            """
                            #new_dot_file.append(s)
                        dt.write1dArray(new_dot_file, new_dot_file_fn)
                        try:
                            orig_graph = pydot.graph_from_dot_file(
                                orig_dot_file_fn)
                            new_graph = pydot.graph_from_dot_file(
                                new_dot_file_fn)
                            orig_graph.write_png(orig_graph_png_fn)
                            new_graph.write_png(new_graph_png_fn)
                            orig_graph.write_png(orig_temp_graph_png_fn)
                            new_graph.write_png(new_temp_graph_png_fn)
                        except FileNotFoundError:
                            orig_graph_png_fn = "//?/" + orig_graph_png_fn
                            try:
                                orig_graph.write_png(orig_graph_png_fn)
                                new_graph_png_fn = "//?/" + new_graph_png_fn
                                new_graph.write_png(new_graph_png_fn)
                            except FileNotFoundError:
                                print("failed graph")

                    self.get_code(clf, output_names, class_names,
                                  label_names[l] + " " + filename, data_type)
                    dt_clusters, features, fns, inds = self.getNodesToDepth(
                        clf, original_vectors, cluster_names, clusters)
                    print(filename + label_names[l])
                    fns_name = "../data/" + data_type + "/rules/names/" + filename + label_names[
                        l] + ".txt"
                    features_name = "../data/" + data_type + "/rules/rankings/" + filename + label_names[
                        l] + ".txt"
                    dt_clusters_name = "../data/" + data_type + "/rules/clusters/" + filename + label_names[
                        l] + ".txt"
                    dt.write2dArray(fns, fns_name)
                    dt.write2dArray(features, features_name)
                    dt.write2dArray(dt_clusters, dt_clusters_name)
                    all_top_rankings.extend(features)
                    all_top_clusters.extend(dt_clusters)
                    all_top_names.extend(fns)
                    all_top_inds.extend(inds)

        print("len clusters", len(all_top_clusters))
        print("len rankings", len(all_top_rankings))
        print("len names", len(all_top_names))

        if len(all_top_clusters) != len(all_top_rankings) or len(
                all_top_clusters) != len(all_top_names):
            print("stop")

        accuracy_array = np.asarray(accuracy_array)
        accuracy_average = np.average(accuracy_array)

        prec_array = np.asarray(prec_array)
        average_prec = np.average(prec_array)

        recall_array = np.asarray(recall_array)
        average_recall = np.average(recall_array)

        f1_average = 2 * ((average_prec * average_recall) /
                          (average_prec + average_recall))

        if math.isnan(f1_average):
            print("NAN", prec, recall)
            f1_average = 0.0
        all_y_test = np.asarray(all_y_test)
        all_predictions = np.asarray(all_predictions)

        micro_average = f1_score(all_y_test, all_predictions, average="micro")

        accuracy_array = accuracy_array.tolist()

        accuracy_array.append(accuracy_average)
        accuracy_array.append(0.0)

        f1_array.append(f1_average)
        f1_array.append(micro_average)

        scores = [accuracy_array, f1_array]

        dt.write1dArray(accuracy_array, acc_fn)
        dt.write1dArray(f1_array, f1_fn)
        dt.write2dArray(all_predictions, prediction_fn)

        if dt.fileExists(csv_fn):
            print("File exists, writing to csv")
            try:
                dt.write_to_csv(csv_fn, file_names, scores)
            except PermissionError:
                print("CSV FILE WAS OPEN, SKIPPING")
            except ValueError:
                print("File does not exist, recreating csv")
                key = []
                for l in label_names:
                    key.append(l)
                key.append("AVERAGE")
                key.append("MICRO AVERAGE")
                dt.write_csv(csv_fn, file_names, scores, key)
        else:
            print("File does not exist, recreating csv")
            key = []
            for l in label_names:
                key.append(l)
            key.append("AVERAGE")
            key.append("MICRO AVERAGE")
            dt.write_csv(csv_fn, file_names, scores, key)

        if max_depth is not None:
            all_top_names = np.asarray(all_top_names)
            all_top_rankings = np.asarray(all_top_rankings)
            all_top_clusters = np.asarray(all_top_clusters)
            all_top_inds = np.asarray(all_top_inds)

            if cluster_duplicates:
                ind_to_keep = np.unique(all_top_inds, return_index=True)[1]
                all_top_names = all_top_names[ind_to_keep]
                all_top_rankings = all_top_rankings[ind_to_keep]
                all_top_clusters = all_top_clusters[ind_to_keep]

            dt.write2dArray(all_top_names, all_top_names_fn)
            dt.write2dArray(all_top_rankings, all_top_rankings_fn)
            dt.write2dArray(all_top_clusters, all_top_clusters_fn)
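At its core, the constructor above fits one shallow decision tree per label over the interpretable feature space and reports F1; the rest is cross-validation, graphviz export, and file bookkeeping. A stripped-down sketch of that core with scikit-learn (not the class's actual method):

import numpy as np
from sklearn import tree
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

def fit_label_trees(vectors, labels, max_depth=3):
    # One tree per label; pooled test predictions scored with micro-averaged F1.
    all_true, all_pred = [], []
    for y in labels:
        x_tr, x_te, y_tr, y_te = train_test_split(vectors, y, test_size=0.3, random_state=0)
        clf = tree.DecisionTreeClassifier(max_depth=max_depth, criterion="entropy")
        clf.fit(x_tr, y_tr)
        all_true.append(y_te)
        all_pred.append(clf.predict(x_te))
    return f1_score(np.hstack(all_true), np.hstack(all_pred), average="micro")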