Code example #1
def getAllPhraseRankings(directions_fn=None, vectors_fn=None, property_names_fn=None, vector_names_fn=None,
                         fn="no filename", percentage_increment=1, scores_fn=None, top_amt=0, discrete=False,
                         data_type="movies", rewrite_files=False):
    rankings_fn_all = "../data/" + data_type + "/rank/numeric/" + fn + "ALL.txt"

    all_fns = [rankings_fn_all]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getAllPhraseRankings")
        return
    else:
        print("Running task", "getAllPhraseRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    property_names = dt.import1dArray(property_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    if top_amt != 0:
        scores = dt.import1dArray(scores_fn, "f")
        directions = dt.sortByReverseArray(directions, scores)[:top_amt]
        property_names = dt.sortByReverseArray(property_names, scores)[:top_amt]

    rankings = getRankings(directions, vectors, property_names, vector_names)
    if discrete:
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    for a in range(len(rankings)):
        rankings[a] = np.around(rankings[a], decimals=4)
    #dt.write1dArray(property_names, "../data/movies/bow/names/top5kof17k.txt")

    dt.write2dArray(rankings, rankings_fn_all)
Code example #2
def selectCutOffByWordVector(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    wv, wvn = dt.getWordVectors()
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c])-1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if not cluster:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # NOTE: the selection logic below is unfinished -- the indexing expression has no
        # effect as written, and cutoff_words is never populated before being written out
        for cl in range(len(clusters)):
            for wa in range(len(clusters[cl])):
                for w in range(len(clusters[cl][wa])):
                    clusters[cl][wa][w]


    dt.write2dArray(cutoff_words, "../data/movies/rules/cutoff/"+file_name+"WVN.txt")
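
A minimal sketch, under assumptions, of how the unfinished selection step above might be completed: for each cluster of words it picks the word whose vector has the highest mean cosine similarity to the rest of the cluster. It assumes word vectors are available as a plain dict mapping each word to a NumPy array (the exact format returned by dt.getWordVectors() is not shown in this excerpt), and the function name pick_representative_words is hypothetical.

import numpy as np

def pick_representative_words(clusters, word_vectors):
    """For each cluster of words, pick the word whose vector has the highest
    mean cosine similarity to the other words in the same cluster."""
    picked = []
    for cluster in clusters:
        words = [w for w in cluster if w in word_vectors]   # skip out-of-vocabulary words
        if not words:
            picked.append(None)
            continue
        vecs = np.asarray([word_vectors[w] for w in words], dtype=np.float64)
        unit = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
        sims = unit @ unit.T                                # pairwise cosine similarities
        picked.append(words[int(np.argmax(sims.mean(axis=1)))])
    return picked
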
Code example #3
def splitDirections(directions_fn, scores_fn, names_fn, is_gini,
                    amt_high_directions, amt_low_directions, high_threshold,
                    low_threshold, half_kappa_half_ndcg):
    directions = np.asarray(dt.import2dArray(directions_fn))
    scores = np.asarray(dt.import1dArray(scores_fn, "f"))
    names = np.asarray(dt.import1dArray(names_fn))

    high_direction_names = []
    low_direction_names = []
    high_directions = []
    low_directions = []
    if len(half_kappa_half_ndcg) > 0:
        kappa_scores = dt.import1dArray(half_kappa_half_ndcg, "f")

    if amt_high_directions > 0 and amt_low_directions > 0:
        if len(half_kappa_half_ndcg) == 0:
            ids = np.flipud(np.argsort(scores))
        else:
            ind1 = np.flipud(np.argsort(scores))[:amt_low_directions // 2]
            ind2 = np.zeros(len(ind1), dtype="int")
            kappa_scores = np.flipud(np.argsort(kappa_scores))
            count = 0
            added = 0
            for i in kappa_scores:
                if i not in ind1:
                    ind2[added] = i
                    added += 1
                if added >= amt_low_directions / 2:
                    break
                count += 1
            shuffle_ind = np.asarray(list(range(0, len(ind1))))
            ids = np.insert(ind1, shuffle_ind, ind2)
        names = names[ids]
        if max(ids) >= len(directions):
            directions = np.asarray(directions).transpose()
        directions = directions[ids]
        high_directions = directions[:amt_high_directions]
        low_directions = directions[amt_high_directions:amt_low_directions]
        high_direction_names = names[:amt_high_directions]
        low_direction_names = names[amt_high_directions:amt_low_directions]
        high_directions = high_directions.tolist()
        low_directions = low_directions.tolist()
        high_direction_names = high_direction_names.tolist()
        low_direction_names = low_direction_names.tolist()
    elif high_threshold > 0 and low_threshold > 0:
        for s in range(len(scores)):
            if scores[s] >= high_threshold:
                high_directions.append(directions[s])
                high_direction_names.append(names[s])
            elif scores[s] >= low_threshold:
                low_directions.append(directions[s])
                low_direction_names.append(names[s])
    else:
        print("no thresholds or direction amounts")
        hi = [None]
        li = [None]

    return high_direction_names, low_direction_names, high_directions, low_directions
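
The half_kappa_half_ndcg branch above interleaves the top NDCG-ranked indices with the top kappa-ranked indices that are not already present, using np.insert. A small, self-contained illustration of that interleaving (the index values below are made up for the example):

import numpy as np

ind1 = np.array([7, 3, 9])            # e.g. top indices by NDCG score
ind2 = np.array([4, 1, 8])            # e.g. top unseen indices by kappa score
positions = np.arange(len(ind1))      # insert one kappa index before each NDCG index
ids = np.insert(ind1, positions, ind2)
print(ids)                            # -> [4 7 1 3 8 9], kappa and NDCG indices alternate
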
Code example #4
def saveClusters(directions_fn,
                 scores_fn,
                 names_fn,
                 filename,
                 amt_of_dirs,
                 data_type,
                 cluster_amt,
                 rewrite_files=False,
                 algorithm="meanshift_k"):

    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"

    all_fns = [dict_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)

    p_dir = np.asarray(dt.import2dArray(directions_fn))
    p_names = np.asarray(dt.import1dArray(names_fn, "s"))
    p_scores = np.asarray(dt.import1dArray(scores_fn, "f"))

    ids = np.argsort(p_scores)

    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]
    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)
    unique, counts = np.unique(labels, return_counts=True)

    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))

    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")

    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
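
The meanShift and kMeans helpers called above are not defined in this snippet; a minimal sketch of what they might look like with scikit-learn (the function names match the calls above, but the implementation itself is an assumption):

import numpy as np
from sklearn.cluster import KMeans, MeanShift

def kMeans(directions, cluster_amt):
    # Cluster the direction vectors into a fixed number of clusters
    return KMeans(n_clusters=cluster_amt, random_state=0).fit_predict(np.asarray(directions))

def meanShift(directions):
    # Let mean-shift decide the number of clusters from the data itself
    return MeanShift().fit_predict(np.asarray(directions))
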
Code example #5
def getAllRankings(directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, percent, percentage_increment, by_vector, fn, discrete=True, data_type="movies",
                 rewrite_files=False):

    labels_fn = "../data/"+data_type+"/rank/labels/" + fn + ".txt"
    rankings_fn = "../data/"+data_type+"/rank/numeric/" + fn + ".txt"
    discrete_labels_fn = "../data/"+data_type+"/rank/discrete/" + fn + ".txt"

    all_fns = [rankings_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        for f in all_fns:
            print(f, "Already exists")
        print("Skipping task", "getAllRankings")
        return
    else:
        print("Running task", "getAllRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    cluster_names = dt.import1dArray(cluster_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    rankings = getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.asarray(rankings)
    if discrete:
        labels = createLabels(rankings, percent)
        labels = np.asarray(labels)
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    if by_vector:
        if discrete:
            labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()
    if discrete:
        dt.write2dArray(labels, labels_fn)

    dt.write2dArray(rankings, rankings_fn)
    if discrete:
        dt.write2dArray(discrete_labels, discrete_labels_fn)
Code example #6
def main(data_type, clf, highest_amt, lowest_amt, depth, rewrite_files):

    min = lowest_amt
    max = highest_amt
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf  + "dmround"
    mds_fn = "../data/"+data_type+"/mds/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "d" + str(depth)
    svd_fn = "../data/"+data_type+"/svd/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    pca_fn = "../data/"+data_type+"/pca/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf+ "round"

    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf
    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]):
        print("all files exist")
        exit()
    if dt.fileExists(dm_fn) is False:
        newsgroups_train = fetch_20newsgroups(subset='train', shuffle=False)
        newsgroups_test = fetch_20newsgroups(subset='test', shuffle=False)

        vectors = np.concatenate((newsgroups_train.data, newsgroups_test.data),
                                 axis=0)
        newsgroups_test = None
        newsgroups_train = None
        # Get sparse tf rep
        tf_vectorizer = CountVectorizer(max_df=highest_amt,
                                        min_df=lowest_amt,
                                        stop_words='english')
        print("completed vectorizer")
        tf = tf_vectorizer.fit_transform(vectors)
        vectors = None
        # Get sparse PPMI rep from sparse tf rep
        sparse_ppmi = convertPPMISparse(tf)
        print("done ppmi sparse")
        # Get sparse Dsim matrix from sparse PPMI rep
        dm = getDissimilarityMatrixSparse(sparse_ppmi)
        dt.write2dArray(dm, dm_fn)
    else:
        dm = dt.import2dArray(dm_fn)
    print("starting mds")
    # Use as input to mds
    mds = createMDS(dm, depth)
    # save MDS
    dt.write2dArray(mds, mds_fn)
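
convertPPMISparse is called above but not defined in this snippet. A minimal sketch of a sparse positive-PMI transform, under the assumption that tf is a (documents x terms) count matrix; the helper name ppmi_sparse is hypothetical and only illustrates the standard PPMI formula:

import numpy as np
from scipy import sparse

def ppmi_sparse(tf):
    """Positive PMI transform of a sparse (documents x terms) count matrix:
    ppmi_ij = max(0, log(n_ij * N / (row_i * col_j)))."""
    tf = sparse.csr_matrix(tf, dtype=np.float64)
    total = tf.sum()
    row_sums = np.asarray(tf.sum(axis=1)).ravel()   # per-document totals
    col_sums = np.asarray(tf.sum(axis=0)).ravel()   # per-term totals
    coo = tf.tocoo()
    expected = row_sums[coo.row] * col_sums[coo.col] / total
    vals = np.log(coo.data / expected)
    vals[vals < 0] = 0.0                            # keep positive PMI only
    return sparse.csr_matrix((vals, (coo.row, coo.col)), shape=tf.shape)
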
Code example #7
def getNDCG(rankings_fn,
            fn,
            data_type,
            bow_fn,
            ppmi_fn,
            lowest_count,
            rewrite_files=False,
            highest_count=0,
            classification=""):

    # Check if the NDCG scores have already been calculated, if they have then skip.
    ndcg_fn = "../data/" + data_type + "/ndcg/" + fn + ".txt"

    all_fns = [ndcg_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", getNDCG.__name__)
        return
    else:
        print("Running task", getNDCG.__name__)

    # Get the file names for the PPMI values for every word and a list of words ("names")
    names = dt.import1dArray("../data/" + data_type + "/bow/names/" + bow_fn)
    ppmi = dt.import2dArray("../data/" + data_type + "/bow/ppmi/" + ppmi_fn)
    # Process the rankings and the PPMI line-by-line so as to not run out of memory
    ndcg_a = []
    #spearman_a = []
    with open(rankings_fn) as rankings:
        r = 0
        for lr in rankings:
            # Pair the r-th ranking line with the r-th row of PPMI values
            lp = ppmi[r]
            # Get the plain-number ranking of the rankings, e.g. "1, 4, 3, 50"
            sorted_indices = np.argsort(
                list(map(float,
                         lr.strip().split())))[::-1]
            # Get the NDCG score of the ranking, using the PPMI values as relevance scores
            ndcg = ndcg_from_ranking(lp, sorted_indices)

            # Add to array and print
            ndcg_a.append(ndcg)
            print("ndcg", ndcg, names[r], r)
            """
            smr = spearmanr(ppmi_indices, sorted_indices)[1]
            spearman_a.append(smr)
            print("spearman", smr, names[r], r)
            """
            r += 1
    # Save NDCG
    dt.write1dArray(ndcg_a, ndcg_fn)
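
ndcg_from_ranking is imported from elsewhere in the project; a minimal sketch of the standard NDCG computation it is assumed to perform, taking the PPMI row as per-item relevance scores and the argsorted ranking as item order (this is an assumption, not the project's actual implementation):

import numpy as np

def ndcg_from_ranking(relevance, ranking):
    """NDCG of `ranking` (item indices, best first) given per-item relevance scores."""
    relevance = np.asarray(relevance, dtype=np.float64)
    ranking = np.asarray(ranking, dtype=int)
    discounts = 1.0 / np.log2(np.arange(len(ranking)) + 2)   # rank positions start at 1
    dcg = np.sum(relevance[ranking] * discounts)
    ideal = np.sum(np.sort(relevance)[::-1][:len(ranking)] * discounts)
    return dcg / ideal if ideal > 0 else 0.0
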
Code example #8
def selectCutOffByExplanation(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    dupe_cutoff = copy.deepcopy(cutoff)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    # Keep an unmodified copy: entries of cluster_dict_arrays are overwritten with None below
    dupe_cluster_dict_arrays = copy.deepcopy(cluster_dict_arrays)
    explanations = []
    explanation_cutoffs = []
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c])-1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if not cluster:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # For each cluster
        explained_cutoff = []
        explained_cutoff_value = []
        for cl in range(len(clusters)):
            if len(clusters[cl]) == 0:
                print ("Skipped")
                continue
            cluster_explanation, winning_index = webapi.getHighestScore(clusters[cl])
            explained_cutoff.append(cluster_explanation+",")

            dict_index = 0
            for h in range(len(dupe_cluster_dict_arrays[c])):
                if dupe_cluster_dict_arrays[c][h] == clusters[cl][winning_index]:
                    dict_index = h
            explained_cutoff_value.append(dupe_cutoff[c][dict_index])
        explanations.append(explained_cutoff)
        explanation_cutoffs.append(explained_cutoff_value)
    dt.write2dArray(explanations, "../data/movies/rules/final_names/"+file_name+"WVN.txt")
    dt.write2dArray(explanation_cutoffs, "../data/movies/rules/final_cutoff/"+file_name+".txt")
Code example #9
def getDissimilarityMatrixSparse(tf):
    tflen = tf.shape[0]
    dm = np.empty([tflen, tflen], dtype="float64")
    pithing = 2 / pi
    norms = np.empty(tflen, dtype="float64")

    #Calculate norms
    for ei in range(tflen):
        norms[ei] = spl.norm(tf[ei])
        print("norm", ei)

    dot_product = np.zeros([tflen, tflen], dtype="float64")

    # Reuse cached dot products if they were saved by a previous run
    use_old_dp = dt.fileExists("dotproduct.temp")
    if use_old_dp:
        dot_product = dt.import2dArray("dotproduct.temp")
    else:
        #Calculate dot products
        for ei in range(tflen):
            for ej in range(tflen):
                if dot_product[ej][ei] != 0:
                    dot_product[ei][ej] = dot_product[ej][ei]
                    continue
                dot_product[ei][ej] = tf[ei].dot(tf[ej].T)[0, 0]
            print("dp", ei)
        dt.write2dArray(dot_product, "dotproduct.temp")

    norm_multiplied = np.empty([tflen, tflen], dtype="float64")

    # Calculate the products of the norms
    for ei in range(tflen):
        for ej in range(tflen):
            norm_multiplied[ei][ej] = norms[ei] * norms[ej]
        print("norm product", ei)

    norm_multiplied = dt.shortenFloatsNoFn(norm_multiplied)
    dot_product = dt.shortenFloatsNoFn(dot_product)

    #Get angular differences
    for ei in range(tflen):
        for ej in range(tflen):
            ang = pithing * np.arccos(
                dot_product[ei][ej] / norm_multiplied[ei][ej])
            dm[ei][ej] = ang
        print(ei)
    return dm
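
The nested loops above compute the normalised angular distance dm[i][j] = (2/pi) * arccos(cos_sim(i, j)). A vectorised NumPy sketch of the same quantity for a small dense matrix, useful as a sanity check on the loop version (the function name angular_dissimilarity is hypothetical):

import numpy as np

def angular_dissimilarity(dense_tf):
    """Normalised angular distance between all row pairs of a dense matrix."""
    x = np.asarray(dense_tf, dtype=np.float64)
    norms = np.linalg.norm(x, axis=1, keepdims=True)
    cos = (x / norms) @ (x / norms).T
    cos = np.clip(cos, -1.0, 1.0)          # guard against rounding outside [-1, 1]
    return (2.0 / np.pi) * np.arccos(cos)
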
Code example #10
def getCutOff(cluster_dict_fn,  rankings_fn, file_name):

    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    rankings = dt.importDiscreteVectors(rankings_fn)

    for r in rankings:
        for a in range(len(r)):
            r[a] = int(r[a][:-1])

    cutoff_clusters = []
    counter = 0
    for key, value in cluster_dict.items():
        value.insert(0, key)
        cutoffs = []
        for v in value:
            max_score = 0
            cutoff = 0
            # The class labels for this word do not depend on the threshold, so load them once
            y_test = dt.import1dArray("../data/movies/bow/frequency/phrases/class-" + v, "i")
            for i in range(1, 101):
                y_pred = []
                for ve in range(len(rankings[counter])):
                    rank = rankings[counter][ve]
                    if rank > i:
                        y_pred.append(0)
                    else:
                        y_pred.append(1)
                score = cohen_kappa_score(y_test, y_pred)
                print(v, int(i), "Score", score)
                if score > max_score:
                    max_score = score
                    cutoff = i
            cutoffs.append(cutoff)
            print("Cutoff for", v, "On", key, "Was", str(cutoff))
        cutoff_clusters.append(cutoffs)
        counter+=1
    dt.write2dArray(cutoff_clusters, "../data/movies/rules/cutoff/"+file_name+".txt")
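
A compact, self-contained illustration of the threshold search above: for one word, sweep the rank-percentile cutoff from 1 to 100 and keep the value that maximises Cohen's kappa against the word's binary class labels. The arrays here are toy data, not the project's files:

import numpy as np
from sklearn.metrics import cohen_kappa_score

rng = np.random.default_rng(0)
ranks = rng.integers(1, 101, size=200)              # discrete rank percentile per entity
y_true = (ranks <= 30).astype(int)                  # toy "has this property" labels
flip = rng.random(200) < 0.1                        # add some label noise
y_true = np.where(flip, 1 - y_true, y_true)

best_cutoff, best_kappa = 0, -1.0
for i in range(1, 101):
    y_pred = (ranks <= i).astype(int)               # entities ranked at or above the cutoff
    kappa = cohen_kappa_score(y_true, y_pred)
    if kappa > best_kappa:
        best_kappa, best_cutoff = kappa, i
print("best cutoff", best_cutoff, "kappa", round(best_kappa, 3))
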
Code example #11
dir_ids = [212,368]
classes = ["horror", "comedy"]

# Create direction graph
file_name = "f200geE300DS[200]DN0.5CTgenresHAtanhCV1 S0 SFT0 allL0"
cluster_fn = "100ndcg KMeans CA400 MC1 MS0.4 ATS1000 DS400"

class1 = np.asarray(dt.import1dArray("../data/movies/classify/genres/class-" + classes[0]), "i")
class2 = np.asarray(dt.import1dArray("../data/movies/classify/genres/class-" + classes[1]), "i")


top_indexes = dt.import1dArray("../data/movies/top_250_imdb.txt", "i")

data_type = "movies"
directions = dt.import2dArray("../data/"+data_type+"/cluster/clusters/" + file_name + cluster_fn + ".txt")
d_names = dt.import1dArray("../data/"+data_type+"/cluster/names/" + file_name + cluster_fn + ".txt")
entities = np.asarray(dt.import2dArray("../data/"+data_type+"/nnet/spaces/"+file_name+".txt"))
e_names = np.asarray(dt.import1dArray("../data/" +data_type+"/nnet/spaces/entitynames.txt"))

class1 = class1[top_indexes]
class2 = class2[top_indexes]

confirmed_indexes = []
for c in range(len(class1)):
    if class1[c] == 1:
        confirmed_indexes.append(c)
for c in range(len(class2)):
    if class2[c] == 1:
        confirmed_indexes.append(c)
Code example #12
    def __init__(self,
                 features_fn,
                 classes_fn,
                 class_names_fn,
                 cluster_names_fn,
                 filename,
                 training_data,
                 max_depth=None,
                 balance=None,
                 criterion="entropy",
                 save_details=False,
                 data_type="movies",
                 cv_splits=5,
                 csv_fn="../data/temp/no_csv_provided.csv",
                 rewrite_files=False,
                 split_to_use=-1,
                 development=False,
                 limit_entities=False,
                 limited_label_fn=None,
                 vector_names_fn=None,
                 clusters_fn="",
                 cluster_duplicates=False,
                 save_results_so_far=False,
                 multi_label=False):

        label_names = dt.import1dArray(class_names_fn)

        filename = filename + str(max_depth)

        all_fns = []
        file_names = ['ACC ' + filename, 'F1 ' + filename]
        acc_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[
            0] + '.scores'
        prediction_fn = '../data/' + data_type + '/rules/tree_output/' + filename + '.scores'
        f1_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[
            1] + '.scores'
        all_top_names_fn = "../data/" + data_type + "/rules/names/" + filename + ".txt"
        all_top_rankings_fn = "../data/" + data_type + "/rules/rankings/" + filename + ".txt"
        all_top_clusters_fn = "../data/" + data_type + "/rules/clusters/" + filename + ".txt"

        fns_name = "../data/" + data_type + "/rules/names/" + filename + label_names[
            0] + ".txt"
        features_name = "../data/" + data_type + "/rules/rankings/" + filename + label_names[
            0] + ".txt"
        dt_clusters_name = "../data/" + data_type + "/rules/clusters/" + filename + label_names[
            0] + ".txt"
        if save_details is False:
            all_fns = [acc_fn, f1_fn, prediction_fn, csv_fn]
        else:
            new_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[
                0] + " " + filename + '.png'
            all_fns = [acc_fn, f1_fn, prediction_fn, csv_fn]

        if max_depth is not None:
            all_fns.append(all_top_names_fn)
            all_fns.append(all_top_rankings_fn)
            all_fns.append(all_top_clusters_fn)

        if save_details:
            orig_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[
                0] + " " + filename + 'orig.txt'
            # all_fns.append(orig_dot_file_fn)
            model_name_fn = "../data/" + data_type + "/rules/tree_model/" + label_names[
                0] + " " + filename + ".model"
            #all_fns.append(model_name_fn)

        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "DecisionTree")
            return
        else:
            print("Running task", "DecisionTree")

        vectors = np.asarray(dt.import2dArray(features_fn))
        if data_type == "sentiment":  # If it's just a binary class...
            labels = np.asarray(dt.import1dArray(classes_fn, "i"))
        else:
            labels = np.asarray(dt.import2dArray(classes_fn, "i"))

            print("vectors", len(vectors), len(vectors[0]))
            print("labels", len(labels), len(labels[0]))

        if data_type == "sentiment" or len(vectors) != len(labels[0]):
            vectors = vectors.transpose()

        print("vectors", len(vectors), len(vectors[0]))
        cluster_names = dt.import2dArray(cluster_names_fn, "s")
        clusters = dt.import2dArray(clusters_fn, "f")
        original_vectors = vectors

        if "ratings" in classes_fn:
            orig_path = "/".join(classes_fn.split("/")[:-1]) + "/"
            match_ids_fn = orig_path + "matched_ids.txt"
            if os.path.exists(match_ids_fn):
                matched_ids = dt.import1dArray(match_ids_fn, "i")
            else:
                vector_names = dt.import1dArray(vector_names_fn)
                limited_labels = dt.import1dArray(limited_label_fn)
                matched_ids = dt.match_entities(vector_names, limited_labels)
                dt.write1dArray(matched_ids, match_ids_fn)
            vectors = vectors[matched_ids]
            print("vectors", len(vectors))
        print("Past limit entities")

        for l in range(len(label_names)):
            if label_names[l][:6] == "class-":
                label_names[l] = label_names[l][6:]

        f1_array = []
        accuracy_array = []
        prec_array = []
        recall_array = []

        if not multi_label and data_type != "sentiment":
            labels = labels.transpose()
            print("labels transposed")
            print("labels", len(labels), len(labels[0]))
        else:
            labels = [labels]

        all_top_clusters = []
        all_top_rankings = []
        all_top_names = []
        all_top_inds = []

        all_y_test = []
        all_predictions = []
        print("At label prediction")
        for l in range(len(labels)):

            # Select training data with cross-validation
            ac_y_test = []
            cv_acc = []
            cv_prec = []
            cv_recall = []
            c = 0
            # If doing cross-validation
            if cv_splits > 1:
                ac_x_train, ac_y_train, ac_x_test, ac_y_test, ac_x_dev, ac_y_dev = split_data.crossValData(
                    cv_splits, vectors, labels[l])
            else:
                x_train, y_train, x_test, y_test, x_dev, y_dev = split_data.splitData(
                    vectors, labels[l], data_type)
                ac_x_train = [x_train]
                ac_y_train = [y_train]
                ac_x_test = [x_test]
                ac_y_test = [y_test]
                ac_x_dev = [x_dev]
                ac_y_dev = [y_dev]
            if development:
                ac_x_test = ac_x_dev
                ac_y_test = ac_y_dev

            predictions = []
            for splits in range(len(ac_y_test)):
                model_name_fn = "../data/" + data_type + "/rules/tree_model/" + label_names[
                    l] + " " + filename + ".model"
                """
                if dt.fileExists(model_name_fn) and not rewrite_files:
                    try:
                        clf = joblib.load(model_name_fn)
                    except KeyError:
                        print(model_name_fn) # If a model is disrupted partway through its processing
                else:
                """
                clf = tree.DecisionTreeClassifier(max_depth=max_depth,
                                                  criterion=criterion,
                                                  class_weight=balance)
                clf.fit(ac_x_train[splits], ac_y_train[splits])
                joblib.dump(clf, model_name_fn)
                predictions.append(clf.predict(ac_x_test[splits]))

            ac_y_test = list(ac_y_test)
            predictions = list(predictions)

            for i in range(len(predictions)):
                # NOTE: per-split scoring (populating accuracy_array, prec_array, recall_array,
                # all_y_test and all_predictions) appears to be missing from this excerpt
                class_names = ["NOT " + label_names[l], label_names[l]]

                # Export a tree for each label predicted by the clf
                if save_details:
                    orig_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[
                        l] + " " + filename + 'orig.txt'
                    new_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[
                        l] + " " + filename + '.txt'
                    orig_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[
                        l] + " " + filename + 'orig.png'
                    new_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[
                        l] + " " + filename + '.png'
                    orig_temp_graph_png_fn = '../data/' + data_type + '/rules/tree_temp/' + label_names[
                        l] + " " + filename + 'orig.png'
                    new_temp_graph_png_fn = '../data/' + data_type + '/rules/tree_temp/' + label_names[
                        l] + " " + filename + '.png'
                    output_names = []
                    for cn in cluster_names:
                        line = ""
                        counter = 0
                        for w in range(len(cn)):
                            line = line + cn[w] + " "
                            counter += 1
                            if counter == 8:
                                break
                        output_names.append(line)
                    failed = False
                    try:
                        tree.export_graphviz(
                            clf,
                            feature_names=output_names,
                            class_names=class_names,
                            out_file=orig_dot_file_fn,
                            max_depth=max_depth,
                            label='all',
                            filled=True,
                            impurity=True,
                            node_ids=True,
                            proportion=True,
                            rounded=True,
                        )
                    except FileNotFoundError:
                        try:
                            orig_dot_file_fn = "//?/" + orig_dot_file_fn
                            tree.export_graphviz(clf,
                                                 feature_names=output_names,
                                                 class_names=class_names,
                                                 out_file=orig_dot_file_fn,
                                                 max_depth=max_depth,
                                                 label='all',
                                                 filled=True,
                                                 impurity=True,
                                                 node_ids=True,
                                                 proportion=True,
                                                 rounded=True)

                        except FileNotFoundError:
                            failed = True
                            print("doesnt work fam")
                    if failed == False:
                        rewrite_dot_file = dt.import1dArray(orig_dot_file_fn)
                        new_dot_file = []
                        max = 3
                        min = -3
                        """
                        for f in original_vectors:
                            for n in f:
                                if n > max:
                                    max = n
                                if n < min:
                                    min = n
                        """
                        print(max)
                        print(min)
                        boundary = max - min
                        boundary = boundary / 5
                        bound_1 = 0 - boundary * 2
                        bound_2 = 0 - boundary * 1
                        bound_3 = 0
                        bound_4 = 0 + boundary
                        bound_5 = 0 + boundary * 2
                        for s in rewrite_dot_file:
                            if ":" in s:
                                s = s.split("<=")
                                no_num = s[0]
                                num = s[1]
                                num = num.split()
                                end = " ".join(num[:-1])
                                num_split = num[0].split("\\")
                                num = num_split[0]
                                end = end[len(num):]
                                num = float(num)
                                replacement = ""
                                if num <= bound_2:
                                    replacement = "VERY LOW"
                                elif num <= bound_3:
                                    replacement = "VERY LOW - LOW"
                                elif num <= bound_4:
                                    replacement = "VERY LOW - AVERAGE"
                                elif num <= bound_5:
                                    replacement = "VERY LOW - HIGH"
                                elif num >= bound_5:
                                    replacement = "VERY HIGH"
                                new_string_a = [no_num, replacement, end]
                                new_string = " ".join(new_string_a)
                                new_dot_file.append(new_string)
                                if "]" in new_string:
                                    if '"' not in new_string[len(new_string) -
                                                             10:]:
                                        for c in range(len(new_string)):
                                            if new_string[c + 1] == "]":
                                                new_string = new_string[:
                                                                        c] + '"' + new_string[
                                                                            c:]
                                                break
                            else:
                                new_dot_file.append(s)
                            """
                            new_string = s
                            if "->" not in s and "digraph" not in s and "node" not in s and "(...)" not in s and "}" not in s:
                                index = s.index("value")
                                new_string = s[:index] + '"] ;'
                            new_dot_file.append(new_string)
                            """
                            #new_dot_file.append(s)
                        dt.write1dArray(new_dot_file, new_dot_file_fn)
                        try:
                            orig_graph = pydot.graph_from_dot_file(
                                orig_dot_file_fn)
                            new_graph = pydot.graph_from_dot_file(
                                new_dot_file_fn)
                            orig_graph.write_png(orig_graph_png_fn)
                            new_graph.write_png(new_graph_png_fn)
                            orig_graph.write_png(orig_temp_graph_png_fn)
                            new_graph.write_png(new_temp_graph_png_fn)
                        except FileNotFoundError:
                            orig_graph_png_fn = "//?/" + orig_graph_png_fn
                            try:
                                orig_graph.write_png(orig_graph_png_fn)
                                new_graph_png_fn = "//?/" + new_graph_png_fn
                                new_graph.write_png(new_graph_png_fn)
                            except FileNotFoundError:
                                print("failed graph")

                    self.get_code(clf, output_names, class_names,
                                  label_names[l] + " " + filename, data_type)
                    dt_clusters, features, fns, inds = self.getNodesToDepth(
                        clf, original_vectors, cluster_names, clusters)
                    print(filename + label_names[l])
                    fns_name = "../data/" + data_type + "/rules/names/" + filename + label_names[
                        l] + ".txt"
                    features_name = "../data/" + data_type + "/rules/rankings/" + filename + label_names[
                        l] + ".txt"
                    dt_clusters_name = "../data/" + data_type + "/rules/clusters/" + filename + label_names[
                        l] + ".txt"
                    dt.write2dArray(fns, fns_name)
                    dt.write2dArray(features, features_name)
                    dt.write2dArray(dt_clusters, dt_clusters_name)
                    all_top_rankings.extend(features)
                    all_top_clusters.extend(dt_clusters)
                    all_top_names.extend(fns)
                    all_top_inds.extend(inds)

        print("len clusters", len(all_top_clusters))
        print("len rankings", len(all_top_rankings))
        print("len names", len(all_top_names))

        if len(all_top_clusters) != len(all_top_rankings) or len(
                all_top_clusters) != len(all_top_names):
            print("stop")

        accuracy_array = np.asarray(accuracy_array)
        accuracy_average = np.average(accuracy_array)

        prec_array = np.asarray(prec_array)
        average_prec = np.average(prec_array)

        recall_array = np.asarray(recall_array)
        average_recall = np.average(recall_array)

        f1_average = 2 * ((average_prec * average_recall) /
                          (average_prec + average_recall))

        if math.isnan(f1_average):
            print("NAN", prec, recall)
            f1_average = 0.0
        all_y_test = np.asarray(all_y_test)
        all_predictions = np.asarray(all_predictions)

        micro_average = f1_score(all_y_test, all_predictions, average="micro")

        accuracy_array = accuracy_array.tolist()

        accuracy_array.append(accuracy_average)
        accuracy_array.append(0.0)

        f1_array.append(f1_average)
        f1_array.append(micro_average)

        scores = [accuracy_array, f1_array]

        dt.write1dArray(accuracy_array, acc_fn)
        dt.write1dArray(f1_array, f1_fn)
        dt.write2dArray(all_predictions, prediction_fn)

        if dt.fileExists(csv_fn):
            print("File exists, writing to csv")
            try:
                dt.write_to_csv(csv_fn, file_names, scores)
            except PermissionError:
                print("CSV FILE WAS OPEN, SKIPPING")
            except ValueError:
                print("File does not exist, recreating csv")
                key = []
                for l in label_names:
                    key.append(l)
                key.append("AVERAGE")
                key.append("MICRO AVERAGE")
                dt.write_csv(csv_fn, file_names, scores, key)
        else:
            print("File does not exist, recreating csv")
            key = []
            for l in label_names:
                key.append(l)
            key.append("AVERAGE")
            key.append("MICRO AVERAGE")
            dt.write_csv(csv_fn, file_names, scores, key)

        if max_depth is not None:
            all_top_names = np.asarray(all_top_names)
            all_top_rankings = np.asarray(all_top_rankings)
            all_top_clusters = np.asarray(all_top_clusters)
            all_top_inds = np.asarray(all_top_inds)

            if cluster_duplicates:
                ind_to_keep = np.unique(all_top_inds, return_index=True)[1]
                all_top_names = all_top_names[ind_to_keep]
                all_top_rankings = all_top_rankings[ind_to_keep]
                all_top_clusters = all_top_clusters[ind_to_keep]

            dt.write2dArray(all_top_names, all_top_names_fn)
            dt.write2dArray(all_top_rankings, all_top_rankings_fn)
            dt.write2dArray(all_top_clusters, all_top_clusters_fn)
Code example #13
    def __init__(self,
                 features_fn,
                 classes_fn,
                 class_names_fn,
                 cluster_names_fn,
                 filename,
                 max_depth=None,
                 balance=None,
                 criterion="entropy",
                 save_details=False,
                 data_type="movies",
                 cv_splits=5,
                 csv_fn="../data/temp/no_csv_provided.csv",
                 rewrite_files=True,
                 split_to_use=-1,
                 development=False,
                 limit_entities=False,
                 limited_label_fn=None,
                 vector_names_fn=None,
                 pruning=1,
                 save_results_so_far=False):

        vectors = np.asarray(dt.import2dArray(features_fn)).transpose()

        labels = np.asarray(dt.import2dArray(classes_fn, "i"))

        print("vectors", len(vectors), len(vectors[0]))
        print("labels", len(labels), len(labels[0]))
        print("vectors", len(vectors), len(vectors[0]))
        cluster_names = dt.import1dArray(cluster_names_fn)
        label_names = dt.import1dArray(class_names_fn)
        all_fns = []
        file_names = ['ACC J48' + filename, 'F1 J48' + filename]
        acc_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[
            0] + '.scores'
        f1_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[
            1] + '.scores'
        all_fns.append(acc_fn)
        all_fns.append(f1_fn)
        all_fns.append(csv_fn)

        print(dt.allFnsAlreadyExist(all_fns), rewrite_files)

        if dt.allFnsAlreadyExist(
                all_fns) and not rewrite_files or save_results_so_far:
            print("Skipping task", "Weka Tree")
            return
        else:
            print("Running task", "Weka Tree")

        for l in range(len(cluster_names)):
            cluster_names[l] = cluster_names[l].split()[0]
        """
        for l in range(len(label_names)):
            if label_names[l][:6] == "class-":
                label_names[l] = label_names[l][6:]
        """
        f1_array = []
        accuracy_array = []

        labels = labels.transpose()
        print("labels transposed")
        print("labels", len(labels), len(labels[0]))

        if limit_entities is False:
            vector_names = dt.import1dArray(vector_names_fn)
            limited_labels = dt.import1dArray(limited_label_fn)
            vectors = np.asarray(
                dt.match_entities(vectors, limited_labels, vector_names))

        all_y_test = []
        all_predictions = []
        for l in range(len(labels)):

            if balance:
                new_vectors, new_labels = dt.balanceClasses(vectors, labels[l])
            else:
                new_vectors = vectors
                new_labels = labels[l]
            # Select training data with cross validation

            ac_y_test = []
            ac_y_train = []
            ac_x_train = []
            ac_x_test = []
            ac_y_dev = []
            ac_x_dev = []
            cv_f1 = []
            cv_acc = []
            if cv_splits == 1:
                kf = KFold(n_splits=3, shuffle=False, random_state=None)
            else:
                kf = KFold(n_splits=cv_splits,
                           shuffle=False,
                           random_state=None)
            c = 0
            for train, test in kf.split(new_vectors):
                if split_to_use > -1:
                    if c != split_to_use:
                        c += 1
                        continue
                ac_y_test.append(new_labels[test])
                ac_y_train.append(new_labels[train[int(len(train) * 0.2):]])
                val = int(len(train) * 0.2)
                t_val = train[val:]
                nv_t_val = new_vectors[t_val]
                ac_x_train.append(nv_t_val)
                ac_x_test.append(new_vectors[test])
                ac_x_dev.append(new_vectors[train[:int(len(train) * 0.2)]])
                ac_y_dev.append(new_labels[train[:int(len(train) * 0.2)]])
                c += 1
                if cv_splits == 1:
                    break

            predictions = []
            rules = []

            if development:
                ac_x_test = np.copy(np.asarray(ac_x_dev))
                ac_y_test = np.copy(np.asarray(ac_y_dev))

            train_fn = "../data/" + data_type + "/weka/data/" + filename + "Train.txt"
            test_fn = "../data/" + data_type + "/weka/data/" + filename + "Test.txt"

            for splits in range(len(ac_y_test)):

                # Get the weka predictions
                dt.writeArff(ac_x_train[splits], [ac_y_train[splits]],
                             [label_names[l]],
                             train_fn,
                             header=True)
                dt.writeArff(ac_x_test[splits], [ac_y_test[splits]],
                             [label_names[l]],
                             test_fn,
                             header=True)
                prediction, rule = self.getWekaPredictions(
                    train_fn + label_names[l] + ".arff",
                    test_fn + label_names[l] + ".arff", save_details,
                    pruning)
                predictions.append(prediction)
                rules.append(rule)

            for i in range(len(predictions)):
                if len(predictions) == 1:
                    all_y_test.append(ac_y_test[i])
                    all_predictions.append(predictions[i])
                f1 = f1_score(ac_y_test[i], predictions[i], average="binary")
                accuracy = accuracy_score(ac_y_test[i], predictions[i])
                cv_f1.append(f1)
                cv_acc.append(accuracy)
                scores = [[label_names[l], "f1", f1, "accuracy", accuracy]]
                print(scores)

                # Export a tree for each label predicted by the clf, not sure if this is needed...
                if save_details:
                    data_fn = "../data/" + data_type + "/rules/weka_rules/" + label_names[
                        l] + " " + filename + ".txt"
                    class_names = [label_names[l], "NOT " + label_names[l]]
                    #self.get_code(clf, cluster_names, class_names, label_names[l] + " " + filename, data_type)
                    dt.write1dArray(rules[i].split("\n"), data_fn)
                    dot_file = dt.import1dArray(data_fn)
                    new_dot_file = []
                    for line in dot_file:
                        if "->" not in line and "label" in line and '"t ' not in line and '"f ' not in line:
                            line = line.split('"')
                            line[1] = '"' + cluster_names[int(line[1])] + '"'
                            line = "".join(line)
                        new_dot_file.append(line)
                    dt.write1dArray(new_dot_file, data_fn)
                    graph = pydot.graph_from_dot_file(data_fn)
                    graph.write_png("../data/" + data_type +
                                    "/rules/weka_images/" + label_names[l] +
                                    " " + filename + ".png")
            f1_array.append(np.average(np.asarray(cv_f1)))
            accuracy_array.append(np.average(np.asarray(cv_acc)))

        accuracy_array = np.asarray(accuracy_array)
        accuracy_average = np.average(accuracy_array)
        accuracy_array = accuracy_array.tolist()
        f1_array = np.asarray(f1_array)
        f1_average = np.average(f1_array)
        f1_array = f1_array.tolist()
        micro_average = f1_score(np.asarray(all_y_test),
                                 np.asarray(all_predictions),
                                 average="micro")

        print("Micro F1", micro_average)

        accuracy_array.append(accuracy_average)
        accuracy_array.append(0.0)

        f1_array.append(f1_average)
        f1_array.append(micro_average)

        scores = [accuracy_array, f1_array]

        dt.write1dArray(accuracy_array, acc_fn)
        dt.write1dArray(f1_array, f1_fn)

        print(csv_fn)
        if dt.fileExists(csv_fn):
            print("File exists, writing to csv")
            try:
                dt.write_to_csv(csv_fn, file_names, scores)
            except PermissionError:
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                dt.write_to_csv(
                    csv_fn[:len(csv_fn) - 4] + str(random.random()) +
                    "FAIL.csv", file_names, scores)
        else:
            print("File does not exist, recreating csv")
            key = []
            for l in label_names:
                key.append(l)
            key.append("AVERAGE")
            key.append("MICRO AVERAGE")
            dt.write_csv(csv_fn, file_names, scores, key)
Code example #14
    def __init__(self,
                 vector_path,
                 class_path,
                 property_names_fn,
                 file_name,
                 svm_type,
                 training_size=10000,
                 lowest_count=200,
                 highest_count=21470000,
                 get_kappa=True,
                 get_f1=True,
                 single_class=True,
                 data_type="movies",
                 getting_directions=True,
                 threads=1,
                 chunk_amt=0,
                 chunk_id=0,
                 rewrite_files=False,
                 classification="all",
                 loc="../data/"):

        self.get_kappa = True
        self.get_f1 = get_f1
        self.data_type = data_type
        self.classification = classification
        self.lowest_amt = lowest_count
        self.higher_amt = highest_count

        if chunk_amt > 0:
            file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(
                chunk_amt)

        directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
        ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
        kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
        acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"

        all_fns = [directions_fn, kappa_fn]
        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "getSVMResults")
            return
        else:
            print("Running task", "getSVMResults")

        y_train = 0
        y_test = 0
        vectors = np.asarray(dt.import2dArray(vector_path))
        print("imported vectors")
        if not getting_directions:
            classes = np.asarray(dt.import2dArray(class_path))
            print("imported classes")
        property_names = dt.import1dArray(property_names_fn)
        print("imported propery names")
        if chunk_amt > 0:
            if chunk_id == chunk_amt - 1:
                chunk = int(len(property_names) / chunk_amt)
                multiply = chunk_amt - 1
                property_names = property_names[chunk * multiply:]
            else:
                property_names = dt.chunks(
                    property_names, int(
                        (len(property_names) / chunk_amt)))[chunk_id]

        if not getting_directions:
            x_train, x_test, y_train, y_test = train_test_split(vectors,
                                                                classes,
                                                                test_size=0.3,
                                                                random_state=0)
        else:
            x_train = vectors
            x_test = vectors

        if get_f1:
            y_train = y_train.transpose()
            y_test = y_test.transpose()
            print("transpoosed")
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        if self.get_f1 is False:
            print("running svms")
            kappa_scores, directions, ktau_scores, property_names = self.runAllSVMs(
                y_test, y_train, property_names, file_name, svm_type,
                getting_directions, threads)

            dt.write1dArray(kappa_scores, kappa_fn)
            dt.write2dArray(directions, directions_fn)
            dt.write1dArray(ktau_scores, ktau_scores_fn)
            dt.write1dArray(property_names,
                            property_names_fn + file_name + ".txt")
        else:
            final_f1 = []
            final_acc = []
            for y in range(len(y_train)):
                f1, acc = self.runClassifySVM(y_test[y], y_train[y])
                print(f1, acc)
                final_f1.append(f1)
                final_acc.append(acc)
            dt.write1dArray(final_f1, ktau_scores_fn)
            dt.write1dArray(final_acc, acc_fn)
Code example #15
def main(data_type, vector_size, window_size, min_count, sampling_threshold,
         negative_size, train_epoch, dm, worker_count, train_wv,
         concatenate_wv, use_hierarchical_softmax):
    file_name = "Doc2Vec" + " VS" + str(vector_size) + " WS" + str(window_size) + " MC" + str(min_count) + " ST" + str(
        sampling_threshold) + \
                " NS" + str(negative_size) + " TE" + str(train_epoch) + " DM" + str(dm) + " WC" + str(
        worker_count) + "spacy"
    " NS" + str(negative_size) + " TE" + str(train_epoch) + " DM" + str(dm) + " WC" + str(worker_count) + \
    " TW" + str(train_wv) + " CW" + str(concatenate_wv) + " HS" + str(use_hierarchical_softmax)

    corpus_fn = "../data/raw/" + data_type + "/corpus_processed.txt"

    if os.path.exists(corpus_fn) is False:
        x_train = np.load("../data/raw/" + data_type + "/x_train_w.npy")
        x_test = np.load("../data/raw/" + data_type + "/x_test_w.npy")
        corpus = np.concatenate((x_train, x_test), axis=0)
        text_corpus = np.empty(len(corpus), dtype=object)
        for i in range(len(corpus)):
            text_corpus[i] = " ".join(corpus[i])
            print(text_corpus[i])
        dt.write1dArray(text_corpus, corpus_fn)

    embedding_fn = "/home/tom/Downloads/glove.6B/glove.6B.300d.txt"

    model_fn = "../data/" + data_type + "/doc2vec/" + file_name + ".bin"
    vector_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".npy"
    score_fn = "../data/" + data_type + "/doc2vec/" + file_name + "catacc.score"

    if os.path.exists(model_fn):
        print("Imported model")
        model = g.utils.SaveLoad.load(model_fn)
    elif file_name[:7] == "Doc2Vec":
        model = doc2Vec(embedding_fn, corpus_fn, vector_size, window_size,
                        min_count, sampling_threshold, negative_size,
                        train_epoch, dm, worker_count, train_wv,
                        concatenate_wv, use_hierarchical_softmax)
        model.save(model_fn)

    if os.path.exists(vector_fn) is False:
        vectors = []
        for d in range(len(model.docvecs)):
            vectors.append(model.docvecs[d])
        np.save(vector_fn, vectors)
    else:
        print("Imported vectors")
        vectors = np.load(vector_fn)

    if os.path.exists(score_fn) is False or file_name[:7] != "Doc2Vec":
        print("Getting score")
        if data_type == "sentiment":
            classes = dt.import1dArray(
                "../data/" + data_type + "/classify/" + data_type +
                "/class-all", "i")
            x_train, y_train, x_test, y_test = sentiment.getSplits(
                vectors, classes)
            scores = linearSVMScore(x_train, y_train, x_test, y_test)
        else:
            classes = dt.import2dArray(
                "../data/" + data_type + "/classify/" + data_type +
                "/class-all", "i")
            x_train, y_train, x_test, y_test = newsgroups.getSplits(
                vectors, classes)
            scores = multiClassLinearSVM(x_train, y_train, x_test, y_test)
        print(scores)
        dt.write1dArray(scores, score_fn)
Code example #16
    def __init__(self,
                 class_path=None,
                 get_scores=False,
                 randomize_finetune_weights=False,
                 dropout_noise=None,
                 amount_of_hidden=0,
                 epochs=1,
                 learn_rate=0.01,
                 loss="mse",
                 batch_size=1,
                 past_model_bias_fn=None,
                 identity_swap=False,
                 reg=0.0,
                 amount_of_finetune=[],
                 output_size=25,
                 hidden_activation="tanh",
                 layer_init="glorot_uniform",
                 output_activation="tanh",
                 deep_size=None,
                 corrupt_finetune_weights=False,
                 split_to_use=-1,
                 hidden_layer_size=100,
                 file_name="unspecified_filename",
                 vector_path=None,
                 is_identity=False,
                 finetune_size=0,
                 data_type="movies",
                 optimizer_name="rmsprop",
                 noise=0.0,
                 fine_tune_weights_fn=None,
                 past_model_weights_fn=None,
                 from_ae=True,
                 save_outputs=False,
                 label_names_fn="",
                 rewrite_files=False,
                 cv_splits=1,
                 cutoff_start=0.2,
                 development=False,
                 class_weight=None,
                 csv_fn=None,
                 tune_vals=False,
                 get_nnet_vectors_path=None,
                 classification_name="all",
                 limit_entities=False,
                 limited_label_fn="",
                 vector_names_fn="",
                 identity_activation="linear",
                 loc="../data/",
                 lock_weights_and_redo=False):

        weights_fn = loc + data_type + "/nnet/weights/" + file_name + "L0.txt"
        bias_fn = loc + data_type + "/nnet/bias/" + file_name + "L0.txt"
        rank_fn = loc + data_type + "/nnet/clusters/" + file_name + ".txt"

        all_fns = [weights_fn, bias_fn, rank_fn]
        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "nnet")
            return
        else:
            print("Running task", "nnet")

        self.class_path = class_path
        self.learn_rate = learn_rate
        self.epochs = epochs
        self.loss = loss
        self.batch_size = batch_size
        self.hidden_activation = hidden_activation
        self.layer_init = layer_init
        self.output_activation = output_activation
        self.hidden_layer_size = hidden_layer_size
        self.file_name = file_name
        self.vector_path = vector_path
        self.dropout_noise = dropout_noise
        self.finetune_size = finetune_size
        self.get_scores = get_scores
        self.reg = reg
        self.amount_of_finetune = amount_of_finetune
        self.amount_of_hidden = amount_of_hidden
        self.output_size = output_size
        self.identity_swap = identity_swap
        self.deep_size = deep_size
        self.from_ae = from_ae
        self.is_identity = is_identity
        self.randomize_finetune_weights = randomize_finetune_weights
        self.corrupt_finetune_weights = corrupt_finetune_weights
        self.fine_tune_weights_fn = fine_tune_weights_fn
        self.identity_activation = identity_activation
        self.lock_weights_and_redo = lock_weights_and_redo

        print(data_type)

        # Map the optimizer_name string onto a Keras optimizer instance.
        if optimizer_name == "adagrad":
            self.optimizer = Adagrad()
        elif optimizer_name == "sgd":
            self.optimizer = SGD()
        elif optimizer_name == "rmsprop":
            self.optimizer = RMSprop()
        elif optimizer_name == "adam":
            self.optimizer = Adam()
        elif optimizer_name == "adadelta":
            self.optimizer = Adadelta()
        else:
            print("optimizer not found")
            exit()

        entity_vectors = np.asarray(dt.import2dArray(self.vector_path))
        print("Imported vectors", len(entity_vectors), len(entity_vectors[0]))

        if get_nnet_vectors_path is not None:
            nnet_vectors = np.asarray(dt.import2dArray(get_nnet_vectors_path))
            print("Imported vectors", len(entity_vectors),
                  len(entity_vectors[0]))

        entity_classes = np.asarray(dt.import2dArray(self.class_path))
        print("Imported classes", len(entity_classes), len(entity_classes[0]))

        if fine_tune_weights_fn is None:
            vector_names = dt.import1dArray(vector_names_fn)
            limited_labels = dt.import1dArray(limited_label_fn)
            entity_vectors = np.asarray(
                dt.match_entities(entity_vectors, limited_labels,
                                  vector_names))

        if fine_tune_weights_fn is not None:
            if len(entity_vectors) != len(entity_classes):
                entity_classes = entity_classes.transpose()
                print("Transposed classes, now in form", len(entity_classes),
                      len(entity_classes[0]))
                """
                # IF Bow
                if len(entity_vectors[0]) != len(entity_classes[0]):
                    entity_vectors = entity_vectors.transpose()
                    print("Transposed vectors, now in form", len(entity_vectors), len(entity_vectors[0]))
                """
        elif len(entity_vectors) != len(entity_classes):
            entity_vectors = entity_vectors.transpose()
            print("Transposed vectors, now in form", len(entity_vectors),
                  len(entity_vectors[0]))

        self.input_size = len(entity_vectors[0])
        self.output_size = len(entity_classes[0])

        # With fine-tune weights supplied, build the fine-tuning network and
        # load the previous model's weights; otherwise build a plain classifier.
        if fine_tune_weights_fn is not None:
            model_builder = self.fineTuneNetwork
            weights = []
            if from_ae:
                self.past_weights = []
                past_model_weights = []
                for p in past_model_weights_fn:
                    past_model_weights.append(
                        np.asarray(dt.import2dArray(p), dtype="float64"))
                past_model_bias = []
                for p in past_model_bias_fn:
                    past_model_bias.append(
                        np.asarray(dt.import1dArray(p, "f"), dtype="float64"))

                for p in range(len(past_model_weights)):
                    past_model_weights[p] = np.around(past_model_weights[p],
                                                      decimals=6)
                    past_model_bias[p] = np.around(past_model_bias[p],
                                                   decimals=6)

                for p in range(len(past_model_weights)):
                    self.past_weights.append([])
                    self.past_weights[p].append(past_model_weights[p])
                    self.past_weights[p].append(past_model_bias[p])
            for f in fine_tune_weights_fn:
                weights.extend(dt.import2dArray(f))

            r = np.asarray(weights, dtype="float64")

            for a in range(len(r)):
                r[a] = np.around(r[a], decimals=6)

            for a in range(len(entity_classes)):
                entity_classes[a] = np.around(entity_classes[a], decimals=6)

            self.fine_tune_weights = []
            self.fine_tune_weights.append(r.transpose())
            self.fine_tune_weights.append(
                np.zeros(shape=len(r), dtype="float64"))
        else:
            model_builder = self.classifierNetwork

        # Accumulators for per-label F1 and accuracy scores
        f1_scores = []
        accuracy_scores = []
        f1_averages = []
        accuracy_averages = []

        original_fn = file_name
        # Split the entity vectors and their class labels into train/test/dev sets.
        x_train, y_train, x_test, y_test, x_dev, y_dev = split_data.splitData(
            entity_vectors, entity_classes, data_type)

        if development:
            x_test = x_dev
            y_test = y_dev

        model = model_builder()

        if get_scores:
            test_pred = model.predict(x_train).transpose()
            print(test_pred)
            highest_vals = [0.5] * len(test_pred)  # Default 0.5
            y_pred = model.predict(x_test).transpose()
            y_test = np.asarray(y_test).transpose()
            for y in range(len(y_pred)):
                y_pred[y][y_pred[y] >= highest_vals[y]] = 1
                y_pred[y][y_pred[y] < highest_vals[y]] = 0
            f1_array = []
            accuracy_array = []
            for y in range(len(y_pred)):
                accuracy_array.append(accuracy_score(y_test[y], y_pred[y]))
                f1_array.append(
                    f1_score(y_test[y], y_pred[y], average="binary"))
                print(f1_array[y])
            y_pred = y_pred.transpose()
            y_test = np.asarray(y_test).transpose()

            micro_average = f1_score(y_test, y_pred, average="micro")

            cv_f1_fn = loc + data_type + "/nnet/scores/F1 " + file_name + ".txt"
            cv_acc_fn = loc + data_type + "/nnet/scores/ACC " + file_name + ".txt"
            dt.write1dArray(f1_array, cv_f1_fn)
            dt.write1dArray(accuracy_array, cv_acc_fn)

            f1_scores.append(f1_array)
            accuracy_scores.append(accuracy_array)
            f1_average = np.average(f1_array)
            accuracy_average = np.average(accuracy_array)
            f1_averages.append(f1_average)
            accuracy_averages.append(accuracy_average)

            print("Average F1 Binary", f1_average, "Acc", accuracy_average)
            print("Micro Average F1", micro_average)

            f1_array.append(f1_average)
            f1_array.append(micro_average)
            accuracy_array.append(accuracy_average)
            # 0.0 keeps the ACC row aligned with the F1 micro-average column.
            accuracy_array.append(0.0)

            scores = [accuracy_array, f1_array]

            csv_fn = loc + data_type + "/nnet/csv/" + csv_fn + ".csv"

            file_names = [file_name + "ACC", file_name + "F1"]
            label_names = dt.import1dArray(label_names_fn)
            if dt.fileExists(csv_fn):
                print("File exists, writing to csv")
                try:
                    dt.write_to_csv(csv_fn, file_names, scores)
                except PermissionError:
                    print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
                    dt.write_to_csv(
                        csv_fn[:len(csv_fn) - 4] + str(random.random()) +
                        "FAIL.csv", [file_name], scores)
            else:
                print("File does not exist, recreating csv")
                key = []
                for l in label_names:
                    key.append(l)
                key.append("AVERAGE")
                key.append("MICRO AVERAGE")
                dt.write_csv(csv_fn, file_names, scores, key)

            if save_outputs:
                if limit_entities is False:
                    self.output_clusters = model.predict(nnet_vectors)
                else:
                    self.output_clusters = model.predict(entity_vectors)
                self.output_clusters = self.output_clusters.transpose()
                dt.write2dArray(self.output_clusters, rank_fn)

            # Write each layer's output space; when dropout is enabled every
            # other layer is a Dropout layer, so odd indices are skipped.
            for l in range(0, len(model.layers) - 1):
                if dropout_noise is not None and dropout_noise > 0.0:
                    if l % 2 == 1:
                        continue
                print("Writing", l, "layer")
                truncated_model = Sequential()
                for a in range(l + 1):
                    truncated_model.add(model.layers[a])
                truncated_model.compile(loss=self.loss, optimizer="sgd")
                if get_nnet_vectors_path is not None:
                    self.end_space = truncated_model.predict(nnet_vectors)
                else:
                    self.end_space = truncated_model.predict(entity_vectors)
                total_file_name = loc + data_type + "/nnet/spaces/" + file_name
                dt.write2dArray(self.end_space,
                                total_file_name + "L" + str(l) + ".txt")

            for l in range(len(model.layers)):
                try:
                    dt.write2dArray(
                        model.layers[l].get_weights()[0], loc + data_type +
                        "/nnet/weights/" + file_name + "L" + str(l) + ".txt")
                    dt.write1dArray(
                        model.layers[l].get_weights()[1], loc + data_type +
                        "/nnet/bias/" + file_name + "L" + str(l) + ".txt")
                except IndexError:
                    print("Layer ", str(l), "Failed")
Code example #17
        csv_rows.append((name_array[i], acc, f1, macro_f1))
        print(csv_rows[i])
    with open("../data/raw/" + data_type + "/test/reps.csv", 'wt') as f:
        writer = csv.writer(f)
        writer.writerow(("name", "acc", "micro f1", "macro f1"))
        writer.writerows(csv_rows)


if __name__ == '__main__':
    fn = "../data/newsgroups/bow/ppmi/class-all-" + str(30) + "-" + str(
        18836) + "-" + "all.npz"
    print("Testing", fn)
    testAll(
        ["mds", "finetune_space", "mds_rankings", "finetune_rankings"],
        [
            dt.import2dArray("../data/newsgroups/nnet/spaces/wvFIXED200.npy"),
            dt.import2dArray(
                "../data/newsgroups/nnet/spaces/sns_ppmi3wvFIXED200CV1S0 SFT0 allL03018836 LR kappa KMeans CA200 MC1 MS0.4 ATS2000 DS400FT BOCFi NT[200]tanh300S6040V1.2L0.npy"
            ),
            dt.import2dArray(
                "../data/newsgroups/rank/numeric/sns_ppmi3wvFIXED200CV1S0 SFT0 allL03018836 LR kappa KMeans CA400 MC1 MS0.4 ATS500 DS800.npy"
            ).transpose(),
            dt.import2dArray(
                "../data/newsgroups/nnet/clusters/sns_ppmi3wvFIXED200CV1S0 SFT0 allL03018836 LR kappa KMeans CA200 MC1 MS0.4 ATS2000 DS400FT BOCFi NT[200]tanh300S6040V1.2.npy"
            ).transpose()
        ],
        [
            dt.import2dArray(
                "../data/newsgroups/classify/newsgroups/class-all", "i"),
            dt.import2dArray(
                "../data/newsgroups/classify/newsgroups/class-all", "i"),
Code example #18
    def __init__(self,
                 vector_path,
                 class_path,
                 property_names_fn,
                 file_name,
                 svm_type,
                 training_size=10000,
                 lowest_count=200,
                 highest_count=21470000,
                 get_kappa=True,
                 get_f1=True,
                 single_class=True,
                 data_type="movies",
                 getting_directions=True,
                 threads=1,
                 chunk_amt=0,
                 chunk_id=0,
                 rewrite_files=False,
                 classification="all",
                 loc="../data/",
                 logistic_regression=False,
                 sparse_array_fn=None,
                 only_these_fn=None):

        self.get_kappa = get_kappa
        self.get_f1 = get_f1
        self.data_type = data_type
        self.classification = classification
        self.lowest_amt = lowest_count
        self.higher_amt = highest_count

        if chunk_amt > 0:
            file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(
                chunk_amt)

        directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
        ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
        kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
        acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"
        TP_fn = loc + data_type + "/svm/stats/TP " + file_name + ".txt"
        FP_fn = loc + data_type + "/svm/stats/FP " + file_name + ".txt"
        TN_fn = loc + data_type + "/svm/stats/TN " + file_name + ".txt"
        FN_fn = loc + data_type + "/svm/stats/FN " + file_name + ".txt"

        all_fns = [directions_fn, kappa_fn]
        if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
            print("Skipping task", "getSVMResults")
            return
        else:
            print("Running task", "getSVMResults")

        y_train = 0
        y_test = 0
        vectors = np.asarray(dt.import2dArray(vector_path))
        print("imported vectors")
        if not getting_directions:
            classes = np.asarray(dt.import2dArray(class_path))
            print("imported classes")

        property_names = dt.import1dArray(property_names_fn)
        print("imported propery names")
        if chunk_amt > 0:
            if chunk_id == chunk_amt - 1:
                chunk = int(len(property_names) / chunk_amt)
                multiply = chunk_amt - 1
                property_names = property_names[chunk * multiply:]
            else:
                property_names = dt.chunks(
                    property_names, int(
                        (len(property_names) / chunk_amt)))[chunk_id]

        if sparse_array_fn is not None:
            sparse_array = dt.import2dArray(sparse_array_fn)
        else:
            sparse_array = None

        if sparse_array is not None:
            for s in range(len(sparse_array)):
                if len(np.nonzero(sparse_array[s])[0]) <= 1:
                    print("WILL FAIL", s, len(np.nonzero(sparse_array[s])[0]))
                else:
                    print(len(np.nonzero(sparse_array[s])[0]))

        if not getting_directions:
            x_train, x_test, y_train, y_test = train_test_split(vectors,
                                                                classes,
                                                                test_size=0.3,
                                                                random_state=0)
        else:
            x_train = vectors
            x_test = vectors

        if get_f1:
            y_train = y_train.transpose()
            y_test = y_test.transpose()
            print("transpoosed")
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test

        if only_these_fn is not None:
            only_these = dt.import1dArray(only_these_fn, "s")
            inds = []
            for s in range(len(property_names)):
                for o in only_these:
                    if property_names[s] == o:
                        inds.append(s)
                        break
            # Fancy indexing needs arrays; the dt import helpers may return lists.
            sparse_array = np.asarray(sparse_array)[inds]
            property_names = np.asarray(property_names)[inds]

        if self.get_f1 is False:
            print("running svms")
            kappa_scores, directions, f1_scores, property_names, accs, TPs, FPs, TNs, FNs = self.runAllSVMs(
                y_test, y_train, property_names, file_name, svm_type,
                getting_directions, threads, logistic_regression, sparse_array)

            dt.write1dArray(kappa_scores, kappa_fn)
            dt.write2dArray(directions, directions_fn)
            dt.write1dArray(f1_scores, ktau_scores_fn)
            dt.write1dArray(accs, acc_fn)
            dt.write1dArray(TPs, TP_fn)
            dt.write1dArray(FPs, FP_fn)
            dt.write1dArray(TNs, TN_fn)
            dt.write1dArray(FNs, FN_fn)
            dt.write1dArray(property_names,
                            property_names_fn + file_name + ".txt")
        else:
            final_f1 = []
            final_acc = []
            for y in range(len(y_train)):
                f1, acc = self.runClassifySVM(y_test[y], y_train[y])
                print(f1, acc)
                final_f1.append(f1)
                final_acc.append(acc)
            dt.write1dArray(final_f1, ktau_scores_fn)
            dt.write1dArray(final_acc, acc_fn)
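
The class above delegates the per-property work to runAllSVMs, which is not shown here. A minimal sketch of learning one "direction" (the normal of a linear separating hyperplane) for a single property and scoring it with Cohen's kappa could look like this; the LinearSVC choice and the helper name are assumptions, not the original implementation.

# Sketch only: LinearSVC and cohen_kappa_score are standard scikit-learn calls;
# treating coef_ as the property's direction is an assumption about runAllSVMs.
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import cohen_kappa_score

def svm_direction(vectors, property_labels):
    """vectors: (n_docs, n_dims) array; property_labels: binary vector of length n_docs."""
    clf = LinearSVC(class_weight="balanced")
    clf.fit(vectors, property_labels)
    normal = clf.coef_[0] / np.linalg.norm(clf.coef_[0])  # unit-length hyperplane normal
    kappa = cohen_kappa_score(property_labels, clf.predict(vectors))
    return normal, kappa

When getting_directions is set above, x_train and x_test are the same matrix, so the kappa reported there is likewise measured on the data the SVM was fit on.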