def getAllPhraseRankings(directions_fn=None, vectors_fn=None, property_names_fn=None, vector_names_fn=None,
                         fn="no filename", percentage_increment=1, scores_fn=None, top_amt=0, discrete=False,
                         data_type="movies", rewrite_files=False):
    rankings_fn_all = "../data/" + data_type + "/rank/numeric/" + fn + "ALL.txt"
    all_fns = [rankings_fn_all]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getAllPhraseRankings")
        return
    else:
        print("Running task", "getAllPhraseRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    property_names = dt.import1dArray(property_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)

    # Optionally keep only the top-scoring directions and their property names
    if top_amt != 0:
        scores = dt.import1dArray(scores_fn, "f")
        directions = dt.sortByReverseArray(directions, scores)[:top_amt]
        property_names = dt.sortByReverseArray(property_names, scores)[:top_amt]

    rankings = getRankings(directions, vectors, property_names, vector_names)

    if discrete:
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)

    for a in range(len(rankings)):
        rankings[a] = np.around(rankings[a], decimals=4)

    #dt.write1dArray(property_names, "../data/movies/bow/names/top5kof17k.txt")
    dt.write2dArray(rankings, rankings_fn_all)
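
# Illustrative sketch only: getRankings() is defined elsewhere in this module. Assuming it scores
# every entity along every direction by the dot product of the entity vector with the direction
# vector, a minimal NumPy version of that idea (hypothetical helper, not the project's code) is:
import numpy as np

def _rankings_sketch(directions, vectors):
    # directions: (n_directions, dim), vectors: (n_entities, dim)
    directions = np.asarray(directions, dtype="float64")
    vectors = np.asarray(vectors, dtype="float64")
    # One row of scores per direction; a higher score means the entity lies further along it.
    return directions.dot(vectors.T)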
def selectCutOffByWordVector(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    wv, wvn = dt.getWordVectors()
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = [key]
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:  # was "if cluster is []", which is never true
                continue
            clusters.append(cluster)
        # Get the maximum-similarity word-vector value for each cluster, across all clusters.
        # NOTE: this step is unfinished; the original loop body was the broken expression
        # "clusters[cl[wa]]", so cutoff_words is never populated before being written.
        for cl in range(len(clusters)):
            for wa in range(len(clusters[cl])):
                for w in range(len(clusters[cl][wa])):
                    pass
    dt.write2dArray(cutoff_words, "../data/movies/rules/cutoff/" + file_name + "WVN.txt")
def splitDirections(directions_fn, scores_fn, names_fn, is_gini, amt_high_directions, amt_low_directions,
                    high_threshold, low_threshold, half_kappa_half_ndcg):
    directions = np.asarray(dt.import2dArray(directions_fn))
    scores = np.asarray(dt.import1dArray(scores_fn, "f"))
    names = np.asarray(dt.import1dArray(names_fn))

    high_direction_names = []
    low_direction_names = []
    high_directions = []
    low_directions = []

    if len(half_kappa_half_ndcg) > 0:
        kappa_scores = dt.import1dArray(half_kappa_half_ndcg, "f")

    if amt_high_directions > 0 and amt_low_directions > 0:
        if len(half_kappa_half_ndcg) == 0:
            ids = np.flipud(np.argsort(scores))
        else:
            # Take half of the directions by the primary score and half by kappa,
            # skipping any direction that was already selected by the primary score.
            ind1 = np.flipud(np.argsort(scores))[:int(amt_low_directions / 2)]
            ind2 = np.zeros(len(ind1), dtype="int")
            kappa_ids = np.flipud(np.argsort(kappa_scores))
            count = 0
            added = 0
            for i in kappa_ids:
                if i not in ind1:
                    ind2[added] = i
                    added += 1
                    if added >= amt_low_directions / 2:
                        break
                count += 1
            # Interleave the two selections
            shuffle_ind = np.asarray(list(range(0, len(ind1))))
            ids = np.insert(ind1, shuffle_ind, ind2)
        names = names[ids]
        if max(ids) > len(directions):
            directions = np.asarray(directions).transpose()
        directions = directions[ids]
        high_directions = directions[:amt_high_directions]
        low_directions = directions[amt_high_directions:amt_low_directions]
        high_direction_names = names[:amt_high_directions]
        low_direction_names = names[amt_high_directions:amt_low_directions]
        high_directions = high_directions.tolist()
        low_directions = low_directions.tolist()
        high_direction_names = high_direction_names.tolist()
        low_direction_names = low_direction_names.tolist()
    elif high_threshold > 0 and low_threshold > 0:
        for s in range(len(scores)):
            if scores[s] >= high_threshold:
                high_directions.append(directions[s])
                high_direction_names.append(names[s])
            elif scores[s] >= low_threshold:
                low_directions.append(directions[s])
                low_direction_names.append(names[s])
    else:
        print("no thresholds or direction amounts")

    return high_direction_names, low_direction_names, high_directions, low_directions
def saveClusters(directions_fn, scores_fn, names_fn, filename, amt_of_dirs, data_type, cluster_amt,
                 rewrite_files=False, algorithm="meanshift_k"):
    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"
    all_fns = [dict_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)

    # np.asarray so that the fancy indexing below works on the imported data
    p_dir = np.asarray(dt.import2dArray(directions_fn))
    p_names = np.asarray(dt.import1dArray(names_fn, "s"))
    p_scores = np.asarray(dt.import1dArray(scores_fn, "f"))

    # Keep only the top-scoring directions
    ids = np.argsort(p_scores)
    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]

    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)

    unique, counts = np.unique(labels, return_counts=True)

    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])

    # Each cluster's direction is the mean of its member directions
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))

    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")

    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
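
# Sketch of the same idea with scikit-learn: kMeans()/meanShift() above are project helpers defined
# elsewhere, so this is a hypothetical stand-in, not the project's implementation. It clusters the
# direction vectors and uses the mean of each cluster's members as the cluster direction.
import numpy as np
from sklearn.cluster import KMeans

def _cluster_directions_sketch(directions, cluster_amt):
    directions = np.asarray(directions, dtype="float64")
    labels = KMeans(n_clusters=cluster_amt, random_state=0).fit_predict(directions)
    cluster_directions = [directions[labels == k].mean(axis=0) for k in np.unique(labels)]
    return labels, cluster_directions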
def getAllRankings(directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, percent, percentage_increment,
                   by_vector, fn, discrete=True, data_type="movies", rewrite_files=False):
    # labels_fn and discrete_labels_fn are needed when discrete=True, so they are defined here
    labels_fn = "../data/" + data_type + "/rank/labels/" + fn + ".txt"
    rankings_fn = "../data/" + data_type + "/rank/numeric/" + fn + ".txt"
    discrete_labels_fn = "../data/" + data_type + "/rank/discrete/" + fn + ".txt"
    all_fns = [rankings_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        for f in all_fns:
            print(f, "Already exists")
        print("Skipping task", "getAllRankings")
        return
    else:
        print("Running task", "getAllRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    cluster_names = dt.import1dArray(cluster_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)

    rankings = getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.asarray(rankings)

    if discrete:
        labels = createLabels(rankings, percent)
        labels = np.asarray(labels)
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)

    if by_vector:
        if discrete:
            labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()

    if discrete:
        dt.write2dArray(labels, labels_fn)
    dt.write2dArray(rankings, rankings_fn)
    if discrete:
        dt.write2dArray(discrete_labels, discrete_labels_fn)
def main(data_type, clf, highest_amt, lowest_amt, depth, rewrite_files):
    min = lowest_amt
    max = highest_amt
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf + "dmround"
    mds_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
             + "-" + clf + "d" + str(depth)
    svd_fn = "../data/" + data_type + "/svd/class-all-" + str(min) + "-" + str(max) \
             + "-" + clf + "d" + str(depth)
    pca_fn = "../data/" + data_type + "/pca/class-all-" + str(min) + "-" + str(max) \
             + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                 + "-" + clf + "round"
    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                                           + "-" + clf

    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]):
        print("all files exist")
        exit()

    if dt.fileExists(dm_fn) is False:
        newsgroups_train = fetch_20newsgroups(subset='train', shuffle=False)
        newsgroups_test = fetch_20newsgroups(subset='test', shuffle=False)
        vectors = np.concatenate((newsgroups_train.data, newsgroups_test.data), axis=0)
        newsgroups_test = None
        newsgroups_train = None
        # Get the sparse term-frequency representation
        tf_vectorizer = CountVectorizer(max_df=highest_amt, min_df=lowest_amt, stop_words='english')
        print("completed vectorizer")
        tf = tf_vectorizer.fit_transform(vectors)
        vectors = None
        # Get the sparse PPMI representation from the sparse term-frequency representation
        print("done ppmi sparse")
        sparse_ppmi = convertPPMISparse(tf)
        # Get the sparse dissimilarity matrix from the sparse PPMI representation
        dm = getDissimilarityMatrixSparse(sparse_ppmi)
        dt.write2dArray(dm, dm_fn)
    else:
        dm = dt.import2dArray(dm_fn)

    print("starting mds")
    # Use the dissimilarity matrix as input to MDS
    mds = createMDS(dm, depth)
    # Save the MDS space
    dt.write2dArray(mds, mds_fn)
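
# convertPPMISparse() is implemented elsewhere in this codebase. Assuming it computes positive
# pointwise mutual information, PPMI(d, w) = max(0, log(p(d, w) / (p(d) * p(w)))), a dense NumPy
# sketch of that quantity (hypothetical, for illustration only) looks like this:
import numpy as np

def _ppmi_sketch(tf):
    tf = np.asarray(tf, dtype="float64")           # (documents, words) term frequencies
    total = tf.sum()
    p_dw = tf / total                              # joint probabilities
    p_d = p_dw.sum(axis=1, keepdims=True)          # document marginals
    p_w = p_dw.sum(axis=0, keepdims=True)          # word marginals
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi = np.log(p_dw / (p_d * p_w))
    pmi[~np.isfinite(pmi)] = 0.0
    return np.maximum(pmi, 0.0)                    # keep only positive PMI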
def getNDCG(rankings_fn, fn, data_type, bow_fn, ppmi_fn, lowest_count, rewrite_files=False, highest_count=0,
            classification=""):
    # Check if the NDCG scores have already been calculated; if they have, then skip.
    ndcg_fn = "../data/" + data_type + "/ndcg/" + fn + ".txt"
    all_fns = [ndcg_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", getNDCG.__name__)
        return
    else:
        print("Running task", getNDCG.__name__)

    # Get the PPMI values for every word and the list of words ("names")
    names = dt.import1dArray("../data/" + data_type + "/bow/names/" + bow_fn)
    ppmi = dt.import2dArray("../data/" + data_type + "/bow/ppmi/" + ppmi_fn)

    # Process the rankings line-by-line so as to not run out of memory
    ndcg_a = []
    #spearman_a = []
    with open(rankings_fn) as rankings:
        r = 0
        for lr in rankings:
            # Pair the r-th ranking with the r-th PPMI row (the original inner loop with a
            # break re-read the first PPMI row every time)
            lp = ppmi[r]
            # Get the descending order of the ranking scores, e.g. "1, 4, 3, 50"
            sorted_indices = np.argsort(list(map(float, lr.strip().split())))[::-1]
            # Get the NDCG score using the PPMI values as relevance judgements for that ordering
            ndcg = ndcg_from_ranking(lp, sorted_indices)
            # Add to the array and print
            ndcg_a.append(ndcg)
            print("ndcg", ndcg, names[r], r)
            """
            smr = spearmanr(ppmi_indices, sorted_indices)[1]
            spearman_a.append(smr)
            print("spearman", smr, names[r], r)
            """
            r += 1

    # Save NDCG
    dt.write1dArray(ndcg_a, ndcg_fn)
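
# ndcg_from_ranking() is defined elsewhere. The standard definition it presumably follows is
# NDCG = DCG(ranking) / DCG(ideal ranking), with DCG = sum_i rel_i / log2(i + 1). A minimal
# hypothetical version, assuming the ranking covers all items:
import numpy as np

def _ndcg_sketch(relevance, ranked_indices):
    relevance = np.asarray(relevance, dtype="float64")
    gains = relevance[np.asarray(ranked_indices, dtype=int)]
    discounts = np.log2(np.arange(2, len(gains) + 2))   # log2(2), log2(3), ...
    dcg = np.sum(gains / discounts)
    ideal = np.sort(relevance)[::-1]
    idcg = np.sum(ideal / discounts)
    return dcg / idcg if idcg > 0 else 0.0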
def selectCutOffByExplanation(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    dupe_cutoff = copy.deepcopy(cutoff)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = [key]
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    explanations = []
    explanation_cutoffs = []
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:  # was "if cluster is []", which is never true
                continue
            clusters.append(cluster)
        # Get the maximum-similarity word-vector value for each cluster, across all clusters
        explained_cutoff = []
        explained_cutoff_value = []
        for cl in range(len(clusters)):
            if len(clusters[cl]) == 0:
                print("Skipped")
                continue
            cluster_explanation, winning_index = webapi.getHighestScore(clusters[cl])
            explained_cutoff.append(cluster_explanation + ",")
            dict_index = 0
            for h in range(len(cluster_dict_arrays[cl])):
                if cluster_dict_arrays[cl][h] == clusters[cl][winning_index]:
                    dict_index = h
            explained_cutoff_value.append(dupe_cutoff[cl][dict_index])
        explanations.append(explained_cutoff)
        explanation_cutoffs.append(explained_cutoff_value)
    dt.write2dArray(explanations, "../data/movies/rules/final_names/" + file_name + "WVN.txt")
    dt.write2dArray(explanation_cutoffs, "../data/movies/rules/final_cutoff/" + file_name + ".txt")
def getDissimilarityMatrixSparse(tf):
    tflen = tf.shape[0]
    dm = np.empty([tflen, tflen], dtype="float64")
    pithing = 2 / pi
    norms = np.empty(tflen, dtype="float64")

    # Calculate norms
    for ei in range(tflen):
        norms[ei] = spl.norm(tf[ei])
        print("norm", ei)

    dot_product = np.zeros([tflen, tflen], dtype="float64")
    use_old_dp = True
    if use_old_dp:
        dot_product = dt.import2dArray("dotproduct.temp")
    else:
        # Calculate dot products (the matrix is symmetric, so reuse the transposed entry)
        for ei in range(tflen):
            for ej in range(tflen):
                if dot_product[ej][ei] != 0:
                    dot_product[ei][ej] = dot_product[ej][ei]
                    continue
                dot_product[ei][ej] = tf[ei].dot(tf[ej].T)[0, 0]
            print("dp", ei)
        dt.write2dArray(dot_product, "dotproduct.temp")

    # Calculate the pairwise products of the norms
    norm_multiplied = np.empty([tflen, tflen], dtype="float64")
    for ei in range(tflen):
        for ej in range(tflen):
            norm_multiplied[ei][ej] = norms[ei] * norms[ej]
        print("nm", ei)

    norm_multiplied = dt.shortenFloatsNoFn(norm_multiplied)
    dot_product = dt.shortenFloatsNoFn(dot_product)

    # Get angular differences
    for ei in range(tflen):
        for ej in range(tflen):
            ang = pithing * np.arccos(dot_product[ei][ej] / norm_multiplied[ei][ej])
            dm[ei][ej] = ang
        print(ei)
    return dm
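
# The loops above compute the normalised angular difference d(i, j) = (2 / pi) * arccos(cos_sim(i, j)).
# A vectorised NumPy sketch of the same quantity for a dense matrix (a hypothetical helper, not the
# entry-by-entry sparse code path used above):
import numpy as np

def _angular_dissimilarity_sketch(X):
    X = np.asarray(X, dtype="float64")
    norms = np.linalg.norm(X, axis=1)
    cos = (X @ X.T) / np.outer(norms, norms)
    cos = np.clip(cos, -1.0, 1.0)          # guard against rounding just outside arccos's domain
    return (2.0 / np.pi) * np.arccos(cos)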
def getCutOff(cluster_dict_fn, rankings_fn, file_name):
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    rankings = dt.importDiscreteVectors(rankings_fn)

    for r in rankings:
        for a in range(len(r)):
            r[a] = int(r[a][:-1])

    cutoff_clusters = []
    counter = 0
    for key, value in cluster_dict.items():
        value.insert(0, key)
        cutoffs = []
        for v in value:
            max_score = 0
            cutoff = 0
            # Try every percentile cutoff and keep the one with the highest kappa
            for i in range(1, 101):
                y_pred = []
                for ve in range(len(rankings[counter])):
                    rank = rankings[counter][ve]
                    if rank > i:
                        y_pred.append(0)
                    else:
                        y_pred.append(1)
                y_test = dt.import2dArray("../data/movies/bow/frequency/phrases/class-" + v, "s")
                score = cohen_kappa_score(y_test, y_pred)
                print(v, int(i), "Score", score)
                if score > max_score:
                    max_score = score
                    cutoff = i
            cutoffs.append(cutoff)
            print("Cutoff for", v, "On", key, "Was", str(cutoff))
        cutoff_clusters.append(cutoffs)
        counter += 1
    dt.write2dArray(cutoff_clusters, "../data/movies/rules/cutoff/" + file_name + ".txt")
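
# The search above is an exhaustive scan over percentile cutoffs, keeping the one whose induced
# binary labelling best agrees (by Cohen's kappa) with the word's bag-of-words class. A compact
# hypothetical sketch of that search, with the labels passed in rather than re-imported per step:
import numpy as np
from sklearn.metrics import cohen_kappa_score

def _best_cutoff_sketch(percentile_ranks, y_true):
    percentile_ranks = np.asarray(percentile_ranks)
    best_cutoff, best_kappa = 0, 0.0
    for i in range(1, 101):
        y_pred = (percentile_ranks <= i).astype(int)   # entities at or above the cutoff are positive
        kappa = cohen_kappa_score(y_true, y_pred)
        if kappa > best_kappa:
            best_kappa, best_cutoff = kappa, i
    return best_cutoff, best_kappa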
dir_ids = [212, 368]
classes = ["horror", "comedy"]

# Create direction graph
file_name = "f200geE300DS[200]DN0.5CTgenresHAtanhCV1 S0 SFT0 allL0"
cluster_fn = "100ndcg KMeans CA400 MC1 MS0.4 ATS1000 DS400"

class1 = np.asarray(dt.import1dArray("../data/movies/classify/genres/class-" + classes[0]), "i")
class2 = np.asarray(dt.import1dArray("../data/movies/classify/genres/class-" + classes[1]), "i")
top_indexes = dt.import1dArray("../data/movies/top_250_imdb.txt", "i")  # indexes, so import as ints

data_type = "movies"
directions = dt.import2dArray("../data/" + data_type + "/cluster/clusters/" + file_name + cluster_fn + ".txt")
d_names = dt.import1dArray("../data/" + data_type + "/cluster/names/" + file_name + cluster_fn + ".txt")
entities = np.asarray(dt.import2dArray("../data/" + data_type + "/nnet/spaces/" + file_name + ".txt"))
e_names = np.asarray(dt.import1dArray("../data/" + data_type + "/nnet/spaces/entitynames.txt"))

# Restrict the genre labels to the top-250 entities
class1 = class1[top_indexes]
class2 = class2[top_indexes]

# Collect the indexes of entities confirmed to belong to either class
confirmed_indexes = []
for c in range(len(class1)):
    if class1[c] == 1:
        confirmed_indexes.append(c)
for c in range(len(class2)):
    if class2[c] == 1:
        confirmed_indexes.append(c)
def __init__(self, features_fn, classes_fn, class_names_fn, cluster_names_fn, filename, training_data, max_depth=None, balance=None, criterion="entropy", save_details=False, data_type="movies", cv_splits=5, csv_fn="../data/temp/no_csv_provided.csv", rewrite_files=False, split_to_use=-1, development=False, limit_entities=False, limited_label_fn=None, vector_names_fn=None, clusters_fn="", cluster_duplicates=False, save_results_so_far=False, multi_label=False): label_names = dt.import1dArray(class_names_fn) filename = filename + str(max_depth) all_fns = [] file_names = ['ACC ' + filename, 'F1 ' + filename] acc_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[ 0] + '.scores' prediction_fn = '../data/' + data_type + '/rules/tree_output/' + filename + '.scores' f1_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[ 1] + '.scores' all_top_names_fn = "../data/" + data_type + "/rules/names/" + filename + ".txt" all_top_rankings_fn = "../data/" + data_type + "/rules/rankings/" + filename + ".txt" all_top_clusters_fn = "../data/" + data_type + "/rules/clusters/" + filename + ".txt" fns_name = "../data/" + data_type + "/rules/names/" + filename + label_names[ 0] + ".txt" features_name = "../data/" + data_type + "/rules/rankings/" + filename + label_names[ 0] + ".txt" dt_clusters_name = "../data/" + data_type + "/rules/clusters/" + filename + label_names[ 0] + ".txt" if save_details is False: all_fns = [acc_fn, f1_fn, prediction_fn, csv_fn] else: new_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[ 0] + " " + filename + '.png' all_fns = [acc_fn, f1_fn, prediction_fn, csv_fn] if max_depth is not None: all_fns.append(all_top_names_fn) all_fns.append(all_top_rankings_fn) all_fns.append(all_top_clusters_fn) if save_details: orig_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[ 0] + " " + filename + 'orig.txt' # all_fns.append(orig_dot_file_fn) model_name_fn = "../data/" + data_type + "/rules/tree_model/" + label_names[ 0] + " " + filename + ".model" #all_fns.append(model_name_fn) if dt.allFnsAlreadyExist(all_fns) and not rewrite_files: print("Skipping task", "DecisionTree") return else: print("Running task", "DecisionTree") vectors = np.asarray(dt.import2dArray(features_fn)) if data_type == "sentiment": # If it's just a binary class... 
labels = np.asarray(dt.import1dArray(classes_fn, "i")) else: labels = np.asarray(dt.import2dArray(classes_fn, "i")) print("vectors", len(vectors), len(vectors[0])) print("labels", len(labels), len(labels[0])) if data_type == "sentiment" or len(vectors) != len(labels[0]): vectors = vectors.transpose() print("vectors", len(vectors), len(vectors[0])) cluster_names = dt.import2dArray(cluster_names_fn, "s") clusters = dt.import2dArray(clusters_fn, "f") original_vectors = vectors if "ratings" in classes_fn: orig_path = "/".join(classes_fn.split("/")[:-1]) + "/" match_ids_fn = orig_path + "matched_ids.txt" if os.path.exists(match_ids_fn): matched_ids = dt.import1dArray(match_ids_fn, "i") else: vector_names = dt.import1dArray(vector_names_fn) limited_labels = dt.import1dArray(limited_label_fn) matched_ids = dt.match_entities(vector_names, limited_labels) dt.write1dArray(matched_ids, match_ids_fn) vectors = vectors[matched_ids] print("vectors", len(vectors)) print("Past limit entities") for l in range(len(label_names)): if label_names[l][:6] == "class-": label_names[l] = label_names[l][6:] f1_array = [] accuracy_array = [] prec_array = [] recall_array = [] if not multi_label and data_type != "sentiment": labels = labels.transpose() print("labels transposed") print("labels", len(labels), len(labels[0])) else: labels = [labels] all_top_clusters = [] all_top_rankings = [] all_top_names = [] all_top_inds = [] all_y_test = [] all_predictions = [] print("At label prediction") for l in range(len(labels)): # Select training data with cross validationac_y_test = [] cv_acc = [] cv_prec = [] cv_recall = [] c = 0 # If doing cross-validation if cv_splits > 1: ac_x_train, ac_y_train, ac_x_test, ac_y_test, ac_x_dev, ac_y_dev = split_data.crossValData( cv_splits, vectors, labels[l]) else: x_train, y_train, x_test, y_test, x_dev, y_dev = split_data.splitData( vectors, labels[l], data_type) ac_y_train = [x_train] ac_x_train = [y_train] ac_x_test = [x_test] ac_y_test = [y_test] ac_y_dev = [x_dev] ac_x_dev = [y_dev] if development: ac_x_test = ac_x_dev ac_y_test = ac_y_dev for splits in range(len(ac_y_test)): model_name_fn = "../data/" + data_type + "/rules/tree_model/" + label_names[ l] + " " + filename + ".model" """ if dt.fileExists(model_name_fn) and not rewrite_files: try: clf = joblib.load(model_name_fn) except KeyError: print(model_name_fn) # If a model is disrupted partway through its processing else: """ clf = tree.DecisionTreeClassifier(max_depth=max_depth, criterion=criterion, class_weight=balance) clf.fit(ac_x_train[splits], ac_y_train[splits]) joblib.dump(clf, model_name_fn) predictions.append(clf.predict(ac_x_test[splits])) ac_y_test = list(ac_y_test) predictions = list(predictions) for i in range(len(predictions)): print(scores) class_names = ["NOT " + label_names[l], label_names[l]] # Export a tree for each label predicted by the clf if save_details: orig_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[ l] + " " + filename + 'orig.txt' new_dot_file_fn = '../data/' + data_type + '/rules/tree_data/' + label_names[ l] + " " + filename + '.txt' orig_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[ l] + " " + filename + 'orig.png' new_graph_png_fn = '../data/' + data_type + '/rules/tree_images/' + label_names[ l] + " " + filename + '.png' orig_temp_graph_png_fn = '../data/' + data_type + '/rules/tree_temp/' + label_names[ l] + " " + filename + 'orig.png' new_temp_graph_png_fn = '../data/' + data_type + '/rules/tree_temp/' + label_names[ l] + " " + 
filename + '.png' output_names = [] for c in cluster_names: line = "" counter = 0 for i in range(len(c)): line = line + c[i] + " " counter += 1 if counter == 8: break output_names.append(line) failed = False try: tree.export_graphviz( clf, feature_names=output_names, class_names=class_names, out_file=orig_dot_file_fn, max_depth=max_depth, label='all', filled=True, impurity=True, node_ids=True, proportion=True, rounded=True, ) except FileNotFoundError: try: orig_dot_file_fn = "//?/" + orig_dot_file_fn tree.export_graphviz(clf, feature_names=output_names, class_names=class_names, out_file=orig_dot_file_fn, max_depth=max_depth, label='all', filled=True, impurity=True, node_ids=True, proportion=True, rounded=True) except FileNotFoundError: failed = True print("doesnt work fam") if failed == False: rewrite_dot_file = dt.import1dArray(orig_dot_file_fn) new_dot_file = [] max = 3 min = -3 """ for f in original_vectors: for n in f: if n > max: max = n if n < min: min = n """ print(max) print(min) boundary = max - min boundary = boundary / 5 bound_1 = 0 - boundary * 2 bound_2 = 0 - boundary * 1 bound_3 = 0 bound_4 = 0 + boundary bound_5 = 0 + boundary * 2 for s in rewrite_dot_file: if ":" in s: s = s.split("<=") no_num = s[0] num = s[1] num = num.split() end = " ".join(num[:-1]) num_split = num[0].split("\\") num = num_split[0] end = end[len(num):] num = float(num) replacement = "" if num <= bound_2: replacement = "VERY LOW" elif num <= bound_3: replacement = "VERY LOW - LOW" elif num <= bound_4: replacement = "VERY LOW - AVERAGE" elif num <= bound_5: replacement = "VERY LOW - HIGH" elif num >= bound_5: replacement = "VERY HIGH" new_string_a = [no_num, replacement, end] new_string = " ".join(new_string_a) new_dot_file.append(new_string) if "]" in new_string: if '"' not in new_string[len(new_string) - 10:]: for c in range(len(new_string)): if new_string[c + 1] == "]": new_string = new_string[: c] + '"' + new_string[ c:] break else: new_dot_file.append(s) """ new_string = s if "->" not in s and "digraph" not in s and "node" not in s and "(...)" not in s and "}" not in s: index = s.index("value") new_string = s[:index] + '"] ;' new_dot_file.append(new_string) """ #new_dot_file.append(s) dt.write1dArray(new_dot_file, new_dot_file_fn) try: orig_graph = pydot.graph_from_dot_file( orig_dot_file_fn) new_graph = pydot.graph_from_dot_file( new_dot_file_fn) orig_graph.write_png(orig_graph_png_fn) new_graph.write_png(new_graph_png_fn) orig_graph.write_png(orig_temp_graph_png_fn) new_graph.write_png(new_temp_graph_png_fn) except FileNotFoundError: orig_graph_png_fn = "//?/" + orig_graph_png_fn try: orig_graph.write_png(orig_graph_png_fn) new_graph_png_fn = "//?/" + new_graph_png_fn new_graph.write_png(new_graph_png_fn) except FileNotFoundError: print("failed graph") self.get_code(clf, output_names, class_names, label_names[l] + " " + filename, data_type) dt_clusters, features, fns, inds = self.getNodesToDepth( clf, original_vectors, cluster_names, clusters) print(filename + label_names[l]) fns_name = "../data/" + data_type + "/rules/names/" + filename + label_names[ l] + ".txt" features_name = "../data/" + data_type + "/rules/rankings/" + filename + label_names[ l] + ".txt" dt_clusters_name = "../data/" + data_type + "/rules/clusters/" + filename + label_names[ l] + ".txt" dt.write2dArray(fns, fns_name) dt.write2dArray(features, features_name) dt.write2dArray(dt_clusters, dt_clusters_name) all_top_rankings.extend(features) all_top_clusters.extend(dt_clusters) all_top_names.extend(fns) all_top_inds.extend(inds) 
print("len clusters", len(all_top_clusters)) print("len rankings", len(all_top_rankings)) print("len names", len(all_top_names)) if len(all_top_clusters) != len(all_top_rankings) or len( all_top_clusters) != len(all_top_names): print("stop") accuracy_array = np.asarray(accuracy_array) accuracy_average = np.average(accuracy_array) prec_array = np.asarray(prec_array) average_prec = np.average(prec_array) recall_array = np.asarray(recall_array) average_recall = np.average(recall_array) f1_average = 2 * ((average_prec * average_recall) / (average_prec + average_recall)) if math.isnan(f1_average): print("NAN", prec, recall) f1_average = 0.0 all_y_test = np.asarray(all_y_test) all_predictions = np.asarray(all_predictions) micro_average = f1_score(all_y_test, all_predictions, average="micro") accuracy_array = accuracy_array.tolist() accuracy_array.append(accuracy_average) accuracy_array.append(0.0) f1_array.append(f1_average) f1_array.append(micro_average) scores = [accuracy_array, f1_array] dt.write1dArray(accuracy_array, acc_fn) dt.write1dArray(f1_array, f1_fn) dt.write2dArray(all_predictions, prediction_fn) if dt.fileExists(csv_fn): print("File exists, writing to csv") try: dt.write_to_csv(csv_fn, file_names, scores) except PermissionError: print("CSV FILE WAS OPEN, SKIPPING") except ValueError: print("File does not exist, recreating csv") key = [] for l in label_names: key.append(l) key.append("AVERAGE") key.append("MICRO AVERAGE") dt.write_csv(csv_fn, file_names, scores, key) else: print("File does not exist, recreating csv") key = [] for l in label_names: key.append(l) key.append("AVERAGE") key.append("MICRO AVERAGE") dt.write_csv(csv_fn, file_names, scores, key) if max_depth is not None: all_top_names = np.asarray(all_top_names) all_top_rankings = np.asarray(all_top_rankings) all_top_clusters = np.asarray(all_top_clusters) all_top_inds = np.asarray(all_top_inds) if cluster_duplicates: ind_to_keep = np.unique(all_top_inds, return_index=True)[1] all_top_names = all_top_names[ind_to_keep] all_top_rankings = all_top_rankings[ind_to_keep] all_top_clusters = all_top_clusters[ind_to_keep] dt.write2dArray(all_top_names, all_top_names_fn) dt.write2dArray(all_top_rankings, all_top_rankings_fn) dt.write2dArray(all_top_clusters, all_top_clusters_fn)
def __init__(self, features_fn, classes_fn, class_names_fn, cluster_names_fn, filename, max_depth=None, balance=None, criterion="entropy", save_details=False, data_type="movies", cv_splits=5, csv_fn="../data/temp/no_csv_provided.csv", rewrite_files=True, split_to_use=-1, development=False, limit_entities=False, limited_label_fn=None, vector_names_fn=None, pruning=1, save_results_so_far=False): vectors = np.asarray(dt.import2dArray(features_fn)).transpose() labels = np.asarray(dt.import2dArray(classes_fn, "i")) print("vectors", len(vectors), len(vectors[0])) print("labels", len(labels), len(labels[0])) print("vectors", len(vectors), len(vectors[0])) cluster_names = dt.import1dArray(cluster_names_fn) label_names = dt.import1dArray(class_names_fn) all_fns = [] file_names = ['ACC J48' + filename, 'F1 J48' + filename] acc_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[ 0] + '.scores' f1_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[ 1] + '.scores' all_fns.append(acc_fn) all_fns.append(f1_fn) all_fns.append(csv_fn) print(dt.allFnsAlreadyExist(all_fns), rewrite_files) if dt.allFnsAlreadyExist( all_fns) and not rewrite_files or save_results_so_far: print("Skipping task", "Weka Tree") return else: print("Running task", "Weka Tree") for l in range(len(cluster_names)): cluster_names[l] = cluster_names[l].split()[0] """ for l in range(len(label_names)): if label_names[l][:6] == "class-": label_names[l] = label_names[l][6:] """ f1_array = [] accuracy_array = [] labels = labels.transpose() print("labels transposed") print("labels", len(labels), len(labels[0])) if limit_entities is False: vector_names = dt.import1dArray(vector_names_fn) limited_labels = dt.import1dArray(limited_label_fn) vectors = np.asarray( dt.match_entities(vectors, limited_labels, vector_names)) all_y_test = [] all_predictions = [] for l in range(len(labels)): if balance: new_vectors, new_labels = dt.balanceClasses(vectors, labels[l]) else: new_vectors = vectors new_labels = labels[l] # Select training data with cross validation ac_y_test = [] ac_y_train = [] ac_x_train = [] ac_x_test = [] ac_y_dev = [] ac_x_dev = [] cv_f1 = [] cv_acc = [] if cv_splits == 1: kf = KFold(n_splits=3, shuffle=False, random_state=None) else: kf = KFold(n_splits=cv_splits, shuffle=False, random_state=None) c = 0 for train, test in kf.split(new_vectors): if split_to_use > -1: if c != split_to_use: c += 1 continue ac_y_test.append(new_labels[test]) ac_y_train.append(new_labels[train[int(len(train) * 0.2):]]) val = int(len(train) * 0.2) t_val = train[val:] nv_t_val = new_vectors[t_val] ac_x_train.append(nv_t_val) ac_x_test.append(new_vectors[test]) ac_x_dev.append(new_vectors[train[:int(len(train) * 0.2)]]) ac_y_dev.append(new_labels[train[:int(len(train) * 0.2)]]) c += 1 if cv_splits == 1: break predictions = [] rules = [] if development: ac_x_test = np.copy(np.asarray(ac_x_dev)) ac_y_test = np.copy(np.asarray(ac_y_dev)) train_fn = "../data/" + data_type + "/weka/data/" + filename + "Train.txt" test_fn = "../data/" + data_type + "/weka/data/" + filename + "Test.txt" for splits in range(len(ac_y_test)): # Get the weka predictions dt.writeArff(ac_x_train[splits], [ac_y_train[splits]], [label_names[splits]], train_fn, header=True) dt.writeArff(ac_x_test[splits], [ac_y_test[splits]], [label_names[splits]], test_fn, header=True) prediction, rule = self.getWekaPredictions( train_fn + label_names[splits] + ".arff", test_fn + label_names[splits] + ".arff", save_details, pruning) predictions.append(prediction) rules.append(rule) 
for i in range(len(predictions)): if len(predictions) == 1: all_y_test.append(ac_y_test[i]) all_predictions.append(predictions[i]) f1 = f1_score(ac_y_test[i], predictions[i], average="binary") accuracy = accuracy_score(ac_y_test[i], predictions[i]) cv_f1.append(f1) cv_acc.append(accuracy) scores = [[label_names[l], "f1", f1, "accuracy", accuracy]] print(scores) # Export a tree for each label predicted by the clf, not sure if this is needed... if save_details: data_fn = "../data/" + data_type + "/rules/weka_rules/" + label_names[ l] + " " + filename + ".txt" class_names = [label_names[l], "NOT " + label_names[l]] #self.get_code(clf, cluster_names, class_names, label_names[l] + " " + filename, data_type) dt.write1dArray(rules[i].split("\n"), data_fn) dot_file = dt.import1dArray(data_fn) new_dot_file = [] for line in dot_file: if "->" not in line and "label" in line and '"t ' not in line and '"f ' not in line: line = line.split('"') line[1] = '"' + cluster_names[int(line[1])] + '"' line = "".join(line) new_dot_file.append(line) dt.write1dArray(new_dot_file, data_fn) graph = pydot.graph_from_dot_file(data_fn) graph.write_png("../data/" + data_type + "/rules/weka_images/" + label_names[l] + " " + filename + ".png") f1_array.append(np.average(np.asarray(cv_f1))) accuracy_array.append(np.average(np.asarray(cv_acc))) accuracy_array = np.asarray(accuracy_array) accuracy_average = np.average(accuracy_array) accuracy_array = accuracy_array.tolist() f1_array = np.asarray(f1_array) f1_average = np.average(f1_array) f1_array = f1_array.tolist() micro_average = f1_score(np.asarray(all_y_test), np.asarray(all_predictions), average="micro") print("Micro F1", micro_average) accuracy_array.append(accuracy_average) accuracy_array.append(0.0) f1_array.append(f1_average) f1_array.append(micro_average) scores = [accuracy_array, f1_array] dt.write1dArray(accuracy_array, acc_fn) dt.write1dArray(f1_array, f1_fn) print(csv_fn) if dt.fileExists(csv_fn): print("File exists, writing to csv") try: dt.write_to_csv(csv_fn, file_names, scores) except PermissionError: print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE") print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE") print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE") print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE") print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE") print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE") dt.write_to_csv( csv_fn[:len(csv_fn) - 4] + str(random.random()) + "FAIL.csv", file_names, scores) else: print("File does not exist, recreating csv") key = [] for l in label_names: key.append(l) key.append("AVERAGE") key.append("MICRO AVERAGE") dt.write_csv(csv_fn, file_names, scores, key)
def __init__(self, vector_path, class_path, property_names_fn, file_name, svm_type, training_size=10000, lowest_count=200, highest_count=21470000, get_kappa=True, get_f1=True, single_class=True, data_type="movies", getting_directions=True, threads=1, chunk_amt=0, chunk_id=0, rewrite_files=False, classification="all", loc="../data/"): self.get_kappa = True self.get_f1 = get_f1 self.data_type = data_type self.classification = classification self.lowest_amt = lowest_count self.higher_amt = highest_count if chunk_amt > 0: file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str( chunk_amt) directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt" ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt" kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt" acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt" all_fns = [directions_fn, kappa_fn] if dt.allFnsAlreadyExist(all_fns) and not rewrite_files: print("Skipping task", "getSVMResults") return else: print("Running task", "getSVMResults") y_train = 0 y_test = 0 vectors = np.asarray(dt.import2dArray(vector_path)) print("imported vectors") if not getting_directions: classes = np.asarray(dt.import2dArray(class_path)) print("imported classes") property_names = dt.import1dArray(property_names_fn) print("imported propery names") if chunk_amt > 0: if chunk_id == chunk_amt - 1: chunk = int(len(property_names) / chunk_amt) multiply = chunk_amt - 1 property_names = property_names[chunk * multiply:] else: property_names = dt.chunks( property_names, int( (len(property_names) / chunk_amt)))[chunk_id] if not getting_directions: x_train, x_test, y_train, y_test = train_test_split(vectors, classes, test_size=0.3, random_state=0) else: x_train = vectors x_test = vectors if get_f1: y_train = y_train.transpose() y_test = y_test.transpose() print("transpoosed") self.x_train = x_train self.x_test = x_test self.y_train = y_train self.y_test = y_test if self.get_f1 is False: print("running svms") kappa_scores, directions, ktau_scores, property_names = self.runAllSVMs( y_test, y_train, property_names, file_name, svm_type, getting_directions, threads) dt.write1dArray(kappa_scores, kappa_fn) dt.write2dArray(directions, directions_fn) dt.write1dArray(ktau_scores, ktau_scores_fn) dt.write1dArray(property_names, property_names_fn + file_name + ".txt") else: final_f1 = [] final_acc = [] for y in range(len(y_train)): f1, acc = self.runClassifySVM(y_test[y], y_train[y]) print(f1, acc) final_f1.append(f1) final_acc.append(acc) dt.write1dArray(final_f1, ktau_scores_fn) dt.write1dArray(final_acc, acc_fn)
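
# Illustrative sketch only: runAllSVMs() is implemented elsewhere. The underlying idea is to fit one
# linear SVM per word (entities that contain the word vs. those that do not) and take the hyperplane
# normal (coef_) as that word's direction in the entity space, with kappa on held-out data scoring
# how well the direction separates the word. A hypothetical minimal version for one word:
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split

def _word_direction_sketch(entity_vectors, word_labels):
    x_train, x_test, y_train, y_test = train_test_split(entity_vectors, word_labels,
                                                        test_size=0.3, random_state=0)
    clf = LinearSVC(class_weight="balanced")
    clf.fit(x_train, y_train)
    direction = clf.coef_[0]                                  # normal of the separating hyperplane
    kappa = cohen_kappa_score(y_test, clf.predict(x_test))
    return direction, kappa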
def main(data_type, vector_size, window_size, min_count, sampling_threshold, negative_size, train_epoch, dm,
         worker_count, train_wv, concatenate_wv, use_hierarchical_softmax):
    file_name = "Doc2Vec" + " VS" + str(vector_size) + " WS" + str(window_size) + " MC" + str(min_count) + \
                " ST" + str(sampling_threshold) + \
                " NS" + str(negative_size) + " TE" + str(train_epoch) + " DM" + str(dm) + \
                " WC" + str(worker_count) + "spacy" \
                " NS" + str(negative_size) + " TE" + str(train_epoch) + " DM" + str(dm) + " WC" + str(worker_count) + \
                " TW" + str(train_wv) + " CW" + str(concatenate_wv) + " HS" + str(use_hierarchical_softmax)

    corpus_fn = "../data/raw/" + data_type + "/corpus_processed.txt"
    if os.path.exists(corpus_fn) is False:
        x_train = np.load("../data/raw/" + data_type + "/x_train_w.npy")
        x_test = np.load("../data/raw/" + data_type + "/x_test_w.npy")
        corpus = np.concatenate((x_train, x_test), axis=0)
        text_corpus = np.empty(len(corpus), dtype=np.object)
        for i in range(len(corpus)):
            text_corpus[i] = " ".join(corpus[i])
            print(text_corpus[i])
        dt.write1dArray(text_corpus, corpus_fn)

    embedding_fn = "/home/tom/Downloads/glove.6B/glove.6B.300d.txt"
    model_fn = "../data/" + data_type + "/doc2vec/" + file_name + ".bin"
    vector_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".npy"
    score_fn = "../data/" + data_type + "/doc2vec/" + file_name + "catacc.score"

    if os.path.exists(model_fn):
        print("Imported model")
        model = g.utils.SaveLoad.load(model_fn)
    elif file_name[:7] == "Doc2Vec":
        model = doc2Vec(embedding_fn, corpus_fn, vector_size, window_size, min_count, sampling_threshold,
                        negative_size, train_epoch, dm, worker_count, train_wv, concatenate_wv,
                        use_hierarchical_softmax)
        model.save(model_fn)

    if os.path.exists(vector_fn) is False:
        vectors = []
        for d in range(len(model.docvecs)):
            vectors.append(model.docvecs[d])
        np.save(vector_fn, vectors)
    else:
        print("Imported vectors")
        vectors = np.load(vector_fn)

    # Slice of length 7 so the prefix comparison against "Doc2Vec" can actually match
    if os.path.exists(score_fn) is False or file_name[:7] != "Doc2Vec":
        print("Getting score")
        if data_type == "sentiment":
            classes = dt.import1dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = sentiment.getSplits(vectors, classes)
            scores = linearSVMScore(x_train, y_train, x_test, y_test)
        else:
            classes = dt.import2dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = newsgroups.getSplits(vectors, classes)
            scores = multiClassLinearSVM(x_train, y_train, x_test, y_test)
        print(scores)
        dt.write1dArray(scores, score_fn)
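
# Illustrative sketch only: doc2Vec() above is a project wrapper defined elsewhere. A minimal,
# hypothetical gensim equivalent (assumes gensim >= 3) that trains document vectors from the
# processed corpus file, one whitespace-tokenised document per line:
import gensim

def _train_doc2vec_sketch(corpus_fn, vector_size, window_size, min_count, train_epoch, dm, worker_count):
    with open(corpus_fn) as f:
        docs = [gensim.models.doc2vec.TaggedDocument(line.split(), [i]) for i, line in enumerate(f)]
    model = gensim.models.doc2vec.Doc2Vec(docs, vector_size=vector_size, window=window_size,
                                          min_count=min_count, dm=dm, workers=worker_count,
                                          epochs=train_epoch)
    return model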
def __init__(self, class_path=None, get_scores=False, randomize_finetune_weights=False, dropout_noise=None, amount_of_hidden=0, epochs=1, learn_rate=0.01, loss="mse", batch_size=1, past_model_bias_fn=None, identity_swap=False, reg=0.0, amount_of_finetune=[], output_size=25, hidden_activation="tanh", layer_init="glorot_uniform", output_activation="tanh", deep_size=None, corrupt_finetune_weights=False, split_to_use=-1, hidden_layer_size=100, file_name="unspecified_filename", vector_path=None, is_identity=False, finetune_size=0, data_type="movies", optimizer_name="rmsprop", noise=0.0, fine_tune_weights_fn=None, past_model_weights_fn=None, from_ae=True, save_outputs=False, label_names_fn="", rewrite_files=False, cv_splits=1, cutoff_start=0.2, development=False, class_weight=None, csv_fn=None, tune_vals=False, get_nnet_vectors_path=None, classification_name="all", limit_entities=False, limited_label_fn="", vector_names_fn="", identity_activation="linear", loc="../data/", lock_weights_and_redo=False): weights_fn = loc + data_type + "/nnet/weights/" + file_name + "L0.txt" bias_fn = loc + data_type + "/nnet/bias/" + file_name + "L0.txt" rank_fn = loc + data_type + "/nnet/clusters/" + file_name + ".txt" all_fns = [weights_fn, bias_fn, rank_fn] if dt.allFnsAlreadyExist(all_fns) and not rewrite_files: print("Skipping task", "nnet") return else: print("Running task", "nnet") self.class_path = class_path self.learn_rate = learn_rate self.epochs = epochs self.loss = loss self.batch_size = batch_size self.hidden_activation = hidden_activation self.layer_init = layer_init self.output_activation = output_activation self.hidden_layer_size = hidden_layer_size self.file_name = file_name self.vector_path = vector_path self.dropout_noise = dropout_noise self.finetune_size = finetune_size self.get_scores = get_scores self.reg = reg self.amount_of_finetune = amount_of_finetune self.amount_of_hidden = amount_of_hidden self.output_size = output_size self.identity_swap = identity_swap self.deep_size = deep_size self.from_ae = from_ae self.is_identity = is_identity self.randomize_finetune_weights = randomize_finetune_weights self.corrupt_finetune_weights = corrupt_finetune_weights self.deep_size = deep_size self.fine_tune_weights_fn = fine_tune_weights_fn self.identity_activation = identity_activation self.lock_weights_and_redo = lock_weights_and_redo print(data_type) if optimizer_name == "adagrad": self.optimizer = Adagrad() elif optimizer_name == "sgd": self.optimizer = SGD() elif optimizer_name == "rmsprop": self.optimizer = RMSprop() elif optimizer_name == "adam": self.optimizer = Adam() elif optimizer_name == "adadelta": self.optimizer = Adadelta() else: print("optimizer not found") exit() entity_vectors = np.asarray(dt.import2dArray(self.vector_path)) print("Imported vectors", len(entity_vectors), len(entity_vectors[0])) if get_nnet_vectors_path is not None: nnet_vectors = np.asarray(dt.import2dArray(get_nnet_vectors_path)) print("Imported vectors", len(entity_vectors), len(entity_vectors[0])) entity_classes = np.asarray(dt.import2dArray(self.class_path)) print("Imported classes", len(entity_classes), len(entity_classes[0])) if fine_tune_weights_fn is None: vector_names = dt.import1dArray(vector_names_fn) limited_labels = dt.import1dArray(limited_label_fn) entity_vectors = np.asarray( dt.match_entities(entity_vectors, limited_labels, vector_names)) if fine_tune_weights_fn is not None: if len(entity_vectors) != len(entity_classes): entity_classes = entity_classes.transpose() print("Transposed classes, now in 
form", len(entity_classes), len(entity_classes[0])) """ # IF Bow if len(entity_vectors[0]) != len(entity_classes[0]): entity_vectors = entity_vectors.transpose() print("Transposed vectors, now in form", len(entity_vectors), len(entity_vectors[0])) """ elif len(entity_vectors) != len(entity_classes): entity_vectors = entity_vectors.transpose() print("Transposed vectors, now in form", len(entity_vectors), len(entity_vectors[0])) self.input_size = len(entity_vectors[0]) self.output_size = len(entity_classes[0]) if fine_tune_weights_fn is not None: model_builder = self.fineTuneNetwork weights = [] if from_ae: self.past_weights = [] past_model_weights = [] for p in past_model_weights_fn: past_model_weights.append( np.asarray(dt.import2dArray(p), dtype="float64")) past_model_bias = [] for p in past_model_bias_fn: past_model_bias.append( np.asarray(dt.import1dArray(p, "f"), dtype="float64")) for p in range(len(past_model_weights)): past_model_weights[p] = np.around(past_model_weights[p], decimals=6) past_model_bias[p] = np.around(past_model_bias[p], decimals=6) for p in range(len(past_model_weights)): self.past_weights.append([]) self.past_weights[p].append(past_model_weights[p]) self.past_weights[p].append(past_model_bias[p]) for f in fine_tune_weights_fn: weights.extend(dt.import2dArray(f)) r = np.asarray(weights, dtype="float64") r = np.asarray(weights, dtype="float64") for a in range(len(r)): r[a] = np.around(r[a], decimals=6) for a in range(len(entity_classes)): entity_classes[a] = np.around(entity_classes[a], decimals=6) self.fine_tune_weights = [] self.fine_tune_weights.append(r.transpose()) self.fine_tune_weights.append( np.zeros(shape=len(r), dtype="float64")) else: model_builder = self.classifierNetwork # Converting labels to categorical f1_scores = [] accuracy_scores = [] f1_averages = [] accuracy_averages = [] original_fn = file_name x_train, y_train, x_test, y_test, x_dev, y_dev = split_data.splitData( vectors, labels[l], data_type) if development: x_test = x_dev y_test = y_dev model = model_builder() if get_scores: test_pred = model.predict(x_train).transpose() print(test_pred) highest_vals = [0.5] * len(test_pred) # Default 0.5 y_pred = model.predict(x_test).transpose() y_test = np.asarray(y_test).transpose() for y in range(len(y_pred)): y_pred[y][y_pred[y] >= highest_vals[y]] = 1 y_pred[y][y_pred[y] < highest_vals[y]] = 0 f1_array = [] accuracy_array = [] for y in range(len(y_pred)): accuracy_array.append(accuracy_score(y_test[y], y_pred[y])) f1_array.append( f1_score(y_test[y], y_pred[y], average="binary")) print(f1_array[y]) y_pred = y_pred.transpose() y_test = np.asarray(y_test).transpose() micro_average = f1_score(y_test, y_pred, average="micro") cv_f1_fn = loc + data_type + "/nnet/scores/F1 " + file_name + ".txt" cv_acc_fn = loc + data_type + "/nnet/scores/ACC " + file_name + ".txt" dt.write1dArray(f1_array, cv_f1_fn) dt.write1dArray(accuracy_array, cv_acc_fn) f1_scores.append(f1_array) accuracy_scores.append(accuracy_array) f1_average = np.average(f1_array) accuracy_average = np.average(accuracy_array) f1_averages.append(f1_average) accuracy_averages.append(accuracy_average) print("Average F1 Binary", f1_average, "Acc", accuracy_average) print("Micro Average F1", micro_average) f1_array.append(f1_average) f1_array.append(micro_average) accuracy_array.append(accuracy_average) accuracy_array.append(0.0) scores = [accuracy_array, f1_array] csv_fn = loc + data_type + "/nnet/csv/" + csv_fn + ".csv" file_names = [file_name + "ACC", file_name + "F1"] label_names = 
dt.import1dArray(label_names_fn) if dt.fileExists(csv_fn): print("File exists, writing to csv") try: dt.write_to_csv(csv_fn, file_names, scores) except PermissionError: print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE") dt.write_to_csv( csv_fn[:len(csv_fn) - 4] + str(random.random()) + "FAIL.csv", [file_name], scores) else: print("File does not exist, recreating csv") key = [] for l in label_names: key.append(l) key.append("AVERAGE") key.append("MICRO AVERAGE") dt.write_csv(csv_fn, file_names, scores, key) if save_outputs: if limit_entities is False: self.output_clusters = model.predict(nnet_vectors) else: self.output_clusters = model.predict(entity_vectors) self.output_clusters = self.output_clusters.transpose() dt.write2dArray(self.output_clusters, rank_fn) for l in range(0, len(model.layers) - 1): if dropout_noise is not None and dropout_noise > 0.0: if l % 2 == 1: continue print("Writing", l, "layer") truncated_model = Sequential() for a in range(l + 1): truncated_model.add(model.layers[a]) truncated_model.compile(loss=self.loss, optimizer="sgd") if get_nnet_vectors_path is not None: self.end_space = truncated_model.predict(nnet_vectors) else: self.end_space = truncated_model.predict(entity_vectors) total_file_name = loc + data_type + "/nnet/spaces/" + file_name dt.write2dArray(self.end_space, total_file_name + "L" + str(l) + ".txt") for l in range(len(model.layers)): try: dt.write2dArray( model.layers[l].get_weights()[0], loc + data_type + "/nnet/weights/" + file_name + "L" + str(l) + ".txt") dt.write1dArray( model.layers[l].get_weights()[1], loc + data_type + "/nnet/bias/" + file_name + "L" + str(l) + ".txt") except IndexError: print("Layer ", str(l), "Failed")
        csv_rows.append((name_array[i], acc, f1, macro_f1))
        print(csv_rows[i])

    with open("../data/raw/" + data_type + "/test/reps.csv", 'wt') as f:
        writer = csv.writer(f)
        writer.writerow(("name", "acc", "micro f1", "macro f1"))
        writer.writerows(csv_rows)


if __name__ == '__main__':
    fn = "../data/newsgroups/bow/ppmi/class-all-" + str(30) + "-" + str(18836) + "-" + "all.npz"
    print("Testing", fn)
    testAll(
        ["mds", "finetune_space", "mds_rankings", "finetune_rankings"],
        [
            dt.import2dArray("../data/newsgroups/nnet/spaces/wvFIXED200.npy"),
            dt.import2dArray(
                "../data/newsgroups/nnet/spaces/sns_ppmi3wvFIXED200CV1S0 SFT0 allL03018836 LR kappa KMeans CA200 MC1 MS0.4 ATS2000 DS400FT BOCFi NT[200]tanh300S6040V1.2L0.npy"
            ),
            dt.import2dArray(
                "../data/newsgroups/rank/numeric/sns_ppmi3wvFIXED200CV1S0 SFT0 allL03018836 LR kappa KMeans CA400 MC1 MS0.4 ATS500 DS800.npy"
            ).transpose(),
            dt.import2dArray(
                "../data/newsgroups/nnet/clusters/sns_ppmi3wvFIXED200CV1S0 SFT0 allL03018836 LR kappa KMeans CA200 MC1 MS0.4 ATS2000 DS400FT BOCFi NT[200]tanh300S6040V1.2.npy"
            ).transpose()
        ],
        [
            dt.import2dArray("../data/newsgroups/classify/newsgroups/class-all", "i"),
            dt.import2dArray("../data/newsgroups/classify/newsgroups/class-all", "i"),
def __init__(self, vector_path, class_path, property_names_fn, file_name, svm_type, training_size=10000, lowest_count=200, highest_count=21470000, get_kappa=True, get_f1=True, single_class=True, data_type="movies", getting_directions=True, threads=1, chunk_amt=0, chunk_id=0, rewrite_files=False, classification="all", loc="../data/", logistic_regression=False, sparse_array_fn=None, only_these_fn=None): self.get_kappa = True self.get_f1 = get_f1 self.data_type = data_type self.classification = classification self.lowest_amt = lowest_count self.higher_amt = highest_count if chunk_amt > 0: file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str( chunk_amt) directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt" ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt" kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt" acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt" TP_fn = loc + data_type + "/svm/stats/TP " + file_name + ".txt" FP_fn = loc + data_type + "/svm/stats/FP " + file_name + ".txt" TN_fn = loc + data_type + "/svm/stats/TN " + file_name + ".txt" FN_fn = loc + data_type + "/svm/stats/FN " + file_name + ".txt" all_fns = [directions_fn, kappa_fn] if dt.allFnsAlreadyExist(all_fns) and not rewrite_files: print("Skipping task", "getSVMResults") return else: print("Running task", "getSVMResults") y_train = 0 y_test = 0 vectors = np.asarray(dt.import2dArray(vector_path)) print("imported vectors") if not getting_directions: classes = np.asarray(dt.import2dArray(class_path)) print("imported classes") property_names = dt.import1dArray(property_names_fn) print("imported propery names") if chunk_amt > 0: if chunk_id == chunk_amt - 1: chunk = int(len(property_names) / chunk_amt) multiply = chunk_amt - 1 property_names = property_names[chunk * multiply:] else: property_names = dt.chunks( property_names, int( (len(property_names) / chunk_amt)))[chunk_id] if sparse_array_fn is not None: sparse_array = dt.import2dArray(sparse_array_fn) else: sparse_array = None if sparse_array is not None: for s in range(len(sparse_array)): if len(np.nonzero(sparse_array[s])[0]) <= 1: print("WILL FAIL", s, len(np.nonzero(sparse_array[s])[0])) else: print(len(np.nonzero(sparse_array[s])[0])) if not getting_directions: x_train, x_test, y_train, y_test = train_test_split(vectors, classes, test_size=0.3, random_state=0) else: x_train = vectors x_test = vectors if get_f1: y_train = y_train.transpose() y_test = y_test.transpose() print("transpoosed") self.x_train = x_train self.x_test = x_test self.y_train = y_train self.y_test = y_test if only_these_fn is not None: only_these = dt.import1dArray(only_these_fn, "s") inds = [] for s in range(len(property_names)): for o in only_these: if property_names[s] == o: inds.append(s) break sparse_array = sparse_array[inds] property_names = property_names[inds] if self.get_f1 is False: print("running svms") kappa_scores, directions, f1_scores, property_names, accs, TPs, FPs, TNs, FNs = self.runAllSVMs( y_test, y_train, property_names, file_name, svm_type, getting_directions, threads, logistic_regression, sparse_array) dt.write1dArray(kappa_scores, kappa_fn) dt.write2dArray(directions, directions_fn) dt.write1dArray(f1_scores, ktau_scores_fn) dt.write1dArray(accs, acc_fn) dt.write1dArray(TPs, TP_fn) dt.write1dArray(FPs, FP_fn) dt.write1dArray(TNs, TN_fn) dt.write1dArray(FNs, FN_fn) dt.write1dArray(property_names, property_names_fn + file_name + ".txt") else: final_f1 = [] final_acc = [] for y in range(len(y_train)): f1, acc = 
self.runClassifySVM(y_test[y], y_train[y]) print(f1, acc) final_f1.append(f1) final_acc.append(acc) dt.write1dArray(final_f1, ktau_scores_fn) dt.write1dArray(final_acc, acc_fn)