def logisticRegression(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies",
                       rewrite_files=False, limit_entities=False, classification="genres",
                       lowest_amt=0, highest_amt=2147000000, sparse_freqs_fn=None, bow_names_fn=None):
    lr_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [lr_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", logisticRegression.__name__)
        return
    else:
        print("Running task", logisticRegression.__name__)
    if limit_entities is False:
        classification = "all"
    cluster_names = dt.import2dArray(cluster_names_fn, "s")
    bow_names = dt.import1dArray(bow_names_fn, "s")
    sparse_freqs = dt.import2dArray(sparse_freqs_fn, return_sparse=True)
    frq = getLROnBag(cluster_names, data_type, lowest_amt, highest_amt, classification, file_name,
                     bow_names, sparse_freqs)
    dt.write2dArray(frq, lr_fn)
    return frq
def bagOfClusters(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies",
                  rewrite_files=False, limit_entities=False, classification="genres",
                  lowest_amt=0, highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", bagOfClusters.__name__)
        return
    else:
        print("Running task", bagOfClusters.__name__)
    if limit_entities is False:
        classification = "all"
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import2dArray(cluster_names_fn, "s")
    frq = writeBagOfClusters(names, data_type, lowest_amt, highest_amt, classification)
    dt.write2dArray(frq, pavPPMI_fn)
    return frq
def avgPPMI(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies", rewrite_files=False,
            classification="genres", lowest_amt=0, highest_amt=2147000000, limit_entities=False,
            save_results_so_far=False):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if (dt.allFnsAlreadyExist(all_fns) and not rewrite_files) or save_results_so_far:
        print("Skipping task", avgPPMI.__name__)
        return
    else:
        print("Running task", avgPPMI.__name__)
    if limit_entities is False:
        classification = "all"
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import2dArray(cluster_names_fn, "s")
    for n in range(len(names)):
        for x in range(len(names[n])):
            if ":" in names[n][x]:
                names[n][x] = names[n][x][:-1]
    frq = []
    counter = 0
    for n in range(len(names)):
        name_frq = []
        for name in names[n]:
            name_frq.append(readPPMI(name, data_type, lowest_amt, highest_amt, classification))
        avg_frq = []
        name_frq = np.asarray(name_frq).transpose()
        for name in name_frq:
            avg_frq.append(np.average(name))
        frq.append(np.asarray(avg_frq))
        print(n)
    dt.write2dArray(frq, pavPPMI_fn)
    return frq
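# --- Hedged example (not part of the pipeline) --------------------------------
# Minimal sketch of the averaging step in avgPPMI above. It assumes readPPMI
# (defined elsewhere) returns one PPMI value per entity for a given word, so a
# cluster's feature is the per-entity mean over its words. Synthetic values,
# numpy only.
def _example_avg_ppmi_over_cluster():
    import numpy as np
    per_word_ppmi = np.array([[0.1, 0.0, 2.3],   # word A over 3 entities
                              [0.4, 1.2, 0.0]])  # word B over 3 entities
    cluster_feature = per_word_ppmi.transpose().mean(axis=1)  # one value per entity
    print(cluster_feature)  # [0.25 0.6  1.15]
    return cluster_feature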
def bagOfClustersPavPPMI(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies",
                         rewrite_files=False, limit_entities=False, classification="genres",
                         lowest_amt=0, highest_amt=2147000000, sparse_freqs_fn=None, bow_names_fn=None):
    pavPPMI_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", bagOfClustersPavPPMI.__name__)
        return
    else:
        print("Running task", bagOfClustersPavPPMI.__name__)
    if limit_entities is False:
        classification = "all"
    bow_names = dt.import1dArray(bow_names_fn, "s")
    sparse_freqs = dt.import2dArray(sparse_freqs_fn, return_sparse=True)
    ranking = dt.import2dArray(ranking_fn)
    cluster_names = dt.import2dArray(cluster_names_fn, "s")
    frq = getLROnBag(cluster_names, data_type, lowest_amt, highest_amt, classification, file_name,
                     bow_names, sparse_freqs)
    pav_classes = []
    for f in range(len(frq)):
        print(cluster_names[f])
        x = np.asarray(frq[f])
        y = ranking[f]
        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_p:
            plot(x, y, y_)
        print(f)
    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
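# --- Hedged example (not part of the pipeline) --------------------------------
# Sketch of the PAV step shared by the *PavPPMI functions above: a per-cluster
# signal x (e.g. PPMI or frequency) is paired with the ranking y along a
# direction, and y is replaced by its isotonic (monotone in x) fit. Synthetic
# data; only numpy and scikit-learn are assumed.
def _example_isotonic_fit():
    import numpy as np
    from sklearn.isotonic import IsotonicRegression
    rng = np.random.RandomState(0)
    x = rng.rand(50)                         # e.g. cluster PPMI per entity
    y = 2.0 * x + rng.normal(0, 0.3, 50)     # e.g. ranking score per entity
    y_pav = IsotonicRegression().fit_transform(x, y)
    print(y_pav[:5])
    return y_pav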
def makeTopVectors(filename):
    vectors = dt.import2dArray("Rankings/" + filename + ".space")
    top250names = dt.import1dArray("filmdata/top250.txt")
    film_names = dt.import1dArray("filmdata/filmNames.txt")
    indexes = []
    ordered_names = []
    for f in range(len(film_names)):
        for t in top250names:
            if film_names[f] == t:
                indexes.append(f)
                ordered_names.append(t)
    top_vectors = [[]]
    for v in range(len(vectors)):
        if v > 0:
            top_vectors.append([])
        for i in range(len(vectors[v])):
            for id in indexes:
                if i == id:
                    top_vectors[v].append(vectors[v][i])
    dt.write2dArray(top_vectors, "Plots/Top174" + filename + ".space")
    dt.write1dArray(ordered_names, "Plots/Top174OrderedByOriginalList.txt")
def plotClusters(filename):
    names = dt.import1dArray("Plots/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("Plots/Top174" + filename + ".space")
    cluster_names = dt.import1dArray("Clusters/films100N0.6H25L3CutLeastSimilarHIGH0.75,0.67.names")
    # svd = TruncatedSVD(n_components=2, random_state=42)
    cx = 8
    cy = 9
    x = []
    y = []
    for s in space[cx]:
        x.append(s)
    for s in space[cy]:
        y.append(s)
    # svd_space = svd.fit_transform(space)
    fig, ax = plt.subplots()
    ax.scatter(x, y, picker=True)
    # for i, name in enumerate(found_names):
    #     ax.annotate(name, (x[i], y[i]))
    ax.set_xlabel(cluster_names[cx])
    ax.set_ylabel(cluster_names[cy])

    def onpick3(event):
        ind = event.ind
        print('onpick3 scatter:', names[ind[0]])

    fig.canvas.mpl_connect('pick_event', onpick3)
    plt.show()
def selectCutOffByWordVector(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    wv, wvn = dt.getWordVectors()
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters.
        # NOTE: this selection step was never implemented; the loop below does not
        # populate cutoff_words, so only an empty file is written.
        for cl in range(len(clusters)):
            for wa in range(len(clusters[cl])):
                for w in range(len(clusters[cl][wa])):
                    pass
    dt.write2dArray(cutoff_words, "../data/movies/rules/cutoff/" + file_name + "WVN.txt")
def main(data_type, clf, min, max, depth, rewrite_files):
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "dmround"
    mds_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "d" + str(depth)
    svd_fn = "../data/" + data_type + "/svd/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "d" + str(depth)
    pca_fn = "../data/" + data_type + "/pca/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "round"
    term_frequency_fn = init_vector_path = "../data/" + data_type + \
            "/bow/ppmi/simple_numeric_stopwords_ppmi 2-all.npz"
    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]):
        print("all files exist")
        exit()
    # Get MDS
    """
    tf = dt.import2dArray(term_frequency_fn).transpose()
    pca = sparseSVD(tf, depth)
    dt.write2dArray(pca, pca_fn)
    """
    # REMINDER: np.dot is WAY faster!
    tf = dt.import2dArray(term_frequency_fn, return_sparse=True)
    dm = getDsimMatrixDense(tf)
    dt.write2dArray(dm, dm_fn)
    print("wrote dm")
    """
    Pretty sure none of this works
    """
def plotSVD(filename):
    names = dt.import1dArray("Plots/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("Plots/Top174" + filename + ".space")
    space = np.matrix.transpose(np.asarray(space))
    space = space.tolist()
    svd = TruncatedSVD(n_components=2, random_state=42)
    svd_space = svd.fit_transform(space)
    x = []
    y = []
    for s in svd_space:
        print(s)
        x.append(s[0])
        y.append(s[1])
    fig, ax = plt.subplots()
    ax.scatter(x, y, picker=True)
    # for i, name in enumerate(found_names):
    #     ax.annotate(name, (x[i], y[i]))

    def onpick3(event):
        ind = event.ind
        print('onpick3 scatter:', names[ind[0]])

    fig.canvas.mpl_connect('pick_event', onpick3)
    plt.show()
def pavTermFrequency(ranking_fn, cluster_names_fn, fn, do_p):
    # The plot flag is named do_p to match the other PAV functions in this file;
    # a flag named "plot" would shadow the module-level plot() helper called below.
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0
    for name in names:
        frq.append(readFreq(name))
    pav_classes = []
    for f in range(len(frq)):
        print(names[f])
        x = np.asarray(frq[f])
        y = ranking[f]
        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_p:
            plot(x, y, y_)
        print(f)
    dt.write2dArray(pav_classes, "../data/movies/finetune/" + fn + "PavTermFrequency.txt")
    return pav_classes
def PPMIFT(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies", rewrite_files=False,
           limit_entities=False, classification="genres", lowest_amt=0, highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", PPMIFT.__name__)
        return
    else:
        print("Running task", PPMIFT.__name__)
    if limit_entities is False:
        classification = "all"
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0
    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(readPPMI(name, data_type, lowest_amt, highest_amt, classification))
    dt.write2dArray(frq, pavPPMI_fn)
    return frq
def match_entities(entity_fn, t_entity_fn, entities_fn, classification):
    names = dt.import1dArray(entity_fn)
    t_names = dt.import1dArray(t_entity_fn)
    entities = dt.import2dArray(entities_fn)
    indexes_to_delete = []
    amount_found = 0
    for n in range(len(names)):
        names[n] = dt.removeEverythingFromString(names[n])
    for n in range(len(t_names)):
        t_names[n] = dt.removeEverythingFromString(t_names[n])
    matched_ids = []
    for n in range(len(t_names)):
        for ni in range(len(names)):
            matched_name = t_names[n]
            all_name = names[ni]
            if matched_name == all_name:
                print(matched_name)
                matched_ids.append(ni)
                amount_found += 1
                break
    matched_entities = []
    for e in matched_ids:
        matched_entities.append(entities[e])
    print("Amount found", amount_found)
    dt.write2dArray(matched_entities, entities_fn[:len(entities_fn) - 4] + "-" + classification + ".txt")
def getAllRankings(directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, percent,
                   percentage_increment, by_vector, fn, discrete=True, data_type="movies",
                   rewrite_files=False):
    # The label file names are needed below whenever discrete is True
    labels_fn = "../data/" + data_type + "/rank/labels/" + fn + ".txt"
    rankings_fn = "../data/" + data_type + "/rank/numeric/" + fn + ".txt"
    discrete_labels_fn = "../data/" + data_type + "/rank/discrete/" + fn + ".txt"
    all_fns = [rankings_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        for f in all_fns:
            print(f, "Already exists")
        print("Skipping task", "getAllRankings")
        return
    else:
        print("Running task", "getAllRankings")
    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    cluster_names = dt.import1dArray(cluster_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    rankings = getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.asarray(rankings)
    if discrete:
        labels = createLabels(rankings, percent)
        labels = np.asarray(labels)
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    if by_vector:
        labels = labels.transpose()
        if discrete:
            discrete_labels = discrete_labels.transpose()
        rankings = rankings.transpose()
    if discrete:
        dt.write2dArray(labels, labels_fn)
    dt.write2dArray(rankings, rankings_fn)
    if discrete:
        dt.write2dArray(discrete_labels, discrete_labels_fn)
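# --- Hedged example (not part of the pipeline) --------------------------------
# getRankings is defined elsewhere; this sketch assumes that ranking entities
# along a direction means scoring each entity vector by its dot product with
# the direction vector and ordering entities by that score. numpy only.
def _example_rank_along_direction():
    import numpy as np
    entities = np.random.RandomState(1).rand(6, 4)   # 6 entities, 4 dimensions
    direction = np.random.RandomState(2).rand(4)
    scores = entities @ direction
    order = np.argsort(scores)[::-1]                 # highest-scoring entity first
    print(scores.round(3), order)
    return scores, order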
def main(data_type, vector_size, window_size, min_count, sampling_threshold, negative_size, train_epoch,
         dm, worker_count, train_wv, concatenate_wv, use_hierarchical_softmax):
    file_name = "Doc2Vec" + " VS" + str(vector_size) + " WS" + str(window_size) + " MC" + str(min_count) + \
                " ST" + str(sampling_threshold) + " NS" + str(negative_size) + " TE" + str(train_epoch) + \
                " DM" + str(dm) + " WC" + str(worker_count) + "spacy" + \
                " TW" + str(train_wv) + " CW" + str(concatenate_wv) + " HS" + str(use_hierarchical_softmax)
    corpus_fn = "../data/raw/" + data_type + "/corpus_processed.txt"
    if os.path.exists(corpus_fn) is False:
        x_train = np.load("../data/raw/" + data_type + "/x_train_w.npy")
        x_test = np.load("../data/raw/" + data_type + "/x_test_w.npy")
        corpus = np.concatenate((x_train, x_test), axis=0)
        text_corpus = np.empty(len(corpus), dtype=object)
        for i in range(len(corpus)):
            text_corpus[i] = " ".join(corpus[i])
            print(text_corpus[i])
        dt.write1dArray(text_corpus, corpus_fn)
    embedding_fn = "/home/tom/Downloads/glove.6B/glove.6B.300d.txt"
    model_fn = "../data/" + data_type + "/doc2vec/" + file_name + ".bin"
    vector_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".npy"
    score_fn = "../data/" + data_type + "/doc2vec/" + file_name + "catacc.score"
    if os.path.exists(model_fn):
        print("Imported model")
        model = g.utils.SaveLoad.load(model_fn)
    elif file_name[:7] == "Doc2Vec":
        model = doc2Vec(embedding_fn, corpus_fn, vector_size, window_size, min_count, sampling_threshold,
                        negative_size, train_epoch, dm, worker_count, train_wv, concatenate_wv,
                        use_hierarchical_softmax)
        model.save(model_fn)
    if os.path.exists(vector_fn) is False:
        vectors = []
        for d in range(len(model.docvecs)):
            vectors.append(model.docvecs[d])
        np.save(vector_fn, vectors)
    else:
        print("Imported vectors")
        vectors = np.load(vector_fn)
    if os.path.exists(score_fn) is False or file_name[:7] != "Doc2Vec":
        print("Getting score")
        if data_type == "sentiment":
            classes = dt.import1dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = sentiment.getSplits(vectors, classes)
            scores = linearSVMScore(x_train, y_train, x_test, y_test)
        else:
            classes = dt.import2dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = newsgroups.getSplits(vectors, classes)
            scores = multiClassLinearSVM(x_train, y_train, x_test, y_test)
        print(scores)
        dt.write1dArray(scores, score_fn)
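# --- Hedged example (not part of the pipeline) --------------------------------
# Sketch of the training that main() delegates to doc2Vec() (defined elsewhere):
# gensim's Doc2Vec over TaggedDocument objects, mapping the parameters above to
# gensim's names (vector_size, window, min_count, sample, negative, epochs, dm,
# hs, workers). Toy corpus only; the real corpus, GloVe initialisation and
# word-vector training options are not reproduced here.
def _example_doc2vec():
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    docs = [TaggedDocument(words=text.split(), tags=[i])
            for i, text in enumerate(["a small toy corpus", "another short document"])]
    model = Doc2Vec(docs, vector_size=50, window=5, min_count=1, sample=1e-5,
                    negative=5, epochs=10, dm=1, hs=0, workers=1)
    vec = model.infer_vector("a new document".split())
    print(len(vec))  # 50
    return model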
def convertToTfIDF(data_type, lowest_count, highest_count, freq_arrays_fn, class_type):
    freq = np.asarray(dt.import2dArray(freq_arrays_fn))
    v = TfidfTransformer()
    x = v.fit_transform(freq)
    x = x.toarray()
    dt.write2dArray(x, "../data/" + data_type + "/bow/tfidf/class-all-" + str(lowest_count) + "-" +
                    str(highest_count) + "-" + str(class_type))
    dt.writeClassAll("../data/" + data_type + "/bow/tfidf/class-all-" + str(lowest_count) + "-" +
                     str(highest_count) + "-" + str(class_type),
                     "../data/" + data_type + "/bow/names/" + str(lowest_count) + "-" + str(highest_count) +
                     "-" + str(class_type) + ".txt",
                     "../data/" + data_type + "/bow/names/" + str(lowest_count) + "-" + str(highest_count) +
                     "-" + str(class_type) + ".txt",
                     "../data/" + data_type + "/bow/tfidf/class-all-" + str(lowest_count) + "-" +
                     str(highest_count) + "-" + str(class_type))
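# --- Hedged example (not part of the pipeline) --------------------------------
# Minimal sketch of the conversion in convertToTfIDF above: TfidfTransformer
# re-weights a term-frequency matrix. Synthetic counts; scikit-learn only.
def _example_tfidf():
    import numpy as np
    from sklearn.feature_extraction.text import TfidfTransformer
    freq = np.array([[3, 0, 1],
                     [0, 2, 0],
                     [1, 1, 1]])
    tfidf = TfidfTransformer().fit_transform(freq).toarray()
    print(tfidf.round(3))
    return tfidf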
def pavPPMI(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies", rewrite_files=False,
            limit_entities=False, classification="genres", lowest_amt=0, highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", pavPPMI.__name__)
        return
    else:
        print("Running task", pavPPMI.__name__)
    if limit_entities is False:
        classification = "all"
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    counter = 0
    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(readPPMI(name, data_type, lowest_amt, highest_amt, classification))
    pav_classes = []
    for f in range(len(frq)):
        try:
            print(names[f])
            x = np.asarray(frq[f])
            y = ranking[f]
            ir = IsotonicRegression()
            y_ = ir.fit_transform(x, y)
            pav_classes.append(y_)
            if do_p:
                plot(x, y, y_)
        except ValueError:
            print(names[f], "len ppmi", len(frq[f]), "len ranking", len(ranking[f]))
            exit()
        print(f)
    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
def selectCutOffByExplanation(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    dupe_cutoff = copy.deepcopy(cutoff)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    cluster_boundary = 2
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = []
        cluster_array.append(key)
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    explanations = []
    explanation_cutoffs = []
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # For each cluster
        explained_cutoff = []
        explained_cutoff_value = []
        for cl in range(len(clusters)):
            if len(clusters[cl]) == 0:
                print("Skipped")
                continue
            cluster_explanation, winning_index = webapi.getHighestScore(clusters[cl])
            explained_cutoff.append(cluster_explanation + ",")
            dict_index = 0
            for h in range(len(cluster_dict_arrays[cl])):
                if cluster_dict_arrays[cl][h] == clusters[cl][winning_index]:
                    dict_index = h
            explained_cutoff_value.append(dupe_cutoff[cl][dict_index])
        explanations.append(explained_cutoff)
        explanation_cutoffs.append(explained_cutoff_value)
    dt.write2dArray(explanations, "../data/movies/rules/final_names/" + file_name + "WVN.txt")
    dt.write2dArray(explanation_cutoffs, "../data/movies/rules/final_cutoff/" + file_name + ".txt")
def saveClusters(directions_fn, scores_fn, names_fn, filename, amt_of_dirs, data_type, cluster_amt,
                 rewrite_files=False, algorithm="meanshift_k"):
    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"
    all_fns = [dict_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)
    p_dir = dt.import2dArray(directions_fn)
    p_names = dt.import1dArray(names_fn, "s")
    p_scores = dt.import1dArray(scores_fn, "f")
    ids = np.argsort(p_scores)
    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]
    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)
    unique, counts = np.unique(labels, return_counts=True)
    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))
    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")
    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
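# --- Hedged example (not part of the pipeline) --------------------------------
# Sketch of the clustering step in saveClusters above (the kMeans/meanShift
# wrappers are defined elsewhere): cluster the direction vectors, then average
# each cluster's members to obtain one representative direction per cluster.
def _example_cluster_directions():
    import numpy as np
    from sklearn.cluster import KMeans
    directions = np.random.RandomState(0).rand(40, 10)
    labels = KMeans(n_clusters=5, n_init=10, random_state=0).fit_predict(directions)
    cluster_directions = [directions[labels == k].mean(axis=0) for k in range(5)]
    print(len(cluster_directions), cluster_directions[0].shape)
    return cluster_directions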
def getAllPhraseRankings(directions_fn=None, vectors_fn=None, property_names_fn=None, vector_names_fn=None,
                         fn="no filename", percentage_increment=1, scores_fn=None, top_amt=0,
                         discrete=False, data_type="movies", rewrite_files=False):
    rankings_fn_all = "../data/" + data_type + "/rank/numeric/" + fn + "ALL.txt"
    all_fns = [rankings_fn_all]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getAllPhraseRankings")
        return
    else:
        print("Running task", "getAllPhraseRankings")
    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    property_names = dt.import1dArray(property_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    if top_amt != 0:
        scores = dt.import1dArray(scores_fn, "f")
        directions = dt.sortByReverseArray(directions, scores)[:top_amt]
        property_names = dt.sortByReverseArray(property_names, scores)[:top_amt]
    rankings = getRankings(directions, vectors, property_names, vector_names)
    if discrete:
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    for a in range(len(rankings)):
        rankings[a] = np.around(rankings[a], decimals=4)
    # dt.write1dArray(property_names, "../data/movies/bow/names/top5kof17k.txt")
    dt.write2dArray(rankings, rankings_fn_all)
def plotTopVectors(filename):
    names = dt.import1dArray("../data/movies/plot/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("../data/movies/plot/Top174" + filename + ".space")
    svd = TruncatedSVD(n_components=2, random_state=42)
    svd_space = svd.fit_transform(space)
    pl.plot(space[0], 'rx')
    pl.show()
def trimRankings(rankings_fn, available_indexes_fn, names, folder_name):
    available_indexes = dt.import1dArray(available_indexes_fn)
    rankings = np.asarray(dt.import2dArray(rankings_fn))
    names = dt.import1dArray(names)
    trimmed_rankings = []
    for r in range(len(rankings)):
        trimmed = rankings[r].take(available_indexes)
        trimmed_rankings.append(trimmed)
    for a in range(len(trimmed_rankings)):
        print("Writing", names[a])
        dt.write1dArray(trimmed_rankings[a], folder_name + "class-" + names[a])
    print("Writing", rankings_fn[-6:])
    dt.write2dArray(trimmed_rankings, folder_name + "class-" + rankings_fn[-6:])
def getNDCG(rankings_fn, fn, data_type, bow_fn, ppmi_fn, lowest_count, rewrite_files=False,
            highest_count=0, classification=""):
    # Check if the NDCG scores have already been calculated; if they have, then skip.
    ndcg_fn = "../data/" + data_type + "/ndcg/" + fn + ".txt"
    all_fns = [ndcg_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", getNDCG.__name__)
        return
    else:
        print("Running task", getNDCG.__name__)
    # Get the PPMI values for every word and the list of words ("names")
    names = dt.import1dArray("../data/" + data_type + "/bow/names/" + bow_fn)
    ppmi = dt.import2dArray("../data/" + data_type + "/bow/ppmi/" + ppmi_fn)
    # Process the rankings line-by-line so as to not run out of memory
    ndcg_a = []
    # spearman_a = []
    with open(rankings_fn) as rankings:
        r = 0
        for lr in rankings:
            # Pair each ranking line with the PPMI row of the same word (the original
            # inner loop over ppmi broke immediately, so it always reused ppmi[0])
            lp = ppmi[r]
            # Get the indices of the ranking values sorted from highest to lowest,
            # e.g. "1, 4, 3, 50"
            sorted_indices = np.argsort(list(map(float, lr.strip().split())))[::-1]
            # Get the NDCG score for the PPMI values (the relevance scores) evaluated
            # at the order given by the ranking
            ndcg = ndcg_from_ranking(lp, sorted_indices)
            # Add to array and print
            ndcg_a.append(ndcg)
            print("ndcg", ndcg, names[r], r)
            """
            smr = spearmanr(ppmi_indices, sorted_indices)[1]
            spearman_a.append(smr)
            print("spearman", smr, names[r], r)
            """
            r += 1
    # Save NDCG
    dt.write1dArray(ndcg_a, ndcg_fn)
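# --- Hedged example (not part of the pipeline) --------------------------------
# Standalone sketch of the comparison in getNDCG above, assuming
# ndcg_from_ranking (defined elsewhere) follows a common NDCG variant with
# linear gains and log2 discounts: PPMI values act as relevance scores and are
# evaluated in the order induced by the learned ranking, normalised by the
# ideal ordering. numpy only.
def _example_ndcg():
    import numpy as np

    def ndcg(relevance, order):
        relevance = np.asarray(relevance, dtype=float)
        gains = relevance[order]
        discounts = np.log2(np.arange(2, len(gains) + 2))
        dcg = np.sum(gains / discounts)
        ideal = np.sum(np.sort(relevance)[::-1] / discounts)
        return dcg / ideal

    ppmi_row = [0.2, 1.5, 0.0, 0.7]          # relevance of each entity
    ranking_row = [0.9, 0.1, 0.3, 0.8]       # learned ranking scores
    order = np.argsort(ranking_row)[::-1]    # best-ranked entity first
    print(ndcg(ppmi_row, order))
    return ndcg(ppmi_row, order)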
def makeTopVectorsDirections(filename):
    vectors = dt.import2dArray("Directions/" + filename + "Cut.directions")
    top250names = dt.import1dArray("filmdata/top250.txt")
    filmnames = dt.import1dArray("filmdata/filmNames.txt")
    top250vectors = []
    for f in range(len(filmnames)):
        for t in range(len(top250names)):
            if filmnames[f] == top250names[t]:
                top250vectors.append(vectors[t])
    dt.write2dArray(top250vectors, "../data/movies/plot/t250" + filename + ".directions")
def denoisingAutoencoder(self, noise, deep_size):
    entity_vectors = np.asarray(dt.import2dArray(self.vector_path))
    if len(entity_vectors) != 15000:
        entity_vectors = entity_vectors.transpose()
    if self.class_path is None:
        entity_classes = entity_vectors
    else:
        entity_classes = np.asarray(dt.import2dArray(self.class_path))
    input_size = len(entity_vectors[0])
    output_size = len(entity_classes[0])
    if self.dropout_noise is None:
        self.model.add(GaussianNoise(noise, input_shape=(input_size,)))
    else:
        self.model.add(Dropout(self.dropout_noise[0], input_shape=(input_size,)))
    if deep_size is not None:
        self.model.add(Dense(output_dim=deep_size, input_dim=self.hidden_layer_size, init=self.layer_init,
                             activation=self.hidden_activation, W_regularizer=l2(self.reg),
                             activity_regularizer=activity_l2(self.activity_reg)))
    self.model.add(Dense(output_dim=self.hidden_layer_size, input_dim=input_size, init=self.layer_init,
                         activation=self.hidden_activation, W_regularizer=l2(self.reg)))
    self.model.add(Dense(output_dim=output_size, init=self.layer_init, activation=self.output_activation,
                         W_regularizer=l2(self.reg)))
    self.model.compile(loss=self.loss, optimizer=self.optimizer)
    return entity_vectors, entity_classes
def main(data_type, clf, highest_amt, lowest_amt, depth, rewrite_files):
    min = lowest_amt
    max = highest_amt
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "dmround"
    mds_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "d" + str(depth)
    svd_fn = "../data/" + data_type + "/svd/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "d" + str(depth)
    pca_fn = "../data/" + data_type + "/pca/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "round"
    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + \
            "-" + str(max) + "-" + clf
    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]):
        print("all files exist")
        exit()
    if dt.fileExists(dm_fn) is False:
        newsgroups_train = fetch_20newsgroups(subset='train', shuffle=False)
        newsgroups_test = fetch_20newsgroups(subset='test', shuffle=False)
        vectors = np.concatenate((newsgroups_train.data, newsgroups_test.data), axis=0)
        newsgroups_test = None
        newsgroups_train = None
        # Get the sparse term-frequency representation
        tf_vectorizer = CountVectorizer(max_df=highest_amt, min_df=lowest_amt, stop_words='english')
        print("completed vectorizer")
        tf = tf_vectorizer.fit_transform(vectors)
        vectors = None
        # Get the sparse PPMI representation from the sparse term-frequency representation
        print("done ppmi sparse")
        sparse_ppmi = convertPPMISparse(tf)
        # Get the sparse dissimilarity matrix from the sparse PPMI representation
        dm = getDissimilarityMatrixSparse(sparse_ppmi)
        dt.write2dArray(dm, dm_fn)
    else:
        dm = dt.import2dArray(dm_fn)
    print("starting mds")
    # Use the dissimilarity matrix as input to MDS
    mds = createMDS(dm, depth)
    # Save the MDS representation
    dt.write2dArray(mds, mds_fn)
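# --- Hedged example (not part of the pipeline) --------------------------------
# Sketch of the final step in main above (createMDS is defined elsewhere):
# scikit-learn MDS run on a precomputed dissimilarity matrix. Synthetic,
# symmetric matrix with a zero diagonal; scikit-learn only.
def _example_mds_from_dissimilarity():
    import numpy as np
    from sklearn.manifold import MDS
    dm = np.random.RandomState(0).rand(10, 10)
    dm = (dm + dm.T) / 2.0
    np.fill_diagonal(dm, 0.0)
    coords = MDS(n_components=3, dissimilarity="precomputed", random_state=0).fit_transform(dm)
    print(coords.shape)  # (10, 3)
    return coords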
def getDissimilarityMatrixSparse(tf):
    tflen = tf.shape[0]
    dm = np.empty([tflen, tflen], dtype="float64")
    pithing = 2 / pi
    norms = np.empty(tflen, dtype="float64")
    # Calculate norms
    for ei in range(tflen):
        norms[ei] = spl.norm(tf[ei])
        print("norm", ei)
    dot_product = np.zeros([tflen, tflen], dtype="float64")
    use_old_dp = True
    if use_old_dp:
        dot_product = dt.import2dArray("dotproduct.temp")
    else:
        # Calculate dot products (the matrix is symmetric, so reuse the mirrored entry)
        for ei in range(tflen):
            for ej in range(tflen):
                if dot_product[ej][ei] != 0:
                    dot_product[ei][ej] = dot_product[ej][ei]
                    continue
                dot_product[ei][ej] = tf[ei].dot(tf[ej].T)[0, 0]
            print("dp", ei)
        dt.write2dArray(dot_product, "dotproduct.temp")
    norm_multiplied = np.empty([tflen, tflen], dtype="float64")
    # Calculate the pairwise products of the norms
    for ei in range(tflen):
        for ej in range(tflen):
            norm_multiplied[ei][ej] = norms[ei] * norms[ej]
        print("dp", ei)
    norm_multiplied = dt.shortenFloatsNoFn(norm_multiplied)
    dot_product = dt.shortenFloatsNoFn(dot_product)
    # Get angular differences
    for ei in range(tflen):
        for ej in range(tflen):
            ang = pithing * np.arccos(dot_product[ei][ej] / norm_multiplied[ei][ej])
            dm[ei][ej] = ang
        print(ei)
    return dm
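# --- Hedged example (not part of the pipeline) --------------------------------
# Dense numpy version of the angular dissimilarity computed row-by-row in
# getDissimilarityMatrixSparse above: d(i, j) = (2 / pi) * arccos(cos_sim(i, j)).
# Small synthetic matrix only; the pipeline version streams over sparse rows.
def _example_angular_dissimilarity():
    import numpy as np
    X = np.random.RandomState(0).rand(5, 20)
    norms = np.linalg.norm(X, axis=1)
    cos = np.clip((X @ X.T) / np.outer(norms, norms), -1.0, 1.0)
    dm = (2.0 / np.pi) * np.arccos(cos)
    print(dm.round(3))
    return dm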
def main(min, max, data_type, raw_fn, extension, cut_first_line, additional_name, make_individual,
         entity_name_fn, use_all_files, sparse_matrix, word_count_amt, classification):
    getVectors(raw_fn, entity_name_fn, extension, "../data/" + data_type + "/bow/", min, max,
               cut_first_line, get_all, additional_name, make_individual, classification,
               use_all_files, 1000, data_type, sparse_matrix)
    bow = sp.csr_matrix(dt.import2dArray("../data/" + data_type + "/bow/frequency/phrases/class-all-" +
                                         str(min) + "-" + str(max) + "-" + classification))
    dt.write2dArray(convertPPMI(bow),
                    "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) + "-" +
                    classification)
    print("individual from all")
    printIndividualFromAll(data_type, "ppmi", min, max, classification)
    printIndividualFromAll(data_type, "binary/phrases", min, max, classification)
    convertToTfIDF(data_type, min, max,
                   "../data/" + data_type + "/bow/frequency/phrases/class-all-" + str(min) + "-" + str(max) +
                   "-" + classification, classification)
    printIndividualFromAll(data_type, "tfidf", min, max, classification)
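# --- Hedged example (not part of the pipeline) --------------------------------
# Dense sketch of what convertPPMI (defined elsewhere) is assumed to compute:
# positive pointwise mutual information over a document-term count matrix.
# Toy counts; the pipeline works on a sparse csr_matrix.
def _example_ppmi():
    import numpy as np
    counts = np.array([[2, 0, 1],
                       [0, 3, 1],
                       [1, 1, 0]], dtype=float)
    total = counts.sum()
    p_ij = counts / total
    p_i = counts.sum(axis=1, keepdims=True) / total
    p_j = counts.sum(axis=0, keepdims=True) / total
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi = np.log(p_ij / (p_i * p_j))
    ppmi = np.where(np.isfinite(pmi) & (pmi > 0), pmi, 0.0)
    print(ppmi.round(3))
    return ppmi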
def getCutOff(cluster_dict_fn, rankings_fn, file_name):
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    rankings = dt.importDiscreteVectors(rankings_fn)
    for r in rankings:
        for a in range(len(r)):
            r[a] = int(r[a][:-1])
    cutoff_clusters = []
    counter = 0
    for key, value in cluster_dict.items():
        value.insert(0, key)
        cutoffs = []
        for v in value:
            max_score = 0
            cutoff = 0
            for i in range(1, 101):
                y_pred = []
                for ve in range(len(rankings[counter])):
                    rank = rankings[counter][ve]
                    if rank > i:
                        y_pred.append(0)
                    else:
                        y_pred.append(1)
                y_test = dt.import2dArray("../data/movies/bow/frequency/phrases/class-" + v, "s")
                score = cohen_kappa_score(y_test, y_pred)
                print(v, int(i), "Score", score)
                if score > max_score:
                    max_score = score
                    cutoff = i
            cutoffs.append(cutoff)
            print("Cutoff for", v, "On", key, "Was", str(cutoff))
        cutoff_clusters.append(cutoffs)
        counter += 1
    dt.write2dArray(cutoff_clusters, "../data/movies/rules/cutoff/" + file_name + ".txt")
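# --- Hedged example (not part of the pipeline) --------------------------------
# Minimal sketch of the search in getCutOff above: threshold a discrete ranking
# at i, compare the induced binary labels against the true term occurrences
# with Cohen's kappa, and keep the threshold with the best agreement.
def _example_cutoff_by_kappa():
    import numpy as np
    from sklearn.metrics import cohen_kappa_score
    ranks = np.array([1, 5, 20, 40, 60, 80])    # discrete rank per entity
    y_true = np.array([1, 1, 1, 0, 0, 0])       # entity actually has the term
    best_cutoff, best_kappa = 0, -1.0
    for i in range(1, 101):
        y_pred = (ranks <= i).astype(int)       # rank within cutoff -> predicted positive
        k = cohen_kappa_score(y_true, y_pred)
        if k > best_kappa:
            best_kappa, best_cutoff = k, i
    print(best_cutoff, best_kappa)              # 20 1.0
    return best_cutoff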
def __init__(self, vector_path, class_path, property_names_fn, file_name, svm_type, training_size=10000,
             lowest_count=200, highest_count=21470000, get_kappa=True, get_f1=True, single_class=True,
             data_type="movies", getting_directions=True, threads=1, chunk_amt=0, chunk_id=0,
             rewrite_files=False, classification="all", loc="../data/"):
    self.get_kappa = True
    self.get_f1 = get_f1
    self.data_type = data_type
    self.classification = classification
    self.lowest_amt = lowest_count
    self.higher_amt = highest_count
    if chunk_amt > 0:
        file_name = file_name + " CID" + str(chunk_id) + " CAMT" + str(chunk_amt)
    directions_fn = loc + data_type + "/svm/directions/" + file_name + ".txt"
    ktau_scores_fn = loc + data_type + "/svm/f1/" + file_name + ".txt"
    kappa_fn = loc + data_type + "/svm/kappa/" + file_name + ".txt"
    acc_fn = loc + data_type + "/svm/acc/" + file_name + ".txt"
    all_fns = [directions_fn, kappa_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getSVMResults")
        return
    else:
        print("Running task", "getSVMResults")
    y_train = 0
    y_test = 0
    vectors = np.asarray(dt.import2dArray(vector_path))
    print("imported vectors")
    if not getting_directions:
        classes = np.asarray(dt.import2dArray(class_path))
        print("imported classes")
    property_names = dt.import1dArray(property_names_fn)
    print("imported property names")
    if chunk_amt > 0:
        if chunk_id == chunk_amt - 1:
            chunk = int(len(property_names) / chunk_amt)
            multiply = chunk_amt - 1
            property_names = property_names[chunk * multiply:]
        else:
            property_names = dt.chunks(property_names, int((len(property_names) / chunk_amt)))[chunk_id]
    if not getting_directions:
        x_train, x_test, y_train, y_test = train_test_split(vectors, classes, test_size=0.3, random_state=0)
    else:
        x_train = vectors
        x_test = vectors
    if get_f1:
        y_train = y_train.transpose()
        y_test = y_test.transpose()
        print("transposed")
    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train
    self.y_test = y_test
    if self.get_f1 is False:
        print("running svms")
        kappa_scores, directions, ktau_scores, property_names = self.runAllSVMs(
            y_test, y_train, property_names, file_name, svm_type, getting_directions, threads)
        dt.write1dArray(kappa_scores, kappa_fn)
        dt.write2dArray(directions, directions_fn)
        dt.write1dArray(ktau_scores, ktau_scores_fn)
        dt.write1dArray(property_names, property_names_fn + file_name + ".txt")
    else:
        final_f1 = []
        final_acc = []
        for y in range(len(y_train)):
            f1, acc = self.runClassifySVM(y_test[y], y_train[y])
            print(f1, acc)
            final_f1.append(f1)
            final_acc.append(acc)
        dt.write1dArray(final_f1, ktau_scores_fn)
        dt.write1dArray(final_acc, acc_fn)
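# --- Hedged example (not part of the pipeline) --------------------------------
# runAllSVMs is defined elsewhere; this sketch assumes a "direction" for a
# property is the weight vector of a linear SVM trained to separate entities
# that have the property from entities that do not. Synthetic data only.
def _example_svm_direction():
    import numpy as np
    from sklearn.svm import LinearSVC
    rng = np.random.RandomState(0)
    vectors = rng.rand(100, 20)                        # entity vectors
    has_property = (vectors[:, 3] > 0.5).astype(int)   # toy binary property
    clf = LinearSVC(C=1.0, max_iter=5000).fit(vectors, has_property)
    direction = clf.coef_[0]                           # the learned direction
    print(direction.shape)  # (20,)
    return direction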
def __init__(self, features_fn, classes_fn, class_names_fn, cluster_names_fn, filename, max_depth=None,
             balance=None, criterion="entropy", save_details=False, data_type="movies", cv_splits=5,
             csv_fn="../data/temp/no_csv_provided.csv", rewrite_files=True, split_to_use=-1,
             development=False, limit_entities=False, limited_label_fn=None, vector_names_fn=None,
             pruning=1, save_results_so_far=False):
    vectors = np.asarray(dt.import2dArray(features_fn)).transpose()
    labels = np.asarray(dt.import2dArray(classes_fn, "i"))
    print("vectors", len(vectors), len(vectors[0]))
    print("labels", len(labels), len(labels[0]))
    cluster_names = dt.import1dArray(cluster_names_fn)
    label_names = dt.import1dArray(class_names_fn)
    all_fns = []
    file_names = ['ACC J48' + filename, 'F1 J48' + filename]
    acc_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[0] + '.scores'
    f1_fn = '../data/' + data_type + '/rules/tree_scores/' + file_names[1] + '.scores'
    all_fns.append(acc_fn)
    all_fns.append(f1_fn)
    all_fns.append(csv_fn)
    print(dt.allFnsAlreadyExist(all_fns), rewrite_files)
    if (dt.allFnsAlreadyExist(all_fns) and not rewrite_files) or save_results_so_far:
        print("Skipping task", "Weka Tree")
        return
    else:
        print("Running task", "Weka Tree")
    for l in range(len(cluster_names)):
        cluster_names[l] = cluster_names[l].split()[0]
    """
    for l in range(len(label_names)):
        if label_names[l][:6] == "class-":
            label_names[l] = label_names[l][6:]
    """
    f1_array = []
    accuracy_array = []
    labels = labels.transpose()
    print("labels transposed")
    print("labels", len(labels), len(labels[0]))
    if limit_entities is False:
        vector_names = dt.import1dArray(vector_names_fn)
        limited_labels = dt.import1dArray(limited_label_fn)
        vectors = np.asarray(dt.match_entities(vectors, limited_labels, vector_names))
    all_y_test = []
    all_predictions = []
    for l in range(len(labels)):
        if balance:
            new_vectors, new_labels = dt.balanceClasses(vectors, labels[l])
        else:
            new_vectors = vectors
            new_labels = labels[l]
        # Select training data with cross validation
        ac_y_test = []
        ac_y_train = []
        ac_x_train = []
        ac_x_test = []
        ac_y_dev = []
        ac_x_dev = []
        cv_f1 = []
        cv_acc = []
        if cv_splits == 1:
            kf = KFold(n_splits=3, shuffle=False, random_state=None)
        else:
            kf = KFold(n_splits=cv_splits, shuffle=False, random_state=None)
        c = 0
        for train, test in kf.split(new_vectors):
            if split_to_use > -1:
                if c != split_to_use:
                    c += 1
                    continue
            ac_y_test.append(new_labels[test])
            ac_y_train.append(new_labels[train[int(len(train) * 0.2):]])
            val = int(len(train) * 0.2)
            t_val = train[val:]
            nv_t_val = new_vectors[t_val]
            ac_x_train.append(nv_t_val)
            ac_x_test.append(new_vectors[test])
            ac_x_dev.append(new_vectors[train[:int(len(train) * 0.2)]])
            ac_y_dev.append(new_labels[train[:int(len(train) * 0.2)]])
            c += 1
            if cv_splits == 1:
                break
        predictions = []
        rules = []
        if development:
            ac_x_test = np.copy(np.asarray(ac_x_dev))
            ac_y_test = np.copy(np.asarray(ac_y_dev))
        train_fn = "../data/" + data_type + "/weka/data/" + filename + "Train.txt"
        test_fn = "../data/" + data_type + "/weka/data/" + filename + "Test.txt"
        for splits in range(len(ac_y_test)):
            # Get the weka predictions
            dt.writeArff(ac_x_train[splits], [ac_y_train[splits]], [label_names[splits]], train_fn, header=True)
            dt.writeArff(ac_x_test[splits], [ac_y_test[splits]], [label_names[splits]], test_fn, header=True)
            prediction, rule = self.getWekaPredictions(train_fn + label_names[splits] + ".arff",
                                                       test_fn + label_names[splits] + ".arff",
                                                       save_details, pruning)
            predictions.append(prediction)
            rules.append(rule)
        for i in range(len(predictions)):
            if len(predictions) == 1:
                all_y_test.append(ac_y_test[i])
                all_predictions.append(predictions[i])
            f1 = f1_score(ac_y_test[i], predictions[i], average="binary")
            accuracy = accuracy_score(ac_y_test[i], predictions[i])
            cv_f1.append(f1)
            cv_acc.append(accuracy)
            scores = [[label_names[l], "f1", f1, "accuracy", accuracy]]
            print(scores)
            # Export a tree for each label predicted by the clf, not sure if this is needed...
            if save_details:
                data_fn = "../data/" + data_type + "/rules/weka_rules/" + label_names[l] + " " + filename + ".txt"
                class_names = [label_names[l], "NOT " + label_names[l]]
                # self.get_code(clf, cluster_names, class_names, label_names[l] + " " + filename, data_type)
                dt.write1dArray(rules[i].split("\n"), data_fn)
                dot_file = dt.import1dArray(data_fn)
                new_dot_file = []
                for line in dot_file:
                    if "->" not in line and "label" in line and '"t ' not in line and '"f ' not in line:
                        line = line.split('"')
                        line[1] = '"' + cluster_names[int(line[1])] + '"'
                        line = "".join(line)
                    new_dot_file.append(line)
                dt.write1dArray(new_dot_file, data_fn)
                graph = pydot.graph_from_dot_file(data_fn)
                graph.write_png("../data/" + data_type + "/rules/weka_images/" + label_names[l] + " " +
                                filename + ".png")
        f1_array.append(np.average(np.asarray(cv_f1)))
        accuracy_array.append(np.average(np.asarray(cv_acc)))
    accuracy_array = np.asarray(accuracy_array)
    accuracy_average = np.average(accuracy_array)
    accuracy_array = accuracy_array.tolist()
    f1_array = np.asarray(f1_array)
    f1_average = np.average(f1_array)
    f1_array = f1_array.tolist()
    micro_average = f1_score(np.asarray(all_y_test), np.asarray(all_predictions), average="micro")
    print("Micro F1", micro_average)
    accuracy_array.append(accuracy_average)
    accuracy_array.append(0.0)
    f1_array.append(f1_average)
    f1_array.append(micro_average)
    scores = [accuracy_array, f1_array]
    dt.write1dArray(accuracy_array, acc_fn)
    dt.write1dArray(f1_array, f1_fn)
    print(csv_fn)
    if dt.fileExists(csv_fn):
        print("File exists, writing to csv")
        try:
            dt.write_to_csv(csv_fn, file_names, scores)
        except PermissionError:
            print("CSV FILE WAS OPEN, WRITING TO ANOTHER FILE")
            dt.write_to_csv(csv_fn[:len(csv_fn) - 4] + str(random.random()) + "FAIL.csv", file_names, scores)
    else:
        print("File does not exist, recreating csv")
        key = []
        for l in label_names:
            key.append(l)
        key.append("AVERAGE")
        key.append("MICRO AVERAGE")
        dt.write_csv(csv_fn, file_names, scores, key)
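# --- Hedged example (not part of the pipeline) --------------------------------
# Minimal sketch of the micro-averaged F1 computed at the end above: pool the
# per-label test labels and predictions and score them jointly. scikit-learn only.
def _example_micro_f1():
    import numpy as np
    from sklearn.metrics import f1_score
    all_y_test = np.array([[1, 0, 1], [0, 0, 1]])       # two labels, three test items each
    all_predictions = np.array([[1, 0, 0], [0, 1, 1]])
    micro = f1_score(all_y_test, all_predictions, average="micro")
    print(micro)
    return micro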