def LDA(tf, names, components, file_name, doc_topic_prior, topic_word_prior, data_type, rewrite_files):
    # Removed model name as it was unused and I manually renamed a bunch of files
    # and was too lazy to do model too
    rep_name = "../data/" + data_type + "/LDA/rep/" + file_name + ".txt"
    model_name = "../data/" + data_type + "/LDA/model/" + file_name + ".txt"
    names_name = "../data/" + data_type + "/LDA/names/" + file_name + ".txt"
    all_names = [rep_name, names_name]
    if dt.allFnsAlreadyExist(all_names) and not rewrite_files:
        print("Already completed")
        return
    print(len(tf), len(tf[0]))
    print("Fitting LDA models with tf features,")
    lda = LatentDirichletAllocation(doc_topic_prior=doc_topic_prior, topic_word_prior=topic_word_prior,
                                    n_topics=components)
    t0 = time()
    tf = np.asarray(tf).transpose()
    new_rep = lda.fit_transform(tf)
    print("done in %0.3fs." % (time() - t0))
    print("\nTopics in LDA model:")
    topics = print_top_words(lda, names)
    topics.reverse()
    dt.write1dArray(topics, names_name)
    dt.write2dArray(new_rep.transpose(), rep_name)
    joblib.dump(lda, model_name)
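# A minimal, standalone sketch of the LDA step above using scikit-learn directly on a toy
# corpus. It assumes current scikit-learn naming (n_components rather than the older
# n_topics used in LDA() above); the dt file I/O helpers and on-disk paths are omitted.
def _example_lda_sketch():
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    docs = ["the cat sat on the mat", "dogs and cats", "the dog barked at the cat"]
    tf = CountVectorizer().fit_transform(docs)           # documents x terms counts
    lda = LatentDirichletAllocation(n_components=2, doc_topic_prior=0.1,
                                    topic_word_prior=0.01, random_state=0)
    doc_topics = lda.fit_transform(tf)                    # documents x topics
    print(doc_topics.shape, lda.components_.shape)        # components_ is topics x terms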
def match_entities(entity_fn, t_entity_fn, entities_fn, classification):
    names = dt.import1dArray(entity_fn)
    t_names = dt.import1dArray(t_entity_fn)
    entities = dt.import2dArray(entities_fn)
    amount_found = 0
    for n in range(len(names)):
        names[n] = dt.removeEverythingFromString(names[n])
    for n in range(len(t_names)):
        t_names[n] = dt.removeEverythingFromString(t_names[n])
    # Collect the indexes of the full entity list that match the target entity list
    matched_ids = []
    for n in range(len(t_names)):
        for ni in range(len(names)):
            matched_name = t_names[n]
            all_name = names[ni]
            if matched_name == all_name:
                print(matched_name)
                matched_ids.append(ni)
                amount_found += 1
                break
    matched_entities = []
    for e in matched_ids:
        matched_entities.append(entities[e])
    print("Amount found", amount_found)
    dt.write2dArray(matched_entities, entities_fn[:len(entities_fn)-4] + "-" + classification + ".txt")
def main(data_type, clf, min, max, depth, rewrite_files):
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf + "dmround"
    mds_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
             + "-" + clf + "d" + str(depth)
    svd_fn = "../data/" + data_type + "/svd/class-all-" + str(min) + "-" + str(max) \
             + "-" + clf + "d" + str(depth)
    pca_fn = "../data/" + data_type + "/pca/class-all-" + str(min) + "-" + str(max) \
             + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                 + "-" + clf + "round"
    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/simple_numeric_stopwords_ppmi 2-all.npz"
    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]):
        print("all files exist")
        exit()
    # Get MDS
    """
    tf = dt.import2dArray(term_frequency_fn).transpose()
    pca = sparseSVD(tf, depth)
    dt.write2dArray(pca, pca_fn)
    """
    # REMINDER: np.dot is WAY faster!
    tf = dt.import2dArray(term_frequency_fn, return_sparse=True)
    dm = getDsimMatrixDense(tf)
    dt.write2dArray(dm, dm_fn)
    print("wrote dm")
    # Pretty sure none of this works
def pavTermFrequency(ranking_fn, cluster_names_fn, fn, do_p):
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    for name in names:
        frq.append(readFreq(name))
    pav_classes = []
    for f in range(len(frq)):
        print(names[f])
        x = np.asarray(frq[f])
        y = ranking[f]
        # Fit isotonic (PAV) regression of the ranking against the term frequencies
        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_p:
            plot(x, y, y_)
        print(f)
    dt.write2dArray(pav_classes, "../data/movies/finetune/" + fn + "PavTermFrequency.txt")
    return pav_classes
def selectCutOffByWordVector(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cutoff_words = []
    wv, wvn = dt.getWordVectors()
    cluster_boundary = 2
    # Flatten the cluster dictionary into arrays of [key, value, value, ...]
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = [key]
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            # Group together terms whose cutoffs are within the cluster boundary of each other
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        # (the selection of the representative word was never implemented below)
        for cl in range(len(clusters)):
            for wa in range(len(clusters[cl])):
                for w in range(len(clusters[cl][wa])):
                    clusters[cl][wa]
    dt.write2dArray(cutoff_words, "../data/movies/rules/cutoff/" + file_name + "WVN.txt")
def logisticRegression(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies",
                       rewrite_files=False, limit_entities=False, classification="genres",
                       lowest_amt=0, highest_amt=2147000000, sparse_freqs_fn=None, bow_names_fn=None):
    lr_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [lr_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", logisticRegression.__name__)
        return
    else:
        print("Running task", logisticRegression.__name__)
    if limit_entities is False:
        classification = "all"
    cluster_names = dt.import2dArray(cluster_names_fn, "s")
    bow_names = dt.import1dArray(bow_names_fn, "s")
    sparse_freqs = dt.import2dArray(sparse_freqs_fn, return_sparse=True)
    frq = getLROnBag(cluster_names, data_type, lowest_amt, highest_amt, classification, file_name,
                     bow_names, sparse_freqs)
    dt.write2dArray(frq, lr_fn)
    return frq
def getLROnBag(cluster_dict, data_type, lowest_amt, highest_amt, classification, file_name, names, sparse_freqs):
    bag_of_clusters = []
    # Note, prior we used the PPMI values directly here somehow...
    for c in range(len(cluster_dict)):
        # Remove the colons
        for f in range(len(cluster_dict[c])):
            if ":" in cluster_dict[c][f]:
                cluster_dict[c][f] = cluster_dict[c][f][:-1]
        # Add all of the frequencies together to make a bag-of-clusters
        name = cluster_dict[c][0]
        word_array = sparse_freqs[np.where(names == name)].toarray()
        accum_freqs = np.zeros(shape=len(word_array), dtype=np.int64)
        # For all the cluster terms
        for name in cluster_dict[c]:
            if ":" in name:
                name = name[:-1]
            # Import the class
            class_to_add = sparse_freqs[np.where(names == name)].toarray()
            # Add the current class to the older one
            accum_freqs = np.add(accum_freqs, class_to_add)
        # Append this cluster's frequencies to the group of them
        bag_of_clusters.append(accum_freqs)
    # Convert to binary
    for c in range(len(bag_of_clusters)):
        bag_of_clusters[c][bag_of_clusters[c] > 1] = 1
        # .toarray() gives a 2d array with a single row, so take that row
        bag_of_clusters[c] = bag_of_clusters[c][0]
    dt.write2dArray(bag_of_clusters, "../data/" + data_type + "/bow/boc/" + file_name + ".txt")
    return bag_of_clusters
def bagOfClusters(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies", rewrite_files=False,
                  limit_entities=False, classification="genres", lowest_amt=0, highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", bagOfClusters.__name__)
        return
    else:
        print("Running task", bagOfClusters.__name__)
    if limit_entities is False:
        classification = "all"
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import2dArray(cluster_names_fn, "s")
    frq = writeBagOfClusters(names, data_type, lowest_amt, highest_amt, classification)
    dt.write2dArray(frq, pavPPMI_fn)
    return frq
def makeTopVectors(filename):
    vectors = dt.import2dArray("Rankings/" + filename + ".space")
    top250names = dt.import1dArray("filmdata/top250.txt")
    film_names = dt.import1dArray("filmdata/filmNames.txt")
    indexes = []
    ordered_names = []
    # Find which of the top-250 films are present in the full film list
    for f in range(len(film_names)):
        for t in top250names:
            if film_names[f] == t:
                indexes.append(f)
                ordered_names.append(t)
    top_vectors = [[]]
    for v in range(len(vectors)):
        if v > 0:
            top_vectors.append([])
        for i in range(len(vectors[v])):
            for id in indexes:
                if i == id:
                    top_vectors[v].append(vectors[v][i])
    dt.write2dArray(top_vectors, "Plots/Top174" + filename + ".space")
    dt.write1dArray(ordered_names, "Plots/Top174OrderedByOriginalList.txt")
def PPMIFT(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies", rewrite_files=False,
           limit_entities=False, classification="genres", lowest_amt=0, highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", PPMIFT.__name__)
        return
    else:
        print("Running task", PPMIFT.__name__)
    if limit_entities is False:
        classification = "all"
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(readPPMI(name, data_type, lowest_amt, highest_amt, classification))
    dt.write2dArray(frq, pavPPMI_fn)
    return frq
def binaryClusterTerm(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = dt.import1dArray("../data/movies/bow/binary/phrases/class-" + cn, "i")
        all_cluster_output.append(binary)
    dt.write2dArray(all_cluster_output, "../data/movies/finetune/" + fn + "ClusterTerm.txt")
def PPMI(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = dt.import1dArray("../data/movies/bow/ppmi/class-class-" + cn, "f")
        all_cluster_output.append(binary)
    dt.write2dArray(all_cluster_output, "../data/movies/finetune/" + fn + "PPMI.txt")
def fixCutoffFormatting(cutoff_fn, file_name):
    cutoff = dt.import1dArray(cutoff_fn)
    # Split each line into values and strip the punctuation from each one
    for c in range(len(cutoff)):
        cutoff[c] = cutoff[c].split()
        for i in range(len(cutoff[c])):
            cutoff[c][i] = int(dt.stripPunctuation(cutoff[c][i]))
    dt.write2dArray(cutoff, "../data/movies/rules/cutoff/" + file_name + ".txt")
def convertToTfIDF(data_type, lowest_count, highest_count, freq_arrays_fn, class_type):
    freq = np.asarray(dt.import2dArray(freq_arrays_fn))
    v = TfidfTransformer()
    x = v.fit_transform(freq)
    x = x.toarray()
    tfidf_fn = "../data/" + data_type + "/bow/tfidf/class-all-" + str(lowest_count) + "-" + \
               str(highest_count) + "-" + str(class_type)
    names_fn = "../data/" + data_type + "/bow/names/" + str(lowest_count) + "-" + \
               str(highest_count) + "-" + str(class_type) + ".txt"
    dt.write2dArray(x, tfidf_fn)
    dt.writeClassAll(tfidf_fn, names_fn, names_fn, tfidf_fn)
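# A minimal sketch of the TF-IDF conversion above on a toy count matrix, using
# scikit-learn's TfidfTransformer directly; the dt helpers and on-disk paths are omitted.
def _example_tfidf_sketch():
    import numpy as np
    from sklearn.feature_extraction.text import TfidfTransformer

    counts = np.array([[3, 0, 1],
                       [2, 0, 0],
                       [0, 5, 2]])            # rows = documents, columns = term counts
    tfidf = TfidfTransformer().fit_transform(counts).toarray()
    print(tfidf.round(3))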
def pavPPMI(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies", rewrite_files=False,
            limit_entities=False, classification="genres", lowest_amt=0, highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", pavPPMI.__name__)
        return
    else:
        print("Running task", pavPPMI.__name__)
    print("certainly still running that old pavPPMI task, yes sir")
    if limit_entities is False:
        classification = "all"
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(readPPMI(name, data_type, lowest_amt, highest_amt, classification))
    pav_classes = []
    for f in range(len(frq)):
        try:
            print(names[f])
            x = np.asarray(frq[f])
            y = ranking[f]
            # Isotonic (PAV) regression of the cluster ranking against the PPMI values
            ir = IsotonicRegression()
            y_ = ir.fit_transform(x, y)
            pav_classes.append(y_)
            if do_p:
                plot(x, y, y_)
        except ValueError:
            print(names[f], "len ppmi", len(frq[f]), "len ranking", len(ranking[f]))
            exit()
        print(f)
    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
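# A small standalone sketch of the isotonic-regression (PAV) fit used by pavPPMI above,
# on toy data; scikit-learn's IsotonicRegression is the same estimator the function relies on.
def _example_pav_sketch():
    import numpy as np
    from sklearn.isotonic import IsotonicRegression

    x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])   # e.g. PPMI values for one cluster term
    y = np.array([0.2, 0.1, 0.5, 0.4, 0.9])   # e.g. ranking scores for the same entities
    y_fit = IsotonicRegression().fit_transform(x, y)
    print(y_fit)                               # non-decreasing approximation of y over x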
def selectCutOffByExplanation(cutoff_fn, cluster_dict_fn, file_name):
    cutoff = dt.import2dArray(cutoff_fn)
    dupe_cutoff = copy.deepcopy(cutoff)
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    cluster_boundary = 2
    # Flatten the cluster dictionary into arrays of [key, value, value, ...]
    cluster_dict_arrays = []
    for key, value in cluster_dict.items():
        cluster_array = [key]
        for v in value:
            cluster_array.append(v)
        cluster_dict_arrays.append(cluster_array)
    explanations = []
    explanation_cutoffs = []
    for c in range(len(cutoff)):
        clusters = []
        for i in range(len(cutoff[c])):
            cluster = []
            # Group together terms whose cutoffs are within the cluster boundary of each other
            for x in range(len(cutoff[c]) - 1, -1, -1):
                if cutoff[c][x] is None or cutoff[c][i] is None:
                    continue
                if abs(cutoff[c][i] - cutoff[c][x]) <= cluster_boundary:
                    cluster.append(cluster_dict_arrays[c][x])
                    cutoff[c][x] = None
                    cluster_dict_arrays[c][x] = None
            if len(cluster) == 0:
                continue
            clusters.append(cluster)
        # Get the maximum similarity word vector value for each cluster, across all clusters
        explained_cutoff = []
        explained_cutoff_value = []
        for cl in range(len(clusters)):
            if len(clusters[cl]) == 0:
                print("Skipped")
                continue
            cluster_explanation, winning_index = webapi.getHighestScore(clusters[cl])
            explained_cutoff.append(cluster_explanation + ",")
            dict_index = 0
            for h in range(len(cluster_dict_arrays[cl])):
                if cluster_dict_arrays[cl][h] == clusters[cl][winning_index]:
                    dict_index = h
            explained_cutoff_value.append(dupe_cutoff[cl][dict_index])
        explanations.append(explained_cutoff)
        explanation_cutoffs.append(explained_cutoff_value)
    dt.write2dArray(explanations, "../data/movies/rules/final_names/" + file_name + "WVN.txt")
    dt.write2dArray(explanation_cutoffs, "../data/movies/rules/final_cutoff/" + file_name + ".txt")
def saveClusters(directions_fn, scores_fn, names_fn, filename, amt_of_dirs, data_type, cluster_amt,
                 rewrite_files=False, algorithm="meanshift_k"):
    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"
    all_fns = [dict_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)
    p_dir = dt.import2dArray(directions_fn)
    p_names = dt.import1dArray(names_fn, "s")
    p_scores = dt.import1dArray(scores_fn, "f")
    # Keep only the top-scoring directions
    ids = np.argsort(p_scores)
    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]
    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)
    unique, counts = np.unique(labels, return_counts=True)
    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])
    # Each cluster's direction is the mean of its member directions
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))
    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")
    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
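# A brief sketch of the clustering step in saveClusters, assuming kMeans() wraps
# scikit-learn's KMeans: directions are grouped by label and each cluster's direction
# is taken as the mean of its members. The toy data here is random.
def _example_cluster_directions_sketch():
    import numpy as np
    from sklearn.cluster import KMeans

    directions = np.random.RandomState(0).rand(20, 5)      # 20 directions in a 5-d space
    labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(directions)
    cluster_directions = [directions[labels == k].mean(axis=0) for k in range(4)]
    print(len(cluster_directions), cluster_directions[0].shape)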
def writeFromMultiClass(multi_class_fn, output_folder, entity_names_fn, data_type, classify_name):
    # Get the entities we have phrases for
    entity_names = dt.import1dArray(entity_names_fn)
    # Import multi classes
    multi_class = dt.import1dArray(multi_class_fn)
    class_names = []
    class_val = []
    highest_class = 0
    for line in multi_class:
        cn, cv = re.split(r'\t+', line)
        cv = int(cv)
        class_names.append(cn)
        class_val.append(cv)
        if cv > highest_class:
            highest_class = cv
    matched_entity_names = list(set(entity_names).intersection(class_names))
    matched_entity_names.sort()
    dt.write1dArray(matched_entity_names,
                    "../data/" + data_type + "/classify/" + classify_name + "/available_entities.txt")
    # Delete the class values of entities that were not matched
    indexes_to_delete = []
    for n in range(len(class_names)):
        found = False
        for en in range(len(matched_entity_names)):
            if class_names[n] == matched_entity_names[en]:
                found = True
                break
        if found is False:
            indexes_to_delete.append(n)
    class_val = np.delete(class_val, indexes_to_delete)
    # One-hot encode the class values
    classes = []
    print("Found " + str(highest_class) + " classes")
    for e in range(len(matched_entity_names)):
        class_a = [0] * highest_class
        class_a[class_val[e] - 1] = 1
        classes.append(class_a)
    dt.write2dArray(classes, "../data/" + data_type + "/classify/" + classify_name + "/class-all")
    print("Wrote class all")
    classes = np.asarray(classes).transpose()
    for cn in range(len(classes)):
        dt.write1dArray(classes[cn], "../data/" + data_type + "/classify/" + classify_name + "/class-" + str(cn))
        print("Wrote", "class-" + str(cn))
def randomAll(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = np.asarray(dt.import1dArray("../data/movies/bow/frequency/phrases/class-" + cn, "f"))
        # Replace every value with a random integer between 0 and the class maximum
        random_binary = []
        for b in binary:
            random_binary.append(randint(0, np.amax(binary)))
        all_cluster_output.append(random_binary)
    dt.write2dArray(all_cluster_output, "../data/movies/finetune/" + fn + "RandomAll.txt")
def trimRankings(rankings_fn, available_indexes_fn, names, folder_name):
    available_indexes = dt.import1dArray(available_indexes_fn)
    rankings = np.asarray(dt.import2dArray(rankings_fn))
    names = dt.import1dArray(names)
    # Keep only the rankings of the available entities
    trimmed_rankings = []
    for r in range(len(rankings)):
        trimmed = rankings[r].take(available_indexes)
        trimmed_rankings.append(trimmed)
    for a in range(len(trimmed_rankings)):
        print("Writing", names[a])
        dt.write1dArray(trimmed_rankings[a], folder_name + "class-" + names[a])
    print("Writing", rankings_fn[-6:])
    dt.write2dArray(trimmed_rankings, folder_name + "class-" + rankings_fn[-6:])
def makeTopVectorsDirections(filename):
    vectors = dt.import2dArray("Directions/" + filename + "Cut.directions")
    top250names = dt.import1dArray("filmdata/top250.txt")
    filmnames = dt.import1dArray("filmdata/filmNames.txt")
    top250vectors = []
    for f in range(len(filmnames)):
        for t in range(len(top250names)):
            if filmnames[f] == top250names[t]:
                top250vectors.append(vectors[t])
    dt.write2dArray(top250vectors, "../data/movies/plot/t250" + filename + ".directions")
def avgPPMI(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies", rewrite_files=False,
            classification="genres", lowest_amt=0, highest_amt=2147000000, limit_entities=False,
            save_results_so_far=False):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if (dt.allFnsAlreadyExist(all_fns) and not rewrite_files) or save_results_so_far:
        print("Skipping task", avgPPMI.__name__)
        return
    else:
        print("Running task", avgPPMI.__name__)
    if limit_entities is False:
        classification = "all"
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import2dArray(cluster_names_fn, "s")
    # Strip trailing colons from the cluster term names
    for n in range(len(names)):
        for x in range(len(names[n])):
            if ":" in names[n][x]:
                names[n][x] = names[n][x][:-1]
    frq = []
    for n in range(len(names)):
        # Average the PPMI values of every term in the cluster, per entity
        name_frq = []
        for name in names[n]:
            name_frq.append(readPPMI(name, data_type, lowest_amt, highest_amt, classification))
        avg_frq = []
        name_frq = np.asarray(name_frq).transpose()
        for name in name_frq:
            avg_frq.append(np.average(name))
        frq.append(np.asarray(avg_frq))
        print(n)
    dt.write2dArray(frq, pavPPMI_fn)
    return frq
def binaryInCluster(cluster_dict_fn, fn):
    cluster = dt.readArrayDict(cluster_dict_fn)
    all_cluster_output = []
    for key, items in cluster.items():
        # OR together the binary vectors of every term in the cluster
        init_binary = dt.import1dArray("../data/movies/bow/binary/phrases/" + key, "i")
        for i in items:
            binary = dt.import1dArray("../data/movies/bow/binary/phrases/" + i, "i")
            for j in range(len(init_binary)):
                if binary[j] == 1:
                    init_binary[j] = 1
        all_cluster_output.append(init_binary)
    dt.write2dArray(all_cluster_output, "../data/movies/finetune/" + fn + "InCluster.txt")
def main(data_type, clf, highest_amt, lowest_amt, depth, rewrite_files):
    min = lowest_amt
    max = highest_amt
    dm_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
            + "-" + clf + "dm"
    dm_shorten_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
                    + "-" + clf + "dmround"
    mds_fn = "../data/" + data_type + "/mds/class-all-" + str(min) + "-" + str(max) \
             + "-" + clf + "d" + str(depth)
    svd_fn = "../data/" + data_type + "/svd/class-all-" + str(min) + "-" + str(max) \
             + "-" + clf + "d" + str(depth)
    pca_fn = "../data/" + data_type + "/pca/class-all-" + str(min) + "-" + str(max) \
             + "-" + clf + "d" + str(depth)
    shorten_fn = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) \
                 + "-" + clf + "round"
    term_frequency_fn = init_vector_path = "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" \
                                           + str(max) + "-" + clf
    if dt.allFnsAlreadyExist([dm_fn, mds_fn, svd_fn, shorten_fn]):
        print("all files exist")
        exit()
    if dt.fileExists(dm_fn) is False:
        newsgroups_train = fetch_20newsgroups(subset='train', shuffle=False)
        newsgroups_test = fetch_20newsgroups(subset='test', shuffle=False)
        vectors = np.concatenate((newsgroups_train.data, newsgroups_test.data), axis=0)
        newsgroups_test = None
        newsgroups_train = None
        # Get sparse tf rep
        tf_vectorizer = CountVectorizer(max_df=highest_amt, min_df=lowest_amt, stop_words='english')
        print("completed vectorizer")
        tf = tf_vectorizer.fit_transform(vectors)
        vectors = None
        # Get sparse PPMI rep from sparse tf rep
        sparse_ppmi = convertPPMISparse(tf)
        print("done ppmi sparse")
        # Get sparse Dsim matrix from sparse PPMI rep
        dm = getDissimilarityMatrixSparse(sparse_ppmi)
        dt.write2dArray(dm, dm_fn)
    else:
        dm = dt.import2dArray(dm_fn)
    print("starting mds")
    # Use as input to mds
    mds = createMDS(dm, depth)
    # save MDS
    dt.write2dArray(mds, mds_fn)
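# A minimal sketch of the MDS step above, assuming createMDS() wraps scikit-learn's MDS
# with a precomputed dissimilarity matrix; the toy matrix here stands in for the angular
# dissimilarities computed from the PPMI representation.
def _example_mds_sketch():
    import numpy as np
    from sklearn.manifold import MDS

    dm = np.array([[0.0, 0.3, 0.8],
                   [0.3, 0.0, 0.6],
                   [0.8, 0.6, 0.0]])           # symmetric dissimilarity matrix
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=0)
    embedding = mds.fit_transform(dm)           # entities x depth coordinates
    print(embedding.shape)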
def bagOfClustersPavPPMI(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies",
                         rewrite_files=False, limit_entities=False, classification="genres",
                         lowest_amt=0, highest_amt=2147000000, sparse_freqs_fn=None, bow_names_fn=None):
    pavPPMI_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", bagOfClustersPavPPMI.__name__)
        return
    else:
        print("Running task", bagOfClustersPavPPMI.__name__)
    if limit_entities is False:
        classification = "all"
    bow_names = dt.import1dArray(bow_names_fn, "s")
    sparse_freqs = dt.import2dArray(sparse_freqs_fn, return_sparse=True)
    ranking = dt.import2dArray(ranking_fn)
    cluster_names = dt.import2dArray(cluster_names_fn, "s")
    frq = getLROnBag(cluster_names, data_type, lowest_amt, highest_amt, classification, file_name,
                     bow_names, sparse_freqs)
    pav_classes = []
    for f in range(len(frq)):
        print(cluster_names[f])
        x = np.asarray(frq[f])
        y = ranking[f]
        # Isotonic (PAV) regression of the cluster ranking against the bag-of-clusters values
        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_p:
            plot(x, y, y_)
        print(f)
    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
def maxNonZero(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = np.asarray(dt.import1dArray("../data/movies/bow/frequency/phrases/class-" + cn, "f"))
        # Replace every non-zero value with the class maximum
        random_binary = []
        for b in binary:
            if b > 0:
                random_binary.append(np.amax(binary))
            else:
                random_binary.append(0)
        all_cluster_output.append(random_binary)
    dt.write2dArray(all_cluster_output, "../data/movies/finetune/" + fn + "MaxNonZero.txt")
def getDissimilarityMatrixSparse(tf):
    tflen = tf.shape[0]
    dm = np.empty([tflen, tflen], dtype="float64")
    pithing = 2 / pi
    norms = np.empty(tflen, dtype="float64")
    # Calculate norms
    for ei in range(tflen):
        norms[ei] = spl.norm(tf[ei])
        print("norm", ei)
    dot_product = np.zeros([tflen, tflen], dtype="float64")
    use_old_dp = True
    if use_old_dp:
        dot_product = dt.import2dArray("dotproduct.temp")
    else:
        # Calculate dot products (the matrix is symmetric, so reuse the transposed entry)
        for ei in range(tflen):
            for ej in range(tflen):
                if dot_product[ej][ei] != 0:
                    dot_product[ei][ej] = dot_product[ej][ei]
                    continue
                dot_product[ei][ej] = tf[ei].dot(tf[ej].T)[0, 0]
            print("dp", ei)
        dt.write2dArray(dot_product, "dotproduct.temp")
    norm_multiplied = np.empty([tflen, tflen], dtype="float64")
    # Multiply the norms pairwise
    for ei in range(tflen):
        for ej in range(tflen):
            norm_multiplied[ei][ej] = norms[ei] * norms[ej]
        print("norms", ei)
    norm_multiplied = dt.shortenFloatsNoFn(norm_multiplied)
    dot_product = dt.shortenFloatsNoFn(dot_product)
    # Get angular differences: (2/pi) * arccos(cosine similarity)
    for ei in range(tflen):
        for ej in range(tflen):
            ang = pithing * np.arccos(dot_product[ei][ej] / norm_multiplied[ei][ej])
            dm[ei][ej] = ang
        print(ei)
    return dm
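# A vectorized sketch of the same angular dissimilarity computation, assuming a dense
# matrix small enough to hold in memory: dm[i, j] = (2/pi) * arccos(cos_sim(x_i, x_j)).
def _example_angular_dissimilarity_sketch():
    import numpy as np

    x = np.random.RandomState(0).rand(6, 10)                  # 6 entities, 10 features
    norms = np.linalg.norm(x, axis=1)
    cos_sim = (x @ x.T) / np.outer(norms, norms)
    cos_sim = np.clip(cos_sim, -1.0, 1.0)                      # guard against rounding error
    dm = (2.0 / np.pi) * np.arccos(cos_sim)
    print(dm.shape, np.allclose(np.diag(dm), 0.0, atol=1e-6))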
def main(min, max, data_type, raw_fn, extension, cut_first_line, additional_name, make_individual,
         entity_name_fn, use_all_files, sparse_matrix, word_count_amt, classification):
    getVectors(raw_fn, entity_name_fn, extension, "../data/" + data_type + "/bow/", min, max, cut_first_line,
               get_all, additional_name, make_individual, classification, use_all_files, 1000, data_type,
               sparse_matrix)
    bow = sp.csr_matrix(dt.import2dArray("../data/" + data_type + "/bow/frequency/phrases/class-all-" + str(min)
                                         + "-" + str(max) + "-" + classification))
    dt.write2dArray(convertPPMI(bow),
                    "../data/" + data_type + "/bow/ppmi/class-all-" + str(min) + "-" + str(max) + "-" + classification)
    print("individual from all")
    printIndividualFromAll(data_type, "ppmi", min, max, classification)
    printIndividualFromAll(data_type, "binary/phrases", min, max, classification)
    convertToTfIDF(data_type, min, max,
                   "../data/" + data_type + "/bow/frequency/phrases/class-all-" + str(min) + "-" + str(max) + "-"
                   + classification, classification)
    printIndividualFromAll(data_type, "tfidf", min, max, classification)
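# A dense sketch of the PPMI conversion used above, assuming convertPPMI() follows the
# standard definition ppmi(i, j) = max(0, log(p(i, j) / (p(i) * p(j)))) over a count matrix.
def _example_ppmi_sketch():
    import numpy as np

    counts = np.array([[2.0, 0.0, 1.0],
                       [0.0, 3.0, 1.0]])
    total = counts.sum()
    joint = counts / total                                     # p(i, j)
    row = joint.sum(axis=1, keepdims=True)                     # p(i)
    col = joint.sum(axis=0, keepdims=True)                     # p(j)
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi = np.log(joint / (row * col))
    ppmi = np.maximum(pmi, 0.0)
    ppmi[~np.isfinite(ppmi)] = 0.0                             # zero counts get zero PPMI
    print(ppmi.round(3))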
def getAllPhraseRankings(directions_fn=None, vectors_fn=None, property_names_fn=None, vector_names_fn=None,
                         fn="no filename", percentage_increment=1, scores_fn=None, top_amt=0, discrete=False,
                         data_type="movies", rewrite_files=False):
    rankings_fn_all = "../data/" + data_type + "/rank/numeric/" + fn + "ALL.txt"
    all_fns = [rankings_fn_all]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getAllPhraseRankings")
        return
    else:
        print("Running task", "getAllPhraseRankings")
    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    property_names = dt.import1dArray(property_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)
    # Optionally keep only the top-scoring directions
    if top_amt != 0:
        scores = dt.import1dArray(scores_fn, "f")
        directions = dt.sortByReverseArray(directions, scores)[:top_amt]
        property_names = dt.sortByReverseArray(property_names, scores)[:top_amt]
    rankings = getRankings(directions, vectors, property_names, vector_names)
    if discrete:
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    for a in range(len(rankings)):
        rankings[a] = np.around(rankings[a], decimals=4)
    #dt.write1dArray(property_names, "../data/movies/bow/names/top5kof17k.txt")
    dt.write2dArray(rankings, rankings_fn_all)
def getCutOff(cluster_dict_fn, rankings_fn, file_name):
    cluster_dict = dt.readArrayDict(cluster_dict_fn)
    rankings = dt.importDiscreteVectors(rankings_fn)
    for r in rankings:
        for a in range(len(r)):
            r[a] = int(r[a][:-1])
    cutoff_clusters = []
    counter = 0
    for key, value in cluster_dict.items():
        value.insert(0, key)
        cutoffs = []
        for v in value:
            # Find the ranking cutoff that best reproduces the term's class,
            # measured by Cohen's kappa
            max_score = 0
            cutoff = 0
            for i in range(1, 101):
                y_pred = []
                for ve in range(len(rankings[counter])):
                    rank = rankings[counter][ve]
                    if rank > i:
                        y_pred.append(0)
                    else:
                        y_pred.append(1)
                y_test = dt.import2dArray("../data/movies/bow/frequency/phrases/class-" + v, "s")
                score = cohen_kappa_score(y_test, y_pred)
                print(v, int(i), "Score", score)
                if score > max_score:
                    max_score = score
                    cutoff = i
            cutoffs.append(cutoff)
            print("Cutoff for", v, "On", key, "Was", str(cutoff))
        cutoff_clusters.append(cutoffs)
        counter += 1
    dt.write2dArray(cutoff_clusters, "../data/movies/rules/cutoff/" + file_name + ".txt")
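# A small sketch of the cutoff search in getCutOff, using scikit-learn's cohen_kappa_score
# on toy data: every candidate cutoff turns the rankings into binary predictions, and the
# cutoff with the highest kappa against the true binary class is kept.
def _example_kappa_cutoff_sketch():
    import numpy as np
    from sklearn.metrics import cohen_kappa_score

    rankings = np.array([5, 20, 35, 60, 80, 95])               # percentile-style ranks
    y_true = np.array([1, 1, 1, 0, 0, 0])                      # whether the term applies
    best_cutoff, best_score = 0, -1.0
    for i in range(1, 101):
        y_pred = (rankings <= i).astype(int)                    # low rank -> predicted positive
        score = cohen_kappa_score(y_true, y_pred)
        if score > best_score:
            best_cutoff, best_score = i, score
    print(best_cutoff, round(best_score, 3))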