def match_entities(entity_fn, t_entity_fn, entities_fn, classification):
    # Match the entity names in t_entity_fn against the full entity list and
    # write out only the matched entity vectors.
    names = dt.import1dArray(entity_fn)
    t_names = dt.import1dArray(t_entity_fn)
    entities = dt.import2dArray(entities_fn)
    amount_found = 0
    # Normalize the names on both sides before comparing
    for n in range(len(names)):
        names[n] = dt.removeEverythingFromString(names[n])
    for n in range(len(t_names)):
        t_names[n] = dt.removeEverythingFromString(t_names[n])
    matched_ids = []
    for n in range(len(t_names)):
        for ni in range(len(names)):
            if t_names[n] == names[ni]:
                print(t_names[n])
                matched_ids.append(ni)
                amount_found += 1
                break
    matched_entities = []
    for e in matched_ids:
        matched_entities.append(entities[e])
    print("Amount found", amount_found)
    dt.write2dArray(matched_entities, entities_fn[:len(entities_fn) - 4] + "-" + classification + ".txt")
def plotClusters(filename):
    names = dt.import1dArray("Plots/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("Plots/Top174" + filename + ".space")
    cluster_names = dt.import1dArray("Clusters/films100N0.6H25L3CutLeastSimilarHIGH0.75,0.67.names")
    #svd = TruncatedSVD(n_components=2, random_state=42)
    cx = 8
    cy = 9
    x = []
    y = []
    for s in space[cx]:
        x.append(s)
    for s in space[cy]:
        y.append(s)
    #svd_space = svd.fit_transform(space)
    fig, ax = plt.subplots()
    ax.scatter(x, y, picker=True)
    #for i, name in enumerate(found_names):
    #    ax.annotate(name, (x[i], y[i]))
    ax.set_xlabel(cluster_names[cx])
    ax.set_ylabel(cluster_names[cy])

    def onpick3(event):
        ind = event.ind
        print('onpick3 scatter:', names[ind[0]])

    fig.canvas.mpl_connect('pick_event', onpick3)
    plt.show()
def makeTopVectors(filename):
    vectors = dt.import2dArray("Rankings/" + filename + ".space")
    top250names = dt.import1dArray("filmdata/top250.txt")
    film_names = dt.import1dArray("filmdata/filmNames.txt")
    indexes = []
    ordered_names = []
    for f in range(len(film_names)):
        for t in top250names:
            if film_names[f] == t:
                indexes.append(f)
                ordered_names.append(t)
    top_vectors = []
    for v in range(len(vectors)):
        top_vectors.append([])
        for i in range(len(vectors[v])):
            for id in indexes:
                if i == id:
                    top_vectors[v].append(vectors[v][i])
    dt.write2dArray(top_vectors, "Plots/Top174" + filename + ".space")
    dt.write1dArray(ordered_names, "Plots/Top174OrderedByOriginalList.txt")
def binaryClusterTerm(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = dt.import1dArray("../data/movies/bow/binary/phrases/class-" + cn, "i")
        all_cluster_output.append(binary)
    dt.write2dArray(all_cluster_output, "../data/movies/finetune/" + fn + "ClusterTerm.txt")
def PPMI(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        # PPMI values are floats, one class file per term
        ppmi = dt.import1dArray("../data/movies/bow/ppmi/class-class-" + cn, "f")
        all_cluster_output.append(ppmi)
    dt.write2dArray(all_cluster_output, "../data/movies/finetune/" + fn + "PPMI.txt")
def normalizedTermFrequency(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        freq = dt.import1dArray("../data/movies/bow/frequency/phrases/class-" + cn, "i")
        all_cluster_output.append(freq)
    # Scale each frequency vector to unit length and write the result
    dt.scaleSpaceUnitVector(all_cluster_output, "../data/movies/finetune/" + fn + "NormalizedTermFrequency.txt")
def convertEntityNamesToIDS(ID_fn, all_names_fn, individual_names_fn, output_fn):
    ids = dt.import1dArray(ID_fn)
    all_names = dt.import1dArray(all_names_fn)
    individual_names = dt.import1dArray(individual_names_fn)
    # Collect the indexes of every name that also appears in the individual list
    indexes = []
    for n in range(len(all_names)):
        for name in individual_names:
            if all_names[n] == name:
                indexes.append(n)
    dt.write1dArray(np.asarray(ids)[indexes], output_fn)
def saveClusters(directions_fn, scores_fn, names_fn, filename, amt_of_dirs, data_type, cluster_amt,
                 rewrite_files=False, algorithm="meanshift_k"):
    dict_fn = "../data/" + data_type + "/cluster/dict/" + filename + ".txt"
    cluster_directions_fn = "../data/" + data_type + "/cluster/clusters/" + filename + ".txt"
    all_fns = [dict_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", saveClusters.__name__)
        return
    else:
        print("Running task", saveClusters.__name__)

    p_dir = np.asarray(dt.import2dArray(directions_fn))
    p_names = np.asarray(dt.import1dArray(names_fn, "s"))
    p_scores = np.asarray(dt.import1dArray(scores_fn, "f"))

    # Keep only the top amt_of_dirs directions, ordered by score (highest first)
    ids = np.argsort(p_scores)
    p_dir = np.flipud(p_dir[ids])[:amt_of_dirs]
    p_names = np.flipud(p_names[ids])[:amt_of_dirs]

    if algorithm == "meanshift":
        labels = meanShift(p_dir)
    else:
        labels = kMeans(p_dir, cluster_amt)

    unique, counts = np.unique(labels, return_counts=True)
    clusters = []
    dir_clusters = []
    for i in range(len(unique)):
        clusters.append([])
        dir_clusters.append([])
    for i in range(len(labels)):
        clusters[labels[i]].append(p_names[i])
        dir_clusters[labels[i]].append(p_dir[i])

    # Each cluster direction is the mean of the directions assigned to it
    cluster_directions = []
    for l in range(len(dir_clusters)):
        cluster_directions.append(dt.mean_of_array(dir_clusters[l]))

    print("------------------------")
    for c in clusters:
        print(c)
    print("------------------------")
    dt.write2dArray(clusters, dict_fn)
    dt.write2dArray(cluster_directions, cluster_directions_fn)
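# Example usage, a minimal sketch only: the paths and parameter values below are
# hypothetical and assume direction, score, and name files produced by an earlier step.
# saveClusters("../data/movies/svm/directions/films100.txt",
#              "../data/movies/svm/kappa/films100.txt",
#              "../data/movies/bow/names/films100.txt",
#              "films100-kmeans", 200, "movies", 20, algorithm="kmeans")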
def writeFromMultiClass(multi_class_fn, output_folder, entity_names_fn, data_type, classify_name):
    # Get the entities we have phrases for
    entity_names = dt.import1dArray(entity_names_fn)

    # Import multi-class labels, one "name<tab>class" pair per line
    multi_class = dt.import1dArray(multi_class_fn)
    class_names = []
    class_val = []
    highest_class = 0
    for line in multi_class:
        cn, cv = re.split(r'\t+', line)
        cv = int(cv)
        class_names.append(cn)
        class_val.append(cv)
        if cv > highest_class:
            highest_class = cv

    matched_entity_names = list(set(entity_names).intersection(class_names))
    matched_entity_names.sort()
    dt.write1dArray(matched_entity_names,
                    "../data/" + data_type + "/classify/" + classify_name + "/available_entities.txt")

    # Delete the class values of entities that we have no phrases for
    indexes_to_delete = []
    for n in range(len(class_names)):
        found = False
        for en in range(len(matched_entity_names)):
            if class_names[n] == matched_entity_names[en]:
                found = True
                break
        if found is False:
            indexes_to_delete.append(n)
    class_val = np.delete(class_val, indexes_to_delete)

    # Convert the integer class values to one-hot vectors
    classes = []
    print("Found " + str(highest_class) + " classes")
    for e in range(len(matched_entity_names)):
        class_a = [0] * highest_class
        class_a[class_val[e] - 1] = 1
        classes.append(class_a)
    dt.write2dArray(classes, "../data/" + data_type + "/classify/" + classify_name + "/class-all")
    print("Wrote class all")

    classes = np.asarray(classes).transpose()
    for cn in range(len(classes)):
        dt.write1dArray(classes[cn], "../data/" + data_type + "/classify/" + classify_name + "/class-" + str(cn))
        print("Wrote", "class-" + str(cn))
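# Sketch of the expected multi_class_fn line format (hypothetical values): each line pairs
# an entity name with an integer class index, separated by tabs, e.g.
#   "The Godfather\t3"
# which becomes the one-hot row [0, 0, 1, 0, ...] in class-all.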
def trimRankings(rankings_fn, available_indexes_fn, names, folder_name):
    # Indexes are read as integers so they can be used with numpy's take()
    available_indexes = dt.import1dArray(available_indexes_fn, "i")
    rankings = np.asarray(dt.import2dArray(rankings_fn))
    names = dt.import1dArray(names)
    trimmed_rankings = []
    for r in range(len(rankings)):
        trimmed = rankings[r].take(available_indexes)
        trimmed_rankings.append(trimmed)
    for a in range(len(trimmed_rankings)):
        print("Writing", names[a])
        dt.write1dArray(trimmed_rankings[a], folder_name + "class-" + names[a])
    print("Writing", rankings_fn[-6:])
    dt.write2dArray(trimmed_rankings, folder_name + "class-" + rankings_fn[-6:])
def randomAll(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        binary = np.asarray(dt.import1dArray("../data/movies/bow/frequency/phrases/class-" + cn, "f"))
        random_binary = []
        for b in binary:
            random_binary.append(randint(0, np.amax(binary)))
        all_cluster_output.append(random_binary)
    dt.write2dArray(all_cluster_output, "../data/movies/finetune/" + fn + "RandomAll.txt")
def makeTopVectorsDirections(filename):
    vectors = dt.import2dArray("Directions/" + filename + "Cut.directions")
    top250names = dt.import1dArray("filmdata/top250.txt")
    filmnames = dt.import1dArray("filmdata/filmNames.txt")
    top250vectors = []
    for f in range(len(filmnames)):
        for t in range(len(top250names)):
            if filmnames[f] == top250names[t]:
                # Index by film position (f), mirroring makeTopVectors
                top250vectors.append(vectors[f])
    dt.write2dArray(top250vectors, "../data/movies/plot/t250" + filename + ".directions")
def binaryInCluster(cluster_dict_fn, fn):
    cluster = dt.readArrayDict(cluster_dict_fn)
    all_cluster_output = []
    for key, items in cluster.items():
        # Start from the binary vector of the cluster's head term and OR in every member term
        init_binary = dt.import1dArray("../data/movies/bow/binary/phrases/" + key, "i")
        for i in items:
            binary = dt.import1dArray("../data/movies/bow/binary/phrases/" + i, "i")
            for j in range(len(init_binary)):
                if binary[j] == 1:
                    init_binary[j] = 1
        all_cluster_output.append(init_binary)
    dt.write2dArray(all_cluster_output, "../data/movies/finetune/" + fn + "InCluster.txt")
def logisticRegression(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies",
                       rewrite_files=False, limit_entities=False, classification="genres",
                       lowest_amt=0, highest_amt=2147000000, sparse_freqs_fn=None, bow_names_fn=None):
    lr_fn = "../data/" + data_type + "/finetune/boc/" + file_name + ".txt"
    all_fns = [lr_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", logisticRegression.__name__)
        return
    else:
        print("Running task", logisticRegression.__name__)

    if limit_entities is False:
        classification = "all"

    cluster_names = dt.import2dArray(cluster_names_fn, "s")
    bow_names = dt.import1dArray(bow_names_fn, "s")
    sparse_freqs = dt.import2dArray(sparse_freqs_fn, return_sparse=True)

    frq = getLROnBag(cluster_names, data_type, lowest_amt, highest_amt, classification,
                     file_name, bow_names, sparse_freqs)
    dt.write2dArray(frq, lr_fn)
    return frq
def maxNonZero(cluster_names_fn, fn):
    all_cluster_output = []
    cluster_names = dt.import1dArray(cluster_names_fn)
    for cn in cluster_names:
        freq = np.asarray(dt.import1dArray("../data/movies/bow/frequency/phrases/class-" + cn, "f"))
        # Set every non-zero frequency to the class maximum, leave zeros as zero
        max_binary = []
        for b in freq:
            if b > 0:
                max_binary.append(np.amax(freq))
            else:
                max_binary.append(0)
        all_cluster_output.append(max_binary)
    dt.write2dArray(all_cluster_output, "../data/movies/finetune/" + fn + "MaxNonZero.txt")
def plotSVD(filename):
    names = dt.import1dArray("Plots/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("Plots/Top174" + filename + ".space")
    space = np.matrix.transpose(np.asarray(space))
    space = space.tolist()
    svd = TruncatedSVD(n_components=2, random_state=42)
    svd_space = svd.fit_transform(space)
    x = []
    y = []
    for s in svd_space:
        print(s)
        x.append(s[0])
        y.append(s[1])
    fig, ax = plt.subplots()
    ax.scatter(x, y, picker=True)
    #for i, name in enumerate(found_names):
    #    ax.annotate(name, (x[i], y[i]))

    def onpick3(event):
        ind = event.ind
        print('onpick3 scatter:', names[ind[0]])

    fig.canvas.mpl_connect('pick_event', onpick3)
    plt.show()
def pavTermFrequency(ranking_fn, cluster_names_fn, fn, do_p=False):
    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)
    frq = []
    for name in names:
        frq.append(readFreq(name))
    pav_classes = []
    for f in range(len(frq)):
        print(names[f])
        x = np.asarray(frq[f])
        y = ranking[f]
        # Fit an isotonic (PAV) regression of the ranking onto the term frequencies
        ir = IsotonicRegression()
        y_ = ir.fit_transform(x, y)
        pav_classes.append(y_)
        if do_p:
            plot(x, y, y_)
        print(f)
    dt.write2dArray(pav_classes, "../data/movies/finetune/" + fn + "PavTermFrequency.txt")
    return pav_classes
def PPMIFT(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies", rewrite_files=False,
           limit_entities=False, classification="genres", lowest_amt=0, highest_amt=2147000000):
    ppmi_ft_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [ppmi_ft_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", PPMIFT.__name__)
        return
    else:
        print("Running task", PPMIFT.__name__)

    if limit_entities is False:
        classification = "all"

    names = dt.import1dArray(cluster_names_fn)
    frq = []
    for name in names:
        # Use only the first term of the cluster name, dropping the trailing character when a colon is present
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(readPPMI(name, data_type, lowest_amt, highest_amt, classification))
    dt.write2dArray(frq, ppmi_ft_fn)
    return frq
def runLR(self, property_name, y=None):
    if y is None:
        y = dt.import1dArray("../data/" + self.data_type + "/bow/binary/phrases/class-" + property_name +
                             "-" + str(self.lowest_amt) + "-" + str(self.higher_amt) + "-" +
                             self.classification, file_type="i")
    else:
        y = y[0]
    for i in range(len(y)):
        if y[i] >= 1:
            y[i] = 1
    #x_train, y_train = dt.balanceClasses(x_train, y_train)
    clf = linear_model.LogisticRegression(class_weight="balanced", dual=False)
    clf.fit(self.x_train, y)
    direction = clf.coef_.tolist()[0]
    y_pred = clf.predict(self.x_test)
    y_pred = y_pred.tolist()
    f1 = f1_score(y[:len(y_pred)], y_pred)
    kappa_score = cohen_kappa_score(y[:len(y_pred)], y_pred)
    acc = accuracy_score(y[:len(y_pred)], y_pred)
    TP, FP, TN, FN = self.perf_measure(y, y_pred)
    #ppmi_score, ppmi_ratio = get_ppmi_score(y_pred, property_name)
    return kappa_score, f1, direction, acc, 0, TP, FP, TN, FN
def runSVM(self, property_name, y=None):
    if y is None:
        y = dt.import1dArray("../data/" + self.data_type + "/bow/binary/phrases/class-" + property_name +
                             "-" + str(self.lowest_amt) + "-" + str(self.higher_amt) + "-" +
                             self.classification, file_type="i")
    else:
        y = y[0]
    for i in range(len(y)):
        if y[i] >= 1:
            y[i] = 1
    #x_train, y_train = dt.balanceClasses(x_train, y_train)
    clf = svm.LinearSVC(class_weight="balanced", dual=False)
    clf.fit(self.x_train, y)
    direction = clf.coef_.tolist()[0]
    y_pred = clf.predict(self.x_test)
    y_pred = y_pred.tolist()
    f1 = f1_score(y[:len(y_pred)], y_pred)
    kappa_score = cohen_kappa_score(y[:len(y_pred)], y_pred)
    acc = accuracy_score(y[:len(y_pred)], y_pred)
    TP, FP, TN, FN = self.perf_measure(y, y_pred)
    print("TP", TP, "FP", FP, "TN", TN, "FN", FN)
    return kappa_score, f1, direction, acc, 0, TP, FP, TN, FN
def getAllRankings(directions_fn, vectors_fn, cluster_names_fn, vector_names_fn, percent,
                   percentage_increment, by_vector, fn, discrete=True, data_type="movies",
                   rewrite_files=False):
    labels_fn = "../data/" + data_type + "/rank/labels/" + fn + ".txt"
    rankings_fn = "../data/" + data_type + "/rank/numeric/" + fn + ".txt"
    discrete_labels_fn = "../data/" + data_type + "/rank/discrete/" + fn + ".txt"

    all_fns = [rankings_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        for f in all_fns:
            print(f, "Already exists")
        print("Skipping task", "getAllRankings")
        return
    else:
        print("Running task", "getAllRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    cluster_names = dt.import1dArray(cluster_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)

    # Rank every entity along every cluster direction
    rankings = getRankings(directions, vectors, cluster_names, vector_names)
    rankings = np.asarray(rankings)

    if discrete:
        labels = createLabels(rankings, percent)
        labels = np.asarray(labels)
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)

    if by_vector:
        rankings = rankings.transpose()
        if discrete:
            labels = labels.transpose()
            discrete_labels = discrete_labels.transpose()

    dt.write2dArray(rankings, rankings_fn)
    if discrete:
        dt.write2dArray(labels, labels_fn)
        dt.write2dArray(discrete_labels, discrete_labels_fn)
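# Example usage, a minimal sketch with hypothetical paths: "percent" controls the cutoff
# used by createLabels and "percentage_increment" the bin width of the discrete labels.
# getAllRankings("../data/movies/cluster/clusters/films100.txt",
#                "../data/movies/nnet/spaces/films100.txt",
#                "../data/movies/cluster/dict/films100.txt",
#                "../data/movies/nnet/spaces/entitynames.txt",
#                percent=0.02, percentage_increment=1, by_vector=False,
#                fn="films100-rankings", data_type="movies")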
def fixCutoffFormatting(cutoff_fn, file_name):
    cutoff = dt.import1dArray(cutoff_fn)
    # Convert each line of cutoff values into a list of integers
    for c in range(len(cutoff)):
        cutoff[c] = cutoff[c].split()
        for i in range(len(cutoff[c])):
            cutoff[c][i] = int(dt.stripPunctuation(cutoff[c][i]))
    dt.write2dArray(cutoff, "../data/movies/rules/cutoff/" + file_name + ".txt")
def main(data_type, vector_size, window_size, min_count, sampling_threshold, negative_size, train_epoch,
         dm, worker_count, train_wv, concatenate_wv, use_hierarchical_softmax):
    # Build a file name that encodes the Doc2Vec hyper-parameters
    file_name = "Doc2Vec" + " VS" + str(vector_size) + " WS" + str(window_size) + " MC" + str(min_count) + \
                " ST" + str(sampling_threshold) + " NS" + str(negative_size) + " TE" + str(train_epoch) + \
                " DM" + str(dm) + " WC" + str(worker_count) + "spacy" + \
                " TW" + str(train_wv) + " CW" + str(concatenate_wv) + " HS" + str(use_hierarchical_softmax)

    corpus_fn = "../data/raw/" + data_type + "/corpus_processed.txt"
    if os.path.exists(corpus_fn) is False:
        # Join the pre-tokenized train and test splits back into plain-text documents
        x_train = np.load("../data/raw/" + data_type + "/x_train_w.npy")
        x_test = np.load("../data/raw/" + data_type + "/x_test_w.npy")
        corpus = np.concatenate((x_train, x_test), axis=0)
        text_corpus = np.empty(len(corpus), dtype=object)
        for i in range(len(corpus)):
            text_corpus[i] = " ".join(corpus[i])
            print(text_corpus[i])
        dt.write1dArray(text_corpus, corpus_fn)

    embedding_fn = "/home/tom/Downloads/glove.6B/glove.6B.300d.txt"
    model_fn = "../data/" + data_type + "/doc2vec/" + file_name + ".bin"
    vector_fn = "../data/" + data_type + "/nnet/spaces/" + file_name + ".npy"
    score_fn = "../data/" + data_type + "/doc2vec/" + file_name + "catacc.score"

    if os.path.exists(model_fn):
        print("Imported model")
        model = g.utils.SaveLoad.load(model_fn)
    elif file_name[:7] == "Doc2Vec":
        model = doc2Vec(embedding_fn, corpus_fn, vector_size, window_size, min_count, sampling_threshold,
                        negative_size, train_epoch, dm, worker_count, train_wv, concatenate_wv,
                        use_hierarchical_softmax)
        model.save(model_fn)

    if os.path.exists(vector_fn) is False:
        vectors = []
        for d in range(len(model.docvecs)):
            vectors.append(model.docvecs[d])
        np.save(vector_fn, vectors)
    else:
        print("Imported vectors")
        vectors = np.load(vector_fn)

    if os.path.exists(score_fn) is False or file_name[:7] != "Doc2Vec":
        print("Getting score")
        if data_type == "sentiment":
            classes = dt.import1dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = sentiment.getSplits(vectors, classes)
            scores = linearSVMScore(x_train, y_train, x_test, y_test)
        else:
            classes = dt.import2dArray("../data/" + data_type + "/classify/" + data_type + "/class-all", "i")
            x_train, y_train, x_test, y_test = newsgroups.getSplits(vectors, classes)
            scores = multiClassLinearSVM(x_train, y_train, x_test, y_test)
        print(scores)
        dt.write1dArray(scores, score_fn)
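# Example invocation, a sketch only: the hyper-parameter values are hypothetical and assume
# the raw ../data/raw/<data_type>/x_train_w.npy and x_test_w.npy splits already exist.
# main("newsgroups", vector_size=300, window_size=10, min_count=1,
#      sampling_threshold=1e-5, negative_size=5, train_epoch=20, dm=0,
#      worker_count=4, train_wv=1, concatenate_wv=1, use_hierarchical_softmax=1)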
def obtainNDCG(self):
    # For each ranking, obtain the NDCG score compared to the word's PPMI values
    ndcgs = np.empty(len(self.names))
    for n in range(len(self.names)):
        ppmi = np.asarray(dt.import1dArray("../data/" + self.data_type + "/bow/ppmi/class-" + self.names[n] +
                                           "-" + str(self.lowest_amt) + "-" + str(self.highest_amt) + "-" +
                                           str(self.classification), "f"))
        sorted_indices = np.argsort(self.ranks)[::-1]
        score = ndcg.ndcg_from_ranking(ppmi, sorted_indices)
        ndcgs[n] = score
    return ndcgs
def pavPPMI(cluster_names_fn, ranking_fn, file_name, do_p=False, data_type="movies", rewrite_files=False,
            limit_entities=False, classification="genres", lowest_amt=0, highest_amt=2147000000):
    pavPPMI_fn = "../data/" + data_type + "/finetune/" + file_name + ".txt"
    all_fns = [pavPPMI_fn]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", pavPPMI.__name__)
        return
    else:
        print("Running task", pavPPMI.__name__)

    if limit_entities is False:
        classification = "all"

    ranking = dt.import2dArray(ranking_fn)
    names = dt.import1dArray(cluster_names_fn)

    # Collect the PPMI values for the first term of every cluster
    frq = []
    for name in names:
        name = name.split()[0]
        if ":" in name:
            name = name[:-1]
        frq.append(readPPMI(name, data_type, lowest_amt, highest_amt, classification))

    # Fit an isotonic (PAV) regression of each ranking onto the PPMI values
    pav_classes = []
    for f in range(len(frq)):
        try:
            print(names[f])
            x = np.asarray(frq[f])
            y = ranking[f]
            ir = IsotonicRegression()
            y_ = ir.fit_transform(x, y)
            pav_classes.append(y_)
            if do_p:
                plot(x, y, y_)
        except ValueError:
            print(names[f], "len ppmi", len(frq[f]), "len ranking", len(ranking[f]))
            exit()
        print(f)

    dt.write2dArray(pav_classes, pavPPMI_fn)
    return pav_classes
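# Example usage, a minimal sketch with hypothetical file names: the ranking file is expected
# to hold one row of entity scores per cluster, in the same order as the cluster-name file.
# pavPPMI("../data/movies/cluster/dict/films100.txt",
#         "../data/movies/rank/numeric/films100.txt",
#         "films100-pavPPMI", data_type="movies", classification="genres",
#         lowest_amt=50, highest_amt=10000000)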
def getAvailableEntities(entity_names_fns, data_type, classification):
    entity_names = []
    for e in entity_names_fns:
        entity_names.append(dt.import1dArray(e))
    # Use a dict to collect the union of entity names without duplicates
    name_dict = {}
    for entity_name in entity_names:
        for name in entity_name:
            name_dict[name] = 0
    available_entities = []
    for key in name_dict:
        available_entities.append(key)
    dt.write1dArray(available_entities,
                    "../data/" + data_type + "/classify/" + classification + "/available_entities.txt")
def getAllPhraseRankings(directions_fn=None, vectors_fn=None, property_names_fn=None, vector_names_fn=None,
                         fn="no filename", percentage_increment=1, scores_fn=None, top_amt=0,
                         discrete=False, data_type="movies", rewrite_files=False):
    rankings_fn_all = "../data/" + data_type + "/rank/numeric/" + fn + "ALL.txt"

    all_fns = [rankings_fn_all]
    if dt.allFnsAlreadyExist(all_fns) and not rewrite_files:
        print("Skipping task", "getAllPhraseRankings")
        return
    else:
        print("Running task", "getAllPhraseRankings")

    directions = dt.import2dArray(directions_fn)
    vectors = dt.import2dArray(vectors_fn)
    property_names = dt.import1dArray(property_names_fn)
    vector_names = dt.import1dArray(vector_names_fn)

    if top_amt != 0:
        scores = dt.import1dArray(scores_fn, "f")
        directions = dt.sortByReverseArray(directions, scores)[:top_amt]
        property_names = dt.sortByReverseArray(property_names, scores)[:top_amt]

    rankings = getRankings(directions, vectors, property_names, vector_names)
    if discrete:
        discrete_labels = createDiscreteLabels(rankings, percentage_increment)
        discrete_labels = np.asarray(discrete_labels)
    for a in range(len(rankings)):
        rankings[a] = np.around(rankings[a], decimals=4)
    #dt.write1dArray(property_names, "../data/movies/bow/names/top5kof17k.txt")
    dt.write2dArray(rankings, rankings_fn_all)
def plotTopVectors(filename):
    names = dt.import1dArray("../data/movies/plot/Top174OrderedByOriginalList.txt")
    space = dt.import2dArray("../data/movies/plot/Top174" + filename + ".space")
    svd = TruncatedSVD(n_components=2, random_state=42)
    svd_space = svd.fit_transform(space)
    pl.plot(space[0], 'rx')
    pl.show()
def writeBagOfClusters(cluster_dict, data_type, lowest_amt, highest_amt, classification, fn):
    bag_of_clusters = []
    # Note, prior we used the PPMI values directly here somehow...
    loc = "../data/" + data_type + "/bow/frequency/phrases/"
    for c in range(len(cluster_dict)):
        # Remove the colons from the cluster terms
        for f in range(len(cluster_dict[c])):
            if ":" in cluster_dict[c][f]:
                cluster_dict[c][f] = cluster_dict[c][f][:-1]
        # Add all of the frequencies together to make a bag-of-clusters
        p1 = loc + "class-" + cluster_dict[c][0]
        p2 = "-" + str(lowest_amt) + "-" + str(highest_amt) + "-" + classification
        accum_freqs = [0.0] * len(dt.import1dArray(p1 + p2))
        # For all the cluster terms
        for f in cluster_dict[c]:
            if ":" in f:
                f = f[:-1]
            # Import the class
            class_to_add = dt.import1dArray(loc + "class-" + f + "-" + str(lowest_amt) + "-" +
                                            str(highest_amt) + "-" + classification, "f")
            # Add the current class to the accumulated frequencies
            accum_freqs = np.add(accum_freqs, class_to_add)
        # Append this cluster's frequencies to the group of them
        bag_of_clusters.append(accum_freqs)
    # Obtain the PPMI values for these frequencies
    bag_csr = sp.csr_matrix(np.asarray(bag_of_clusters))
    ppmi_csr = mt.convertPPMI(bag_csr)
    dt.write2dArray(ppmi_csr, "../data/" + data_type + "/bow/ppmi/" + fn + ".txt")
    return ppmi_csr
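# Example, a sketch with hypothetical values: each cluster is a list of phrase names
# (possibly with trailing colons) whose per-entity frequencies are summed before PPMI is computed.
# cluster_dict = [["action:", "explosions", "chase"], ["romance:", "love", "wedding"]]
# writeBagOfClusters(cluster_dict, "movies", 50, 10000000, "genres", "films100-boc")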
def obtainKappaOrNDCG(self):
    # For each direction, obtain either the Kappa score against the binary word occurrences
    # or the NDCG score against the PPMI values, depending on its type
    scores = np.empty(len(self.names))
    for n in range(len(self.names)):
        if self.types[n] == 0:
            clf = svm.LinearSVC()
            occ = np.asarray(dt.import1dArray("../data/" + self.data_type + "/bow/binary/phrases/class-" +
                                              self.names[n] + "-" + str(self.lowest_amt) + "-" +
                                              str(self.highest_amt) + "-" + str(self.classification), "f"))
            clf.fit(self.ranks, occ)
            y_pred = clf.predict(self.ranks)
            score = cohen_kappa_score(occ, y_pred)
            scores[n] = score
        else:
            ppmi = np.asarray(dt.import1dArray("../data/" + self.data_type + "/bow/ppmi/class-" +
                                               self.names[n] + "-" + str(self.lowest_amt) + "-" +
                                               str(self.highest_amt) + "-" + str(self.classification), "f"))
            sorted_indices = np.argsort(self.ranks)[::-1]
            score = ndcg.ndcg_from_ranking(ppmi, sorted_indices)
            scores[n] = score
    return scores