import os
import time

import numpy as np
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

# import1dArray, import2dArray, write1dArray, write2dArray and fileExists
# are this repo's own I/O helpers, defined elsewhere.


def getNamesFromDict(dict_fn, file_name):
    # Take the first entry of each row in the dictionary file, strip it,
    # and save the resulting names to the hierarchy-names folder.
    new_dict = import2dArray(dict_fn, "s")
    names = [row[0].strip() for row in new_dict]
    write1dArray(names, "../data/movies/cluster/hierarchy_names/" + file_name + ".txt")

def compileSVMResults(file_name, chunk_amt, data_type):
    # Merge the per-chunk SVM direction and kappa files into single combined
    # files, polling until every chunk has been written to disk.
    if not fileExists("../data/" + data_type + "/svm/directions/" + file_name + ".txt"):
        print("Compiling SVM results")
        directions = []
        kappa = []
        for c in range(chunk_amt):
            chunk_suffix = " CID" + str(c) + " CAMT" + str(chunk_amt) + ".txt"
            directions.append("../data/" + data_type + "/svm/directions/" + file_name + chunk_suffix)
            kappa.append("../data/" + data_type + "/svm/kappa/" + file_name + chunk_suffix)
        for f in directions:
            while not fileExists(f):
                time.sleep(10)
        # Extra grace period in case the final chunk is still being written
        time.sleep(10)
        di = []
        for d in directions:
            di.extend(import2dArray(d))
        ka = []
        for k in kappa:
            ka.extend(import1dArray(k))
        write2dArray(di, "../data/" + data_type + "/svm/directions/" + file_name + ".txt")
        write1dArray(ka, "../data/" + data_type + "/svm/kappa/" + file_name + ".txt")
    else:
        print("Skipping compile")

def getTop10Clusters(file_name, ids):
    # For each entity id, print the names of the clusters in which it is
    # highly ranked (discrete rank <= 3).
    clusters = np.asarray(import2dArray("../data/movies/rank/discrete/" + file_name + "P1.txt", "s")).transpose()
    cluster_names = import2dArray("../data/movies/cluster/hierarchy_names/" + file_name + "0.8400.txt", "s")
    for c in range(len(cluster_names)):
        cluster_names[c] = cluster_names[c][0]
    for i in ids:
        for v in range(len(clusters[i])):
            rank = int(clusters[i][v][:-1])  # drop the trailing non-numeric character
            if rank <= 3:
                print(cluster_names[v][6:])  # skip the 6-character name prefix
        print("----------------------")

def convertToPPMI(freq_arrays_fn, term_names_fn):
    # Convert a term x entity frequency matrix to PPMI scores:
    # ppmi(t, e) = max(0, log(p(t, e) / (p(t) * p(e)))).
    freq_arrays = np.asarray(import2dArray(freq_arrays_fn, "i"))
    term_names = import1dArray(term_names_fn)
    ppmi_arrays = []
    overall = float(np.sum(freq_arrays))
    # Cache p(e) per entity so each column sum is computed at most once
    entity_p = [0.0] * len(freq_arrays[0])
    for t in range(len(freq_arrays)):
        ppmi_array = []
        term_p = np.sum(freq_arrays[t, :]) / overall
        for e in range(len(freq_arrays[t])):
            ppmi = 0.0
            freq = freq_arrays[t][e]
            if freq != 0:
                freq_p = freq / overall
                if entity_p[e] == 0:
                    entity_p[e] = np.sum(freq_arrays[:, e]) / overall
                ppmi = max(0.0, np.log(freq_p / (entity_p[e] * term_p)))
            ppmi_array.append(ppmi)
        ppmi_arrays.append(ppmi_array)
        write1dArray(ppmi_array, "../data/movies/bow/ppmi/class-" + term_names[t])
    write2dArray(ppmi_arrays, "../data/movies/bow/ppmi/class-all")

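# A minimal, self-contained sanity check of the PPMI formula used above,
# kept free of file I/O (the 2x2 toy matrix here is made up for illustration):
def _ppmiToyExample():
    freq = np.array([[2.0, 0.0], [1.0, 1.0]])  # terms x entities
    overall = freq.sum()
    term_p = freq.sum(axis=1) / overall    # p(t)
    entity_p = freq.sum(axis=0) / overall  # p(e)
    joint_p = freq / overall               # p(t, e)
    with np.errstate(divide="ignore"):
        ppmi = np.maximum(0.0, np.log(joint_p / np.outer(term_p, entity_p)))
    # ppmi[0][0] == log(4/3) ~= 0.288; cells with zero frequency stay 0.0
    print(ppmi)
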
def getWordVectors(vector_save_fn, words_fn, wvn, wv_amt, svm_dir_fn=None):
    # Look up a GloVe vector for each word, optionally concatenating the
    # word's SVM direction; out-of-vocabulary words get zero vectors.
    if not os.path.exists(vector_save_fn):
        glove_file = datapath('/home/tom/Downloads/glove.6B/glove.6B.' + str(wv_amt) + 'd.txt')
        tmp_file = get_tmpfile("/home/tom/Downloads/glove.6B/test_word2vec.txt")
        glove2word2vec(glove_file, tmp_file)
        all_vectors = KeyedVectors.load_word2vec_format(tmp_file)
        svm_dir = None
        if svm_dir_fn is not None:  # only load SVM directions when a file is given
            svm_dir = import2dArray(svm_dir_fn)
        vectors = []
        words = import1dArray(words_fn)
        for w in range(len(words)):
            try:
                if svm_dir_fn is None:
                    vectors.append(all_vectors.get_vector(words[w]))
                else:
                    vectors.append(np.concatenate([all_vectors.get_vector(words[w]), svm_dir[w]]))
            except KeyError:
                if svm_dir_fn is None:
                    vectors.append(np.zeros(wv_amt))
                else:
                    vectors.append(np.zeros(wv_amt + len(svm_dir[0])))
        write2dArray(vectors, vector_save_fn)
        write1dArray(words, wvn)
    else:
        print("Already got word vectors", vector_save_fn)

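# Hypothetical usage of getWordVectors; every path and the 200-dimension
# setting below are illustrative, not files that ship with this repo:
# getWordVectors("../data/movies/svm/wv/films200.txt",
#                "../data/movies/bow/phrase_names.txt",
#                "../data/movies/svm/wv/films200_words.txt",
#                200,
#                svm_dir_fn="../data/movies/svm/directions/films200.txt")
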
def countClassFrequences(data_type, class_name):
    # Count how many entities have a non-zero label for each class.
    class_all = import2dArray("../data/" + data_type + "/classify/" + class_name + "/class-all")
    class_names = import1dArray("../data/" + data_type + "/classify/" + class_name + "/names.txt")
    class_all = np.asarray(class_all).transpose()
    counts = []
    for i in range(len(class_all)):
        count = len(np.nonzero(class_all[i])[0])
        print(class_names[i], count)
        counts.append(count)
    return counts

def writeClassAll(class_fn, full_phrases_fn, phrases_used_fn, file_name):
    # Keep only the ppmi rows whose phrase appears in the used-phrases list,
    # preserving the order of the full phrase list.
    full_phrases = import1dArray(full_phrases_fn)
    ppmi = import2dArray(class_fn)
    phrases_used = set(import1dArray(phrases_used_fn))  # set lookup instead of a nested loop
    new_ppmi = []
    for p in range(len(full_phrases)):
        if full_phrases[p] in phrases_used:
            new_ppmi.append(ppmi[p])
    write2dArray(new_ppmi, file_name)

def averageWordVectors(id2word, ppmi_fn, size, data_type):
    # Represent each document as the PPMI-weighted average of the GloVe
    # vectors of its words.
    bow = import2dArray(ppmi_fn)
    if len(bow[0]) != len(id2word.keys()):
        print("vocab and bow don't match", len(bow[0]), len(id2word.keys()))
        exit()
    print("Importing word vectors")
    glove_file = datapath("D:/Dropbox/PhD/My Work/Code/Paper 2/data/raw/glove/glove.6B." + str(size) + "d.txt")
    tmp_file = get_tmpfile("D:/Dropbox/PhD/My Work/Code/Paper 2/data/raw/glove/test_word2vec.txt")
    glove2word2vec(glove_file, tmp_file)
    all_vectors = KeyedVectors.load_word2vec_format(tmp_file)
    print("Creating vectors")
    vectors = []
    for i, doc in enumerate(bow):
        to_average = []
        for w in range(len(doc)):
            if doc[w] > 0:
                try:
                    # Weight each word vector by the word's PPMI score
                    to_average.append(np.multiply(all_vectors.get_vector(id2word[w]), doc[w]))
                except KeyError:
                    print("keyerror", id2word[w])
        if len(to_average) == 0:
            # No in-vocabulary words: fall back to a single zero vector
            to_average = [np.zeros(shape=size)]
            print("FAILED", i, "words:", len(to_average), "dim", len(to_average[0]))
        else:
            print(i, "words:", len(to_average), "dim", len(to_average[0]))
        vectors.append(np.average(to_average, axis=0))
    np.save("../data/" + data_type + "/nnet/spaces/wvPPMIFIXED" + str(size) + ".npy", vectors)

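# Toy illustration of the PPMI-weighted averaging step above, independent of
# GloVe (the two 2-d word vectors and their weights are made up):
def _weightedAverageExample():
    word_vectors = {"cat": np.array([1.0, 0.0]), "dog": np.array([0.0, 1.0])}
    weights = {"cat": 2.0, "dog": 1.0}  # e.g. PPMI scores for one document
    weighted = [vec * weights[w] for w, vec in word_vectors.items()]
    print(np.average(weighted, axis=0))  # -> [1.0, 0.5]
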
def averageWordVectorsFreq(id2word, freq_fn, size, data_type):
    # Represent each document as the unweighted average of the GloVe vectors
    # of the words that occur in it.
    glove_file = datapath("D:/Downloads/Work/glove.6B/glove.6B." + str(size) + "d.txt")
    tmp_file = get_tmpfile("D:/Downloads/Work/glove.6B/test_word2vec.txt")
    bow = np.asarray(import2dArray(freq_fn, "i"))  # ensure an ndarray so transpose works
    print("Transposing frequency BOW")
    bow = bow.transpose()
    if len(bow[0]) != len(id2word.keys()):
        print("vocab and bow don't match", len(bow[0]), len(id2word.keys()))
        exit()
    print("Importing word vectors")
    glove2word2vec(glove_file, tmp_file)
    all_vectors = KeyedVectors.load_word2vec_format(tmp_file)
    print("Creating vectors")
    vectors = []
    for i, doc in enumerate(bow):
        to_average = []
        for w in range(len(doc)):
            if doc[w] > 0:
                try:
                    to_average.append(all_vectors.get_vector(id2word[w]))
                except KeyError:
                    print("keyerror", id2word[w])
        if len(to_average) == 0:
            # No in-vocabulary words: fall back to a single zero vector
            to_average = [np.zeros(shape=size)]
            print("FAILED", i, "words:", len(to_average), "dim", len(to_average[0]))
        else:
            print(i, "words:", len(to_average), "dim", len(to_average[0]))
        vectors.append(np.average(to_average, axis=0))
    np.save("../data/" + data_type + "/nnet/spaces/wvFIXED" + str(size) + ".npy", vectors)

def getNonZero(class_names_fn, file_name):
    # Print each class name with its number of non-zero entries.
    class_names = import1dArray(class_names_fn, "s")
    class_all = np.asarray(import2dArray(file_name)).transpose()
    for c in range(len(class_all)):
        print(class_names[c], np.count_nonzero(class_all[c]))

def writeIndividualClasses(overall_class_fn, names_fn, output_filename):
    # Split an all-classes matrix into one file per class.
    overall_class = import2dArray(overall_class_fn, "f")
    names = import1dArray(names_fn)
    for n in range(len(names)):
        write1dArray(overall_class[n], output_filename + "class-" + names[n])
        print(names[n])

def removeIndexes(file_name, indexes, type="f"):
    # Delete the given row indexes and save the result alongside the original.
    orig_array = import2dArray(file_name, type)
    remaining = np.delete(orig_array, indexes, axis=0)
    write2dArray(remaining, file_name[:-4] + "removedind.txt")

def shorten2dFloats(floats_fn):
    fa = import2dArray(floats_fn)
    for a in range(len(fa)):
        fa[a] = np.around(fa[a], decimals=4)
    return fa