def getWordVectors(vector_save_fn, words_fn, wvn, wv_amt, svm_dir_fn=None):
    """Build one GloVe vector per word and save the result, unless already saved.

    If ``vector_save_fn`` already exists, only a notice is printed. Otherwise
    the GloVe text file for ``wv_amt`` dimensions is converted with
    ``glove2word2vec``, loaded with ``KeyedVectors.load_word2vec_format``
    (presumably gensim's), and a vector is collected for every word read
    from ``words_fn``.

    Args:
        vector_save_fn: Output path for the 2-D array of vectors. Its
            existence is what marks the work as already done.
        words_fn: Path of the word list, read with ``import1dArray``.
        wvn: Output path the word list is written back to.
        wv_amt: GloVe dimensionality; selects ``glove.6B.<wv_amt>d.txt`` and
            sizes the zero vector used for unknown words.
        svm_dir_fn: Optional path of a 2-D array read with
            ``import2dArray``. When given, row ``w`` of it is appended to the
            vector of word ``w``.
    """
    if os.path.exists(vector_save_fn):
        print("Already got word vectors", vector_save_fn)
        return

    # NOTE: the GloVe location is hard-coded to one developer's machine.
    glove_file = datapath('/home/tom/Downloads/glove.6B/glove.6B.' +
                          str(wv_amt) + 'd.txt')
    tmp_file = get_tmpfile("/home/tom/Downloads/glove.6B/test_word2vec.txt")
    glove2word2vec(glove_file, tmp_file)

    # Only read the extra directions when a file was actually supplied;
    # previously the default ``None`` was passed straight to the loader.
    svm_dir = None
    if svm_dir_fn is not None:
        svm_dir = import2dArray(svm_dir_fn)

    all_vectors = KeyedVectors.load_word2vec_format(tmp_file)
    words = import1dArray(words_fn)

    vectors = []
    for w, word in enumerate(words):
        try:
            word_vector = all_vectors.get_vector(word)
        except KeyError:
            # Word missing from the embedding: emit an all-zero vector of the
            # full output width (including the appended part, if any).
            if svm_dir is None:
                vectors.append(np.zeros(wv_amt))
            else:
                vectors.append(np.zeros(wv_amt + len(svm_dir[0])))
            continue

        if svm_dir is None:
            vectors.append(word_vector)
        else:
            vectors.append(np.concatenate([word_vector, svm_dir[w]]))

    write2dArray(vectors, vector_save_fn)
    write1dArray(words, wvn)
def compileSVMResults(file_name, chunk_amt, data_type):
    """Merge the per-chunk SVM direction and kappa files into single files.

    Nothing is compiled when the merged directions file already exists.
    Otherwise the function blocks until every per-chunk directions file is
    present, then concatenates the chunks in chunk order and writes the
    merged directions and kappa files.

    Args:
        file_name: Base name shared by the chunk files and the merged files.
        chunk_amt: Number of chunks; chunk ids run from 0 to chunk_amt - 1.
        data_type: Data-set folder name under ``../data/``.
    """
    directions_dir = "../data/" + data_type + "/svm/directions/"

    if fileExists(directions_dir + file_name + ".txt") is False:
        print("Compiling SVM results")
        kappa_dir = "../data/" + data_type + "/svm/kappa/"

        chunk_suffixes = [
            " CID" + str(c) + " CAMT" + str(chunk_amt) + ".txt"
            for c in range(chunk_amt)
        ]
        directions = [directions_dir + file_name + suffix
                      for suffix in chunk_suffixes]
        kappa = [kappa_dir + file_name + suffix for suffix in chunk_suffixes]

        # Poll until every directions chunk has appeared on disk. Only the
        # directions files are polled; kappa files are not checked.
        for chunk_path in directions:
            while not fileExists(chunk_path):
                time.sleep(10)
        # NOTE(review): one extra pause after all chunks exist, presumably to
        # let the writers finish - confirm this placement against the
        # original layout.
        time.sleep(10)

        di = [row for chunk_path in directions
              for row in import2dArray(chunk_path)]
        ka = [value for chunk_path in kappa
              for value in import1dArray(chunk_path)]

        write2dArray(di, directions_dir + file_name + ".txt")
        write1dArray(ka, kappa_dir + file_name + ".txt")
    else:
        print("Skipping compile")
def convertToPPMI(freq_arrays_fn, term_names_fn):
    """Convert a term-by-entity frequency matrix to PPMI and write it out.

    Each row of the matrix read from ``freq_arrays_fn`` is treated as one
    term and each column as one entity. For every non-zero cell the value
    ``max(0, log(p(term, entity) / (p(term) * p(entity))))`` is computed,
    with all probabilities taken relative to the grand total of the matrix.
    Zero cells stay 0.0.

    One file per term is written to ``../data/movies/bow/ppmi/class-<term>``
    and the full matrix to ``../data/movies/bow/ppmi/class-all`` (both paths
    are hard-coded).

    Args:
        freq_arrays_fn: Path of the frequency matrix, read with
            ``import2dArray(..., "i")``.
        term_names_fn: Path of the term names, read with ``import1dArray``;
            entry ``t`` names row ``t``.
    """
    freq_arrays = np.asarray(import2dArray(freq_arrays_fn, "i"))
    term_names = import1dArray(term_names_fn)

    overall = 0.0
    for row in freq_arrays:
        overall += sum(row)

    # Cache of per-entity probabilities, filled lazily. It is sized from the
    # matrix itself; the previous fixed size of 15000 raised IndexError for
    # matrices with more columns than that.
    entity_count = len(freq_arrays[0]) if len(freq_arrays) else 0
    entity_array = [0] * entity_count

    ppmi_arrays = []
    # For each term
    for t, row in enumerate(freq_arrays):
        term_p = sum(row) / overall
        ppmi_array = []
        for e, freq in enumerate(row):
            ppmi = 0.0
            if freq != 0:
                freq_p = freq / overall
                if entity_array[e] == 0:
                    # First non-zero hit in this column: compute and cache.
                    entity_array[e] = sum(freq_arrays[:, e]) / overall
                proba = freq_p / (entity_array[e] * term_p)
                ppmi = np.amax([0.0, np.log(proba)])
            ppmi_array.append(ppmi)
        print(ppmi_array)
        ppmi_arrays.append(ppmi_array)
        write1dArray(ppmi_array,
                     "../data/movies/bow/ppmi/class-" + term_names[t])
    write2dArray(ppmi_arrays, "../data/movies/bow/ppmi/class-all")
def writeClassAll(class_fn, full_phrases_fn, phrases_used_fn, file_name):
    """Keep only the rows whose phrase is in the used-phrase list.

    Row ``p`` of the array read from ``class_fn`` is kept when
    ``full_phrases[p]`` also occurs in the used-phrase list. Kept rows stay
    in their original order and are written to ``file_name``.

    Args:
        class_fn: Path of the 2-D array, read with ``import2dArray``.
            Assumes row ``p`` belongs to phrase ``p`` - TODO confirm the
            orientation against the producer of this file.
        full_phrases_fn: Path of the full phrase list (``import1dArray``).
        phrases_used_fn: Path of the phrases to keep (``import1dArray``).
        file_name: Output path for the filtered 2-D array.
    """
    full_phrases = import1dArray(full_phrases_fn)
    ppmi = import2dArray(class_fn)
    # A set gives constant-time membership checks instead of rescanning the
    # whole used-phrase list for every phrase.
    phrases_used = set(import1dArray(phrases_used_fn))

    new_ppmi = [ppmi[p] for p, phrase in enumerate(full_phrases)
                if phrase in phrases_used]
    write2dArray(new_ppmi, file_name)
def scaleSpaceUnitVector(space, file_name):
    """Normalise every column of ``space`` and write the result.

    ``space`` is transposed first, so each processed vector is one column
    of the input. Every vector with at least one non-zero component is
    passed through ``normalize`` (defined outside this block; presumably it
    scales to unit length, as the function name suggests). All-zero vectors
    are kept unchanged. The output has one row per input column.

    Args:
        space: 2-D array-like.
        file_name: Output path for the processed vectors.
    """
    columns = np.asarray(space).transpose()
    print(len(columns), len(columns[0]))

    scaled_vector = []
    for v in columns:
        # np.any rather than a sum test: a vector such as [1, -1] sums to
        # zero but is not the zero vector and must still be normalised.
        if np.any(v):
            scaled_vector.append(normalize(v))
        else:
            scaled_vector.append(v)

    write2dArray(scaled_vector, file_name)
def concatenateArrays(arrays, file_name):
    """Join the given arrays along axis 0 and write the result.

    Args:
        arrays: Non-empty sequence of arrays; they are joined in order.
        file_name: Output path for the joined array.
    """
    stacked = arrays[0]
    for extra in arrays[1:]:
        stacked = np.concatenate((stacked, extra), axis=0)
    write2dArray(stacked, file_name)
def removeIndexes(file_name, indexes, type="f"):
    """Drop the given rows from a stored 2-D array and save a copy.

    The result is written next to the input as ``<name>removedind.txt``,
    where ``<name>`` is ``file_name`` without its extension.

    Args:
        file_name: Path of the 2-D array, read with ``import2dArray``.
        indexes: Row index or indexes to delete (axis 0).
        type: Type code forwarded to ``import2dArray``. The name shadows the
            builtin but is kept because callers may pass it by keyword.
    """
    orig_array = import2dArray(file_name, type)
    kept_rows = np.delete(orig_array, indexes, axis=0)
    # splitext instead of slicing off four characters: names without an
    # extension (or with a longer one) are no longer truncated.
    base_name = os.path.splitext(file_name)[0]
    write2dArray(kept_rows, base_name + "removedind.txt")
def scaleSpace(space, lower_bound, upper_bound, file_name):
    """Min-max scale ``space`` into the given range, save it and return it.

    Args:
        space: 2-D array-like handed to ``MinMaxScaler.fit_transform``
            (presumably scikit-learn's scaler - confirm against the imports).
        lower_bound: Lower end of the target feature range.
        upper_bound: Upper end of the target feature range.
        file_name: Output path for the scaled array.

    Returns:
        The scaled array, as returned by ``fit_transform``.
    """
    target_range = (lower_bound, upper_bound)
    scaler = MinMaxScaler(feature_range=target_range, copy=True)
    scaled = scaler.fit_transform(space)
    write2dArray(scaled, file_name)
    return scaled
""" DATA EDITING TASKS """ def splitData(training_data, movie_vectors, movie_labels): x_train = np.asarray(movie_vectors[:training_data]) y_train = np.asarray(movie_labels[:training_data]) x_test = np.asarray(movie_vectors[training_data:]) y_test = np.asarray(movie_labels[training_data:]) return x_train, y_train, x_test, y_test """ a = import2dArray("D:\Eclipse\MDS/class-all-30-18836-alldm", "f") a = np.nan_to_num(a) write2dArray(a, "class-all-30-18836-alldmnTn") """ """ mds = import2dArray("../data/newsgroups/nnet/spaces/mds.txt") mds = mds.transpose() write2dArray(mds, "../data/newsgroups/nnet/spaces/mds.txt") """