def create_one_hot_vecs_from_folder(V, dir_path):
    """Build one vector per file found in *dir_path* and stack them.

    Args:
        V: Passed unchanged to ``create_one_hot_vec`` (presumably the
            vocabulary -- confirm against that helper).
        dir_path: Folder handed to ``get_files_of_folder`` together with
            the ``"txt"`` filter.

    Returns:
        ``np.array`` built from the per-file results of
        ``create_one_hot_vec``, in the order the files were returned.
    """
    vectors = []
    for path in get_files_of_folder(dir_path, "txt"):
        words = get_words_from_file(path)
        vectors.append(create_one_hot_vec(V, words))
        # Progress trace: one line per processed file.
        print("finished: " + path)
    return np.array(vectors)
def create_indexes_matrix_from_folder(V, dir_path):
    """Map each file found in *dir_path* to the result of ``get_index``.

    Args:
        V: Passed unchanged to ``get_index`` (presumably the vocabulary --
            confirm against that helper).
        dir_path: Folder handed to ``get_files_of_folder`` together with
            the ``"txt"`` filter.

    Returns:
        dict keyed by the last ``/``-separated component of each file path
        with every ``".txt"`` occurrence removed; values are whatever
        ``get_index(V, words)`` returns for that file.
    """
    indexes_by_name = {}
    for path in get_files_of_folder(dir_path, "txt"):
        words = get_words_from_file(path)
        indexes = get_index(V, words)
        # Key is the bare file name: strip the directories, then ".txt".
        file_name = path.split("/")[-1]
        indexes_by_name[file_name.replace(".txt", "")] = indexes
        # Progress trace: one line per processed file.
        print("finished: " + path)
    return indexes_by_name
def create_distinct_words_from_folder(train_dir_path, test_dir_path):
    """Collect the distinct words of all train and test files.

    Files of *train_dir_path* are processed first, then those of
    *test_dir_path*; each word is kept at the position of its first
    occurrence.

    Args:
        train_dir_path: Folder handed to ``get_files_of_folder`` with the
            ``"txt"`` filter.
        test_dir_path: Second folder, listed the same way.

    Returns:
        ``np.array`` of the distinct words in first-seen order.
    """
    # Copy before extending so the helper's returned list is never mutated.
    files = list(get_files_of_folder(train_dir_path, "txt"))
    files.extend(get_files_of_folder(test_dir_path, "txt"))

    distinct = []
    # Companion set gives O(1) membership tests; scanning the result list
    # for every word made the whole pass quadratic in the vocabulary size.
    # NOTE(review): requires words to be hashable (assumed str) -- confirm.
    seen = set()
    for f in files:
        for w in get_words_from_file(f):
            if w not in seen:
                seen.add(w)
                distinct.append(w)
    return np.array(distinct)
def create_indexes_matrix_from_file_multithread(V, file, output):
    """Compute the indexes of one file and hand them to *output*.

    Args:
        V: Passed unchanged to ``get_index`` (presumably the vocabulary --
            confirm against that helper).
        file: Path of the file to process; also used to derive the key.
        output: Object exposing ``put()``; receives a ``(key, indexes)``
            tuple (presumably a queue shared with the caller -- confirm).

    The key is the last ``/``-separated component of *file* with every
    ``".txt"`` occurrence removed.
    """
    indexes = get_index(V, get_words_from_file(file))
    base_name = file.split("/")[-1]
    output.put((base_name.replace(".txt", ""), indexes))
    # Progress trace, emitted after the result has been handed over.
    print("finished: " + file)