def extension_indexer(dataset_path, n, write_path):
    """Index the dataset's files by extension.

    Walks every path under ``dataset_path``, counts/sorts the extensions
    found, then records which file paths carry each of the surviving
    extensions. The resulting mapping {extension: [paths]} is saved as a
    .npy file under ``write_path`` and also returned.
    """
    allpaths = DFS(dataset_path)
    # bare filenames, directory components stripped
    filenames = [path_utilities.get_fname_from_path(p) for p in allpaths]
    filenames_no_ext, exts = extensions.remove_all_extensions(filenames)
    sorted_exts, sorted_counts = extensions.count_and_sort_exts(
        exts, n, write_path, dataset_path)
    # UNCOMMENT FOR ONLY CONVERTING TOP N EXTS: append [:n]
    top_n_exts = sorted_exts
    # one bucket of paths per surviving extension
    ext_locations = {extension: [] for extension in top_n_exts}
    # sort every file path into its extension bucket, skipping
    # macOS resource-fork files ("._" prefix)
    for fp in allpaths:
        fn = path_utilities.get_fname_from_path(fp)
        if fn[:2] == "._":
            continue
        ext = path_utilities.get_single_extension(fn)
        if ext in ext_locations:
            ext_locations[ext].append(fp)
    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)
    ext_write_path = os.path.join(
        write_path, "extension_index_" + dataset_name + ".npy")
    np.save(ext_write_path, ext_locations)
    return ext_locations
def plot_extensions(dataset_path, num_extensions): allpaths = DFS.DFS(dataset_path) p = Path(os.getcwd()).parent dataset_name = path_utilities.get_last_dir_from_path(dataset_path) write_path = os.path.join(p, "outputs/", dataset_name + "--output/") if not os.path.isdir(write_path): os.mkdir(write_path) # a list of all the file names (without the paths) filenames = [] for path in allpaths: filenames.append(path_utilities.get_fname_from_path(path)) filenames_no_ext, exts = remove_all_extensions(filenames) plot_extension_pie(exts, num_extensions, write_path, dataset_path) '''
def handle_compressed(comp_paths, dest):
    """Decompress each gzip-compressed file in ``comp_paths`` into ``dest``.

    For every source path, runs ``gzip -d -c`` on the file (keeping the
    original untouched) and writes the decompressed stream to a file in
    ``dest`` named after the source with its extension removed. Also
    ensures a ``compressed`` subdirectory exists under ``dest``.

    Bug fixes vs. the original:
      * ``subprocess.call`` returns an int exit status, so the original
        ``p.wait()`` / ``p2.wait()`` raised AttributeError on every file.
      * A list of args combined with ``shell=True`` only executes the
        first element; the ``">"`` redirection token was never honored.
        We now run gzip with ``shell=False`` and redirect stdout via a
        real file handle, which also removes the shell-injection risk of
        interpolating filenames into a shell string.
      * With ``-c`` gzip writes to stdout, so no stray in-place
        decompressed file is created and the follow-up ``rm`` step is
        no longer needed.
    """
    output_dir = os.path.join(dest, "compressed")
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for path in comp_paths:
        filename = path_utilities.get_fname_from_path(path)
        src_dir = path_utilities.remove_path_end(path)
        dest_path = os.path.join(
            dest, path_utilities.remove_extension(filename))
        # gzip -d -c: decompress to stdout; -q quiet; -f force overwrite
        with open(dest_path, "wb") as out_f:
            subprocess.call(
                ["gzip", "-d", "-q", "-f", "-c", filename],
                cwd=src_dir,
                stdout=out_f,
            )
def get_valid_filenames_struct(dir_list):
    """Return the paths in ``dir_list`` whose extension is a supported
    spreadsheet type (xls, xlsx, tsv); print a note for each rejected file.
    """
    #print("Size of virtual directory: ", len(dir_list))
    list_valid_exts = ["xls", "xlsx", "tsv"]
    valid_list = []
    # for each filename in the directory...
    for path in dir_list:
        filename = path_utilities.get_fname_from_path(path)
        extension = path_utilities.get_single_extension(filename).lower()
        if extension in list_valid_exts:
            valid_list.append(path)
        else:
            print(extension)
            print("This filename is invalid: ", filename)
    #print("There are ", len(valid_list), " candidates for conversion. ")
    return valid_list
def get_valid_filenames_struct(dir_list):
    """Return the paths in ``dir_list`` whose extension is a supported
    spreadsheet type (.xls/.xlsx/.tsv, upper- or lowercase); print a note
    for each rejected file.

    Bug fix vs. the original: when a filename contained no dot at all,
    ``extension`` was never bound and the invalid-file ``print`` raised
    NameError. It is now initialized to the empty string. The manual
    scan-backwards-for-a-dot loop is replaced by ``str.rfind``, which
    finds the same (last) dot position.
    """
    print("size of virtual directory: ", len(dir_list))
    list_valid_exts = [".xls", ".xlsx", ".tsv"]
    list_caps_exts = {".XLS": ".xls", ".XLSX": ".xlsx", ".TSV": ".tsv"}
    valid_list = []
    # for each filename in the directory...
    for path in tqdm(dir_list):
        filename = path_utilities.get_fname_from_path(path)
        valid = False
        extension = ""  # stays empty when the filename has no dot
        dot_pos = filename.rfind(".")
        # if there is a dot somewhere in filename, i.e. an extension...
        if dot_pos != -1:
            extension = filename[dot_pos:]
            # accept lowercase extensions and their ALLCAPS variants alike
            if extension in list_valid_exts or extension in list_caps_exts:
                valid_list.append(path)
                valid = True
        if not valid:
            print(extension)
            print("This filename is invalid: ", filename)
    print("There are ", len(valid_list), " candidates for conversion. ")
    return valid_list