コード例 #1
0
def extension_indexer(dataset_path, n, write_path):
    """Index every file under ``dataset_path`` by its extension.

    The index is a dict mapping each extension to the list of full file
    paths carrying it. It is saved to
    ``<write_path>/extension_index_<dataset name>.npy`` and also returned.

    Args:
        dataset_path: Root directory of the dataset to walk.
        n: Forwarded to ``extensions.count_and_sort_exts``. It does NOT
            limit the index itself (see the note on ``top_n_exts``).
        write_path: Directory receiving the ``.npy`` file; also forwarded
            to ``extensions.count_and_sort_exts``.

    Returns:
        dict: ``{extension: [path, ...]}`` for every extension reported by
        ``extensions.count_and_sort_exts``.
    """
    allpaths = DFS(dataset_path)

    # Bare filenames (directory part stripped), one per discovered path.
    filenames = [path_utilities.get_fname_from_path(path)
                 for path in allpaths]
    _, exts = extensions.remove_all_extensions(filenames)

    sorted_exts, _ = extensions.count_and_sort_exts(exts, n, write_path,
                                                    dataset_path)
    # Every extension is indexed; slice with [:n] here to keep only the
    # first n entries of the sorted list instead.
    top_n_exts = sorted_exts

    # One empty bucket per extension of interest.
    ext_locations = {extension: [] for extension in top_n_exts}

    # File each path under its extension. The dict lookup replaces a list
    # scan per file, and the bucket is mutated in place so no re-insert
    # into the dict is needed.
    for fp in allpaths:
        fn = path_utilities.get_fname_from_path(fp)
        # Names starting with "._" are skipped (presumably macOS
        # AppleDouble sidecar files -- confirm).
        if fn.startswith("._"):
            continue
        bucket = ext_locations.get(path_utilities.get_single_extension(fn))
        if bucket is not None:
            bucket.append(fp)

    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)
    ext_write_path = os.path.join(write_path,
                                  "extension_index_" + dataset_name + ".npy")
    # NOTE: np.save pickles a dict as a 0-d object array; reading it back
    # requires np.load(..., allow_pickle=True).item().
    np.save(ext_write_path, ext_locations)

    return ext_locations
コード例 #2
0
def plot_extensions(dataset_path, num_extensions):
    """Collect the extensions under ``dataset_path`` and plot them.

    The extensions of every discovered file are handed to
    ``plot_extension_pie`` together with an output directory located at
    ``<parent of cwd>/outputs/<dataset name>--output/``, which is created
    on demand.

    Args:
        dataset_path: Root directory of the dataset to walk.
        num_extensions: Forwarded unchanged to ``plot_extension_pie``.
    """
    allpaths = DFS.DFS(dataset_path)
    p = Path(os.getcwd()).parent
    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)
    write_path = os.path.join(p, "outputs/", dataset_name + "--output/")
    # makedirs (not mkdir): the intermediate "outputs/" directory may not
    # exist yet, and exist_ok avoids the check-then-create race.
    os.makedirs(write_path, exist_ok=True)

    # Bare filenames (directory part stripped), one per discovered path.
    filenames = [path_utilities.get_fname_from_path(path)
                 for path in allpaths]
    _, exts = remove_all_extensions(filenames)
    plot_extension_pie(exts, num_extensions, write_path, dataset_path)

    '''
コード例 #3
0
def handle_compressed(comp_paths, dest):
    """Decompress each file in ``comp_paths`` into ``dest`` using gzip.

    For every path, ``gzip -d -c`` is run in the file's own directory and
    its output is written to ``<dest>/<filename without extension>``.
    The compressed originals are left untouched, and no decompressed copy
    is created next to them, so nothing has to be cleaned up afterwards.

    Args:
        comp_paths: Iterable of paths to gzip-compressed files.
        dest: Directory receiving the decompressed files. A relative
            ``dest`` is resolved against the current working directory.

    Notes:
        Decompression is best effort: gzip runs with ``-q`` and a non-zero
        exit status is not raised, so a corrupt archive may leave an empty
        or partial file in ``dest``.
    """
    # The original code created <dest>/compressed without writing to it;
    # it is still created in case callers rely on it. makedirs also
    # creates ``dest`` itself when missing.
    output_dir = os.path.join(dest, "compressed")
    os.makedirs(output_dir, exist_ok=True)

    for path in comp_paths:
        filename = path_utilities.get_fname_from_path(path)
        source_dir = path_utilities.remove_path_end(path)
        target = os.path.join(dest,
                              path_utilities.remove_extension(filename))
        # Argument list without a shell: each option reaches gzip as its
        # own argument and filenames need no quoting. "-c" streams the
        # data to stdout, which is redirected into the target file here
        # instead of through a shell ">".
        with open(target, "wb") as out_file:
            # subprocess.call blocks until gzip exits; no wait() needed.
            subprocess.call(["gzip", "-d", "-q", "-f", "-c", filename],
                            stdout=out_file,
                            cwd=source_dir)
コード例 #4
0
def get_valid_filenames_struct(dir_list):
    """Filter ``dir_list`` down to paths with an xls, xlsx or tsv extension.

    The extension comparison is case-insensitive. Every rejected file is
    reported on stdout (its extension, then its filename).

    Args:
        dir_list: Iterable of file paths to screen.

    Returns:
        list: The accepted paths, in their original order.
    """
    accepted_exts = ["xls", "xlsx", "tsv"]
    kept_paths = []
    for candidate in dir_list:
        name = path_utilities.get_fname_from_path(candidate)
        ext = path_utilities.get_single_extension(name).lower()
        if ext in accepted_exts:
            kept_paths.append(candidate)
            continue
        # Anything reaching this point was rejected; report it.
        print(ext)
        print("This filename is invalid: ", name)

    return kept_paths
コード例 #5
0
def get_valid_filenames_struct(dir_list):
    """Filter ``dir_list`` down to paths ending in .xls, .xlsx or .tsv.

    The all-lowercase and ALL-CAPS spellings of each extension are
    accepted; mixed-case spellings such as ".Xls" are rejected. Progress
    is shown through ``tqdm`` and every rejected file is reported on
    stdout (its extension, then its filename).

    Args:
        dir_list: Sized collection of file paths to screen.

    Returns:
        list: The accepted paths, in their original order.
    """
    print("size of virtual directory: ", len(dir_list))
    accepted_exts = {".xls", ".xlsx", ".tsv", ".XLS", ".XLSX", ".TSV"}
    valid_list = []
    for path in tqdm(dir_list):
        filename = path_utilities.get_fname_from_path(path)
        # The extension runs from the LAST dot to the end of the name.
        # A name without any dot gets an empty extension, so the report
        # below never reads an unassigned or stale value.
        dot_pos = filename.rfind(".")
        extension = filename[dot_pos:] if dot_pos != -1 else ""
        if extension in accepted_exts:
            valid_list.append(path)
        else:
            print(extension)
            print("This filename is invalid: ", filename)

    print("There are ", len(valid_list), " candidates for conversion. ")
    return valid_list