def main(hdf5_file_name, paths=None):
    """Flatten the core arrays of an HDF5 file into ``<hdf5_file_name>.csv``.

    Every path whose name contains ``"column_annotations"`` is read as a 2-D
    array of labels; the non-blank labels of each column are joined with
    ``"."`` to form a CSV column name.  The i-th annotation path is paired
    with the i-th path containing ``"core_array"`` (positional pairing, as
    returned by ``get_all_paths``).  One CSV row is written per row of the
    first core array.

    Args:
        hdf5_file_name: Path of the HDF5 file to read.  The CSV is written
            next to it with ``.csv`` appended.
        paths: Optional iterable of HDF5 paths to restrict the export to;
            each one is expanded with ``get_all_paths``.  ``None`` means the
            whole file (``"/"``).

    Raises:
        IndexError: If no ``core_array`` path is found, or if there are more
            annotation paths than core array paths.
    """
    import pprint

    # The context manager guarantees the HDF5 handle is released even when
    # an exception is raised part-way through the export.
    with h5py.File(hdf5_file_name, "r") as hf5p:
        if paths is None:
            paths = get_all_paths(hf5p["/"])
        else:
            expanded_paths = []
            for path in paths:
                expanded_paths += get_all_paths(hf5p[path])
            paths = expanded_paths

        annotations_paths = [hp for hp in paths if "column_annotations" in hp]
        core_array_paths = [hp for hp in paths if "core_array" in hp]

        # The first core array decides how many rows are exported.
        number_of_rows = hf5p[core_array_paths[0]].shape[0]

        annotations_list = []
        for i, path in enumerate(annotations_paths):
            annotations = hf5p[path][...]
            nrows, ncolumns = annotations.shape
            for j in range(ncolumns):
                # Blank (whitespace-only) labels are skipped; the rest are
                # joined unstripped, exactly as stored.
                labels = [
                    annotations[k, j] for k in range(nrows)
                    if len(annotations[k, j].strip()) > 0
                ]
                annotations_list.append({
                    "path": core_array_paths[i],
                    "index": j,
                    "field_name": ".".join(labels),
                })

        pprint.pprint(annotations_list)
        header = [ca["field_name"] for ca in annotations_list]

        # Resolve each dataset once instead of once per exported row.
        core_arrays = {path: hf5p[path] for path in core_array_paths}

        # NOTE(review): binary mode is the Python 2 csv convention; under
        # Python 3 the csv module needs text mode with newline="" -- confirm
        # the target interpreter before changing.
        with open(hdf5_file_name + ".csv", "wb") as fw:
            csv_writer = csv.writer(fw)
            csv_writer.writerow(header)

            for l in range(number_of_rows):
                paths_core_vector = {
                    path: dataset[l, :]
                    for path, dataset in core_arrays.items()
                }
                row_to_write = [
                    paths_core_vector[annotation["path"]][annotation["index"]]
                    for annotation in annotations_list
                ]
                csv_writer.writerow(row_to_write)

                if l % 1000 == 0 and l > 0:
                    print("Wrote %s rows" % l)
def main(hdf5_file_name, paths=None):
    """Flatten the core arrays of an HDF5 file into ``<hdf5_file_name>.csv``.

    Every path whose name contains ``"column_annotations"`` is read as a 2-D
    array of labels; the non-blank labels of each column are joined with
    ``"."`` to form a CSV column name.  The i-th annotation path is paired
    with the i-th path containing ``"core_array"`` (positional pairing, as
    returned by ``get_all_paths``).  One CSV row is written per row of the
    first core array.

    Args:
        hdf5_file_name: Path of the HDF5 file to read.  The CSV is written
            next to it with ``.csv`` appended.
        paths: Optional iterable of HDF5 paths to restrict the export to;
            each one is expanded with ``get_all_paths``.  ``None`` means the
            whole file (``"/"``).

    Raises:
        IndexError: If no ``core_array`` path is found, or if there are more
            annotation paths than core array paths.
    """
    import pprint

    # The context manager guarantees the HDF5 handle is released even when
    # an exception is raised part-way through the export.
    with h5py.File(hdf5_file_name, "r") as hf5p:
        if paths is None:
            paths = get_all_paths(hf5p["/"])
        else:
            expanded_paths = []
            for path in paths:
                expanded_paths += get_all_paths(hf5p[path])
            paths = expanded_paths

        annotations_paths = [hp for hp in paths if "column_annotations" in hp]
        core_array_paths = [hp for hp in paths if "core_array" in hp]

        # The first core array decides how many rows are exported.
        number_of_rows = hf5p[core_array_paths[0]].shape[0]

        annotations_list = []
        for i, path in enumerate(annotations_paths):
            annotations = hf5p[path][...]
            nrows, ncolumns = annotations.shape
            for j in range(ncolumns):
                # Blank (whitespace-only) labels are skipped; the rest are
                # joined unstripped, exactly as stored.
                labels = [
                    annotations[k, j] for k in range(nrows)
                    if len(annotations[k, j].strip()) > 0
                ]
                annotations_list.append({
                    "path": core_array_paths[i],
                    "index": j,
                    "field_name": ".".join(labels),
                })

        pprint.pprint(annotations_list)
        header = [ca["field_name"] for ca in annotations_list]

        # Resolve each dataset once instead of once per exported row.
        core_arrays = {path: hf5p[path] for path in core_array_paths}

        # NOTE(review): binary mode is the Python 2 csv convention; under
        # Python 3 the csv module needs text mode with newline="" -- confirm
        # the target interpreter before changing.
        with open(hdf5_file_name + ".csv", "wb") as fw:
            csv_writer = csv.writer(fw)
            csv_writer.writerow(header)

            for l in range(number_of_rows):
                paths_core_vector = {
                    path: dataset[l, :]
                    for path, dataset in core_arrays.items()
                }
                row_to_write = [
                    paths_core_vector[annotation["path"]][annotation["index"]]
                    for annotation in annotations_list
                ]
                csv_writer.writerow(row_to_write)

                if l % 1000 == 0 and l > 0:
                    print("Wrote %s rows" % l)
def main(starting_directory="X:\\healthfacts\\20160808\\"):
    """Summarise every ``core_array`` found in the ``.hdf5`` files of a tree.

    Walks ``starting_directory`` recursively and, for each path ending in
    ``core_array`` inside each ``.hdf5`` file, writes one row to
    ``hdf5_files_summary.csv`` (created in ``starting_directory``) with the
    array shape, its cell count, and how many cells are strictly greater
    than zero.  Each row is also printed.

    Args:
        starting_directory: Root directory to scan; also where the summary
            CSV is written.
    """
    file_summary_csv = os.path.join(starting_directory, "hdf5_files_summary.csv")

    header = ["full_directory", "directory", "file_name", "hdf5_path", "number_of_rows", "number_of_columns", "number_of_cells", "non_zero_entries", "fraction_non_zero"]
    # NOTE(review): binary mode is the Python 2 csv convention; under
    # Python 3 the csv module needs text mode with newline="" -- confirm the
    # target interpreter before changing.
    with open(file_summary_csv, "wb") as fw:
        csv_writer = csv.writer(fw)
        csv_writer.writerow(header)
        for dir_name, _subdirs, file_list in os.walk(starting_directory):
            for file_name in file_list:
                if os.path.splitext(file_name)[1] != ".hdf5":
                    continue
                hdf5_file_name = os.path.join(dir_name, file_name)

                # Open read-only and close promptly: a large tree would
                # otherwise accumulate one open HDF5 handle per file, and an
                # implicit default mode may open the file writable.
                with h5py.File(hdf5_file_name, "r") as h5:
                    group_paths = upx.get_all_paths(h5["/"])
                    if group_paths is None:
                        continue

                    for group_path in group_paths:
                        if group_path.split("/")[-1] != "core_array":
                            continue

                        numeric_array = h5[group_path]
                        n_rows, n_columns = numeric_array.shape
                        n_cells = n_rows * n_columns
                        # "non zero" here means strictly positive cells.
                        n_non_zero = int(np.count_nonzero(numeric_array[...] > 0))
                        if n_cells == 0:
                            # Avoid dividing by zero for empty arrays.
                            fraction_non_zero = None
                        else:
                            fraction_non_zero = 1.0 * n_non_zero / n_cells

                        row_to_write = [dir_name, os.path.split(dir_name)[-1], file_name, group_path, n_rows, n_columns, n_cells, n_non_zero, fraction_non_zero]
                        print(row_to_write)
                        csv_writer.writerow(row_to_write)
# Example no. 4
# 0
def main(hdf5_file_name, csv_file_name=None, threshold_value_to_include=0.01):
    """Write a per-column sparsity summary of every ``core_array`` to CSV.

    For each group that holds a ``core_array`` (and a sibling
    ``column_annotations`` dataset), one row per column is written with the
    group path, that column's annotation labels, the number of strictly
    positive values, an include flag and the positive fraction.

    Args:
        hdf5_file_name: Path of the HDF5 file to read.
        csv_file_name: Output CSV path.  Defaults to
            ``hdf5_file_name + ".summary.csv"``.
        threshold_value_to_include: Columns whose positive fraction is at
            least this value get ``"1"`` in the ``to_include`` column,
            otherwise an empty string.
    """
    if csv_file_name is None:
        csv_file_name = hdf5_file_name + ".summary.csv"

    # Read-only access is sufficient; the context manager closes the handle.
    with h5py.File(hdf5_file_name, "r") as fp5:
        paths = get_all_paths(fp5["/"])

        core_array_paths = [
            p for p in paths if p.split("/")[-1] == "core_array"
        ]
        # Drop the trailing "core_array" component to get the parent group.
        stripped_paths = ["/".join(p.split("/")[:-1]) for p in core_array_paths]

        # The summary is written whether or not the caller supplied a file
        # name (previously an explicit csv_file_name produced no output).
        # NOTE(review): binary mode is the Python 2 csv convention; under
        # Python 3 the csv module needs text mode with newline="" -- confirm
        # the target interpreter before changing.
        with open(csv_file_name, "wb") as fw:
            csv_writer = csv.writer(fw)
            header = [
                "path", "c1", "c2", "c3", "non-zero", "to_include",
                "fraction non-zero"
            ]
            csv_writer.writerow(header)

            for stripped_path in stripped_paths:
                print(stripped_path)
                column_annotations = fp5[stripped_path + "/column_annotations"][...]
                core_array = fp5[stripped_path + "/core_array"]

                for j in range(column_annotations.shape[1]):
                    slice_of_interest = core_array[:, j]
                    number_of_rows = slice_of_interest.shape[0]
                    # "non-zero" here means strictly positive values.
                    n_non_zero_values = int(
                        np.count_nonzero(slice_of_interest > 0))
                    column_names = column_annotations[:, j].tolist()

                    if number_of_rows == 0:
                        # Empty array: no meaningful fraction, never included.
                        fraction_non_zero = None
                        to_include = ""
                    else:
                        fraction_non_zero = (
                            1.0 * n_non_zero_values) / number_of_rows
                        if fraction_non_zero >= threshold_value_to_include:
                            to_include = "1"
                        else:
                            to_include = ""

                    row_to_write = [stripped_path] + column_names + [
                        n_non_zero_values, to_include, fraction_non_zero
                    ]
                    csv_writer.writerow(row_to_write)
def main(hdf5_file_name, csv_file_name=None, threshold_value_to_include=0.01):
    """Write a per-column sparsity summary of every ``core_array`` to CSV.

    For each group that holds a ``core_array`` (and a sibling
    ``column_annotations`` dataset), one row per column is written with the
    group path, that column's annotation labels, the number of strictly
    positive values, an include flag and the positive fraction.

    Args:
        hdf5_file_name: Path of the HDF5 file to read.
        csv_file_name: Output CSV path.  Defaults to
            ``hdf5_file_name + ".summary.csv"``.
        threshold_value_to_include: Columns whose positive fraction is at
            least this value get ``"1"`` in the ``to_include`` column,
            otherwise an empty string.
    """
    if csv_file_name is None:
        csv_file_name = hdf5_file_name + ".summary.csv"

    # Read-only access is sufficient; the context manager closes the handle.
    with h5py.File(hdf5_file_name, "r") as fp5:
        paths = get_all_paths(fp5["/"])

        core_array_paths = [p for p in paths if p.split("/")[-1] == "core_array"]
        # Drop the trailing "core_array" component to get the parent group.
        stripped_paths = ["/".join(p.split("/")[:-1]) for p in core_array_paths]

        # The summary is written whether or not the caller supplied a file
        # name (previously an explicit csv_file_name produced no output).
        # NOTE(review): binary mode is the Python 2 csv convention; under
        # Python 3 the csv module needs text mode with newline="" -- confirm
        # the target interpreter before changing.
        with open(csv_file_name, "wb") as fw:
            csv_writer = csv.writer(fw)
            header = ["path", "c1", "c2", "c3", "non-zero", "to_include", "fraction non-zero"]
            csv_writer.writerow(header)

            for stripped_path in stripped_paths:
                print(stripped_path)
                column_annotations = fp5[stripped_path + "/column_annotations"][...]
                core_array = fp5[stripped_path + "/core_array"]

                for j in range(column_annotations.shape[1]):
                    slice_of_interest = core_array[:, j]
                    number_of_rows = slice_of_interest.shape[0]
                    # "non-zero" here means strictly positive values.
                    n_non_zero_values = int(np.count_nonzero(slice_of_interest > 0))
                    column_names = column_annotations[:, j].tolist()

                    if number_of_rows == 0:
                        # Empty array: no meaningful fraction, never included.
                        fraction_non_zero = None
                        to_include = ""
                    else:
                        fraction_non_zero = (1.0 * n_non_zero_values) / number_of_rows
                        if fraction_non_zero >= threshold_value_to_include:
                            to_include = "1"
                        else:
                            to_include = ""

                    row_to_write = [stripped_path] + column_names + [n_non_zero_values, to_include, fraction_non_zero]
                    csv_writer.writerow(row_to_write)
# Example no. 6
# 0
def main(file_name):
    """Recode the core arrays of an HDF5 file into one wide CSV.

    For every ``.../column_annotations`` dataset, row 0 is used as the field
    name of each column and row 1 as that column's label.  Each row of the
    positionally matching ``.../core_array`` is collapsed to one value per
    field: a cell equal to 1 yields the column's label; any other positive
    cell yields the numeric value itself (written as an int when it is
    whole), unless the field already has a value.  One intermediate CSV per
    annotation group is written to ``<file_name>.<i>.csv``, and the groups
    are then concatenated column-wise into ``<file_name>.recode.csv``.

    Args:
        file_name: Path of the HDF5 file to read; also the prefix of all
            output files.

    Raises:
        IndexError: If there are more annotation paths than core array paths.
        StopIteration: If an intermediate file has fewer data rows than the
            last processed core array (groups with differing row counts).
    """
    annotation_suffix = "/column_annotations"
    core_array_suffix = "/core_array"

    annotation_csv_dict = {}
    annotation_path_list = []
    # Stays 0 when the file has no annotation groups, so the merge step
    # below writes an empty header instead of hitting an unbound name.
    core_array_rows = 0

    # NOTE(review): "wb"/"rb" are the Python 2 csv conventions; under
    # Python 3 the csv module needs text mode with newline="" -- confirm the
    # target interpreter before changing.
    with h5py.File(file_name, "r") as f5:
        all_paths = upx.get_all_paths(f5["./"])

        column_annotation_paths = [p for p in all_paths if p.endswith(annotation_suffix)]
        core_array_paths = [c for c in all_paths if c.endswith(core_array_suffix)]

        for i, column_annotation_path in enumerate(column_annotation_paths):
            annotation_path = column_annotation_path[:-len(annotation_suffix)]
            print(annotation_path)
            annotation_path_list.append(annotation_path)
            annotation_csv_dict[annotation_path] = file_name + "." + str(i) + ".csv"

            column_annotation = f5[column_annotation_path][...]
            column_annotation_list = column_annotation[0, :].tolist()
            column_annotation_fields = column_annotation[1, :].tolist()

            fields = np.unique(column_annotation_list).tolist()

            with open(annotation_csv_dict[annotation_path], "wb") as fw:
                csv_writer = csv.writer(fw)
                csv_writer.writerow(fields)

                core_array = f5[core_array_paths[i]]
                core_array_rows = core_array.shape[0]
                for j in range(core_array_rows):
                    result = {}
                    core_array_row = core_array[j, :]
                    for k, field in enumerate(column_annotation_list):
                        value = core_array_row[k]
                        if value == 1:
                            result[field] = column_annotation_fields[k]
                        elif value > 0 and field not in result:
                            # Keep fractional values as-is and write whole
                            # values as ints.  The previous test
                            # (int(v) - v > 0) was never true for positive
                            # v, so fractions were silently truncated.
                            if value != int(value):
                                result[field] = value
                            else:
                                result[field] = int(value)
                    if j % 1000 == 0 and j > 0:
                        print("Read '%s' rows" % j)

                    csv_writer.writerow([result.get(field, '') for field in fields])

    # Build a single CSV file by reading the intermediate files in lockstep.
    part_files = []
    try:
        readers = []
        for annotation_path in annotation_path_list:
            part_file = open(annotation_csv_dict[annotation_path], "rb")
            part_files.append(part_file)
            readers.append(csv.reader(part_file))

        header = []
        for reader in readers:
            # next() works on both Python 2 and 3, unlike reader.next().
            header += next(reader)

        master_csv_file_name = file_name + ".recode.csv"
        with open(master_csv_file_name, "wb") as fw:
            master_csv_writer = csv.writer(fw)
            master_csv_writer.writerow(header)

            for _ in range(core_array_rows):
                master_row = []
                for reader in readers:
                    master_row += next(reader)
                master_csv_writer.writerow(master_row)
    finally:
        # Release every intermediate file handle, even on failure.
        for part_file in part_files:
            part_file.close()
# Example no. 7
# 0
def main(file_name):
    """Recode the core arrays of an HDF5 file into one wide CSV.

    For every ``.../column_annotations`` dataset, row 0 is used as the field
    name of each column and row 1 as that column's label.  Each row of the
    positionally matching ``.../core_array`` is collapsed to one value per
    field: a cell equal to 1 yields the column's label; any other positive
    cell yields the numeric value itself (written as an int when it is
    whole), unless the field already has a value.  One intermediate CSV per
    annotation group is written to ``<file_name>.<i>.csv``, and the groups
    are then concatenated column-wise into ``<file_name>.recode.csv``.

    Args:
        file_name: Path of the HDF5 file to read; also the prefix of all
            output files.

    Raises:
        IndexError: If there are more annotation paths than core array paths.
        StopIteration: If an intermediate file has fewer data rows than the
            last processed core array (groups with differing row counts).
    """
    annotation_suffix = "/column_annotations"
    core_array_suffix = "/core_array"

    annotation_csv_dict = {}
    annotation_path_list = []
    # Stays 0 when the file has no annotation groups, so the merge step
    # below writes an empty header instead of hitting an unbound name.
    core_array_rows = 0

    # NOTE(review): "wb"/"rb" are the Python 2 csv conventions; under
    # Python 3 the csv module needs text mode with newline="" -- confirm the
    # target interpreter before changing.
    with h5py.File(file_name, "r") as f5:
        all_paths = upx.get_all_paths(f5["./"])

        column_annotation_paths = [
            p for p in all_paths if p.endswith(annotation_suffix)
        ]
        core_array_paths = [
            c for c in all_paths if c.endswith(core_array_suffix)
        ]

        for i, column_annotation_path in enumerate(column_annotation_paths):
            annotation_path = column_annotation_path[:-len(annotation_suffix)]
            print(annotation_path)
            annotation_path_list.append(annotation_path)
            annotation_csv_dict[annotation_path] = file_name + "." + str(
                i) + ".csv"

            column_annotation = f5[column_annotation_path][...]
            column_annotation_list = column_annotation[0, :].tolist()
            column_annotation_fields = column_annotation[1, :].tolist()

            fields = np.unique(column_annotation_list).tolist()

            with open(annotation_csv_dict[annotation_path], "wb") as fw:
                csv_writer = csv.writer(fw)
                csv_writer.writerow(fields)

                core_array = f5[core_array_paths[i]]
                core_array_rows = core_array.shape[0]
                for j in range(core_array_rows):
                    result = {}
                    core_array_row = core_array[j, :]
                    for k, field in enumerate(column_annotation_list):
                        value = core_array_row[k]
                        if value == 1:
                            result[field] = column_annotation_fields[k]
                        elif value > 0 and field not in result:
                            # Keep fractional values as-is and write whole
                            # values as ints.  The previous test
                            # (int(v) - v > 0) was never true for positive
                            # v, so fractions were silently truncated.
                            if value != int(value):
                                result[field] = value
                            else:
                                result[field] = int(value)
                    if j % 1000 == 0 and j > 0:
                        print("Read '%s' rows" % j)

                    csv_writer.writerow(
                        [result.get(field, '') for field in fields])

    # Build a single CSV file by reading the intermediate files in lockstep.
    part_files = []
    try:
        readers = []
        for annotation_path in annotation_path_list:
            part_file = open(annotation_csv_dict[annotation_path], "rb")
            part_files.append(part_file)
            readers.append(csv.reader(part_file))

        header = []
        for reader in readers:
            # next() works on both Python 2 and 3, unlike reader.next().
            header += next(reader)

        master_csv_file_name = file_name + ".recode.csv"
        with open(master_csv_file_name, "wb") as fw:
            master_csv_writer = csv.writer(fw)
            master_csv_writer.writerow(header)

            for _ in range(core_array_rows):
                master_row = []
                for reader in readers:
                    master_row += next(reader)
                master_csv_writer.writerow(master_row)
    finally:
        # Release every intermediate file handle, even on failure.
        for part_file in part_files:
            part_file.close()