Example #1
0
    def _build_two_column_index(f4_file_path, index_column_1, index_column_2,
                                verbose):
        if not isinstance(index_column_1, str) or not isinstance(
                index_column_1, str):
            raise Exception(
                "When specifying an index column name, it must be a string.")

        f4py.print_message(
            f"Saving index for {index_column_1} and {index_column_2} for {f4_file_path}.",
            verbose)

        num_rows = f4py.read_int_from_file(f4_file_path, ".nrow")
        index_name = "____".join([index_column_1, index_column_2])

        with f4py.Parser(f4_file_path) as parser:
            f4py.print_message(
                f"Getting column meta information for {index_name} index and {f4_file_path}.",
                verbose)
            ignore, column_index_dict, column_type_dict, column_coords_dict = parser._get_column_meta(
                f4py.NoFilter(), [index_column_1, index_column_2])
            #TODO: Add logic to verify that index_column is valid.

            file_handle = parser.get_file_handle("")
            line_length = parser.get_stat(".ll")

            index_column_1_type = column_type_dict[column_index_dict[
                index_column_1.encode()]]
            index_column_2_type = column_type_dict[column_index_dict[
                index_column_2.encode()]]
            coords_1 = column_coords_dict[column_index_dict[
                index_column_1.encode()]]
            coords_2 = column_coords_dict[column_index_dict[
                index_column_2.encode()]]

            values_positions = []
            f4py.print_message(
                f"Parsing values and positions for {index_name} index and {f4_file_path}.",
                verbose)
            for row_index in range(parser.get_num_rows()):
                value_1 = parser._parse_row_value(row_index, coords_1,
                                                  line_length, file_handle)
                value_2 = parser._parse_row_value(row_index, coords_2,
                                                  line_length, file_handle)
                values_positions.append([value_1, value_2, row_index])

            f4py.print_message(
                f"Building index file for {index_name} and {f4_file_path}.",
                verbose)
            IndexHelper._customize_values_positions(
                values_positions, [index_column_1_type, index_column_2_type],
                f4py.sort_first_two_columns, f4py.do_nothing)

            index_file_path = IndexHelper._get_index_file_path(
                parser.data_file_path, index_name)
            IndexHelper._save_index(values_positions, index_file_path)

        f4py.print_message(
            f"Done building two-column index file for {index_name} and {f4_file_path}.",
            verbose)
Example #2
0
    def _get_column_meta(self, fltr, select_columns):
        """Collect the column metadata needed to parse rows from this F4 file.

        Parameters:
            fltr: a filter object; its get_column_name_set() contributes extra
                column names whose types and coordinates must be resolved.
            select_columns: list of column-name strings to select. When empty,
                every column name is read from the ".cn" companion file.

        Returns a 4-tuple:
            (select_columns, column_index_dict, filter_column_type_dict,
             column_coords_dict) where column_index_dict maps encoded column
            name -> column index, filter_column_type_dict maps column index ->
            stored type, and column_coords_dict maps column index -> parsed
            data coordinates.
        """
        if len(select_columns) == 0:
            # No explicit selection: enumerate all columns from the ".cn"
            # (column names) companion file.
            with f4py.Parser(self.data_file_path + ".cn",
                             fixed_file_extensions=["", ".cc"],
                             stats_file_extensions=[".ll",
                                                    ".mccl"]) as cn_parser:
                # NOTE(review): line_length is not used below — possibly vestigial.
                line_length = cn_parser.get_stat(".ll")
                coords = cn_parser._parse_data_coords([0, 1])

                # They are not in sorted order in the file, so we must put them in a dict and sort it.
                column_index_dict = {}
                for row_index in range(self.get_num_cols()):
                    # NOTE(review): the double-underscore attribute is
                    # name-mangled to this class's private method, so this
                    # assumes cn_parser is the same class as self — confirm.
                    values = cn_parser.__parse_row_values(row_index, coords)

                    # values[1] holds the column's numeric index; values[0] its name.
                    column_index_dict[fastnumbers.fast_int(
                        values[1])] = values[0]

                select_columns = []
                for index, name in sorted(column_index_dict.items()):
                    select_columns.append(name)

                # Rebuild the dict as name -> 0-based position in sorted order.
                column_index_dict = {
                    name: index
                    for index, name in enumerate(select_columns)
                }
        else:
            select_columns = [x.encode() for x in select_columns]

            column_names_file_path = f"{self.data_file_path}.cn"

            # Resolve indices only for the columns needed by the filter or
            # the explicit selection.
            with f4py.IndexHelper._get_index_parser(
                    column_names_file_path) as index_parser:
                column_index_dict = {
                    name:
                    self._get_column_index_from_name(index_parser,
                                                     name.decode())
                    for name in fltr.get_column_name_set()
                    | set(select_columns)
                }

        # Look up the stored type for every column the filter or selection touches.
        type_columns = fltr.get_column_name_set() | set(select_columns)
        filter_column_type_dict = {}
        for column_name in type_columns:
            column_index = column_index_dict[column_name]
            filter_column_type_dict[column_index] = self.get_column_type(
                column_index)

        # Map each column index to its byte coordinates within a data row.
        column_indices = list(column_index_dict.values())
        column_coords = self._parse_data_coords(column_indices)
        column_coords_dict = {}
        for i in range(len(column_indices)):
            column_coords_dict[column_indices[i]] = column_coords[i]

        return select_columns, column_index_dict, filter_column_type_dict, column_coords_dict
Example #3
0
    def _build_one_column_index(f4_file_path, index_column, verbose,
                                custom_index_function):
        """Create and persist a single-column index for an F4 file.

        Parameters:
            f4_file_path: the F4 data file to index.
            index_column: name of the column to build the index on.
            verbose: when truthy, progress messages are printed.
            custom_index_function: transformation applied to the index values.
        """
        f4py.print_message(
            f"Saving index for {f4_file_path} and {index_column}.", verbose)

        num_rows = f4py.read_int_from_file(f4_file_path, ".nrow")

        with f4py.Parser(f4_file_path) as parser:
            f4py.print_message(
                f"Getting column meta information for {index_column} index for {f4_file_path}.",
                verbose)
            ignore, column_index_dict, column_type_dict, column_coords_dict = parser._get_column_meta(
                f4py.NoFilter(), [index_column])
            #TODO: Add logic to verify that index_column is valid.

            data_handle = parser.get_file_handle("")
            row_length = parser.get_stat(".ll")

            # Resolve the column's position once, then pull its type and coords.
            column_position = column_index_dict[index_column.encode()]
            index_column_type = column_type_dict[column_position]
            coords = column_coords_dict[column_position]

            f4py.print_message(
                f"Parsing values and positions for {index_column} index for {f4_file_path}.",
                verbose)
            # Pair each parsed value with its row number.
            values_positions = [[
                parser._parse_row_value(i, coords, row_length, data_handle), i
            ] for i in range(parser.get_num_rows())]

            f4py.print_message(
                f"Building index file for {index_column} index for {f4_file_path}.",
                verbose)
            IndexHelper._customize_values_positions(values_positions,
                                                    [index_column_type],
                                                    f4py.sort_first_column,
                                                    custom_index_function)

            index_file_path = IndexHelper._get_index_file_path(
                parser.data_file_path, index_column, custom_index_function)
            IndexHelper._save_index(values_positions, index_file_path)

        f4py.print_message(
            f"Done building index file for {index_column} index for {f4_file_path}.",
            verbose)
Example #4
0
File: Filters.py Project: srp33/F4
    def filter_column_values(self, data_file_path, row_indices,
                             column_index_dict, column_type_dict,
                             column_coords_dict):
        """Return the subset of row_indices whose value in this filter's
        column satisfies self.passes()."""
        with f4py.Parser(data_file_path,
                         fixed_file_extensions=[""],
                         stats_file_extensions=[".ll"]) as parser:
            row_length = parser.get_stat(".ll")
            coords = column_coords_dict[column_index_dict[self.column_name]]
            handle = parser.get_file_handle("")

            # Keep only the rows whose parsed value passes this filter.
            return {
                row
                for row in row_indices
                if self.passes(
                    parser._parse_row_value(row, coords, row_length, handle))
            }
Example #5
0
File: Filters.py Project: srp33/F4
 def _get_num_rows(self, data_file_path):
     """Open the F4 file just long enough to read its row count."""
     parser = f4py.Parser(data_file_path,
                          fixed_file_extensions=[""],
                          stats_file_extensions=[".nrow"])
     with parser:
         return parser.get_num_rows()
Example #6
0
def run_test(description, tall_or_wide, select_columns, discrete_filter1,
             discrete_filter2, float_filter, indexed, compression_level,
             use_training_dict, num_processes, lines_per_chunk, expected_size):
    """Run one query benchmark against a prebuilt F4 file and print timing.

    The F4 file path is derived from tall_or_wide, indexed,
    compression_level, and use_training_dict. The query combines the two
    discrete filters with OR, then ANDs in the float filter. The result is
    saved as TSV and its size checked against expected_size; on a size
    mismatch an error is printed and the process exits.

    Returns None. Skips (returns early) when compression_level is set but
    the file is not indexed — that combination is not benchmarked.
    """
    # Idiom fix: compare against None with `is not`, not `!=`.
    if not indexed and compression_level is not None:
        return

    f4_file_path = f"data/{tall_or_wide}_"

    if indexed:
        f4_file_path += "indexed_"
    else:
        f4_file_path += "notindexed_"

    f4_file_path += f"{compression_level}_"

    if use_training_dict:
        f4_file_path += "cmpd.f4"
    else:
        f4_file_path += "nocmpd.f4"

    out_file_path = f"/tmp/{os.path.basename(f4_file_path)}"

    start = time.time()

    fltr = f4py.AndFilter(f4py.OrFilter(discrete_filter1, discrete_filter2),
                          float_filter)

    with f4py.Parser(f4_file_path) as parser:
        parser.query_and_save(fltr,
                              select_columns,
                              out_file_path,
                              out_file_type="tsv",
                              num_processes=num_processes,
                              lines_per_chunk=lines_per_chunk)

    # Verify the output size before reporting timing; bail out on mismatch.
    file_size = os.path.getsize(out_file_path)
    if file_size != expected_size:
        print(
            f"ERROR: Size of {out_file_path} was {file_size}, but it was expected to be {expected_size}."
        )
        sys.exit()

    end = time.time()
    elapsed = f"{round(end - start, 3)}"

    # Emit one tab-separated result row: description, shape, indexed?,
    # compression level, training dict?, processes, elapsed seconds.
    output = f"{description}\t{tall_or_wide}\t"
    if indexed:
        output += "Yes\t"
    else:
        output += "No\t"

    output += f"{compression_level}\t"

    if use_training_dict:
        output += "Yes\t"
    else:
        output += "No\t"

    output += f"{num_processes}\t{elapsed}"

    print(output)
Example #7
0
 def _get_index_parser(index_file_path):
     """Construct a Parser configured for reading an index file."""
     fixed_extensions = ["", ".cc"]
     stats_extensions = [".ll", ".mccl"]
     return f4py.Parser(index_file_path,
                        fixed_file_extensions=fixed_extensions,
                        stats_file_extensions=stats_extensions)