def _build_two_column_index(f4_file_path, index_column_1, index_column_2, verbose):
    # Both column names must be strings (the second check previously
    # re-tested index_column_1, which let a bad index_column_2 through).
    if not isinstance(index_column_1, str) or not isinstance(index_column_2, str):
        raise Exception("When specifying an index column name, it must be a string.")

    f4py.print_message(f"Saving index for {index_column_1} and {index_column_2} for {f4_file_path}.", verbose)

    index_name = "____".join([index_column_1, index_column_2])

    with f4py.Parser(f4_file_path) as parser:
        f4py.print_message(f"Getting column meta information for {index_name} index and {f4_file_path}.", verbose)
        ignore, column_index_dict, column_type_dict, column_coords_dict = parser._get_column_meta(f4py.NoFilter(), [index_column_1, index_column_2])

        #TODO: Add logic to verify that index_column is valid.

        file_handle = parser.get_file_handle("")
        line_length = parser.get_stat(".ll")

        # Resolve each column's type and fixed-width coordinates.
        index_column_1_type = column_type_dict[column_index_dict[index_column_1.encode()]]
        index_column_2_type = column_type_dict[column_index_dict[index_column_2.encode()]]
        coords_1 = column_coords_dict[column_index_dict[index_column_1.encode()]]
        coords_2 = column_coords_dict[column_index_dict[index_column_2.encode()]]

        # Pair each (value_1, value_2) combination with its row position so the
        # index can map sorted values back to rows in the data file.
        values_positions = []
        f4py.print_message(f"Parsing values and positions for {index_name} index and {f4_file_path}.", verbose)
        for row_index in range(parser.get_num_rows()):
            value_1 = parser._parse_row_value(row_index, coords_1, line_length, file_handle)
            value_2 = parser._parse_row_value(row_index, coords_2, line_length, file_handle)
            values_positions.append([value_1, value_2, row_index])

        f4py.print_message(f"Building index file for {index_name} and {f4_file_path}.", verbose)
        IndexHelper._customize_values_positions(values_positions, [index_column_1_type, index_column_2_type], f4py.sort_first_two_columns, f4py.do_nothing)

        index_file_path = IndexHelper._get_index_file_path(parser.data_file_path, index_name)
        IndexHelper._save_index(values_positions, index_file_path)

        f4py.print_message(f"Done building two-column index file for {index_name} and {f4_file_path}.", verbose)
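A minimal usage sketch for the builder above, assuming it is exposed as a static method on IndexHelper and that "sample.f4" is a hypothetical F4 file containing string columns "CategoryA" and "CategoryB":

    # Hypothetical file and column names; the composite index name is the
    # two column names joined with "____", as shown above.
    IndexHelper._build_two_column_index("sample.f4", "CategoryA", "CategoryB", verbose=True)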
def _get_column_meta(self, fltr, select_columns):
    if len(select_columns) == 0:
        # No columns were specified, so select all of them. Column names are
        # not stored in sorted order in the .cn file, so collect them into a
        # dict keyed by column index and then sort.
        with f4py.Parser(self.data_file_path + ".cn", fixed_file_extensions=["", ".cc"], stats_file_extensions=[".ll", ".mccl"]) as cn_parser:
            coords = cn_parser._parse_data_coords([0, 1])

            column_index_dict = {}
            for row_index in range(self.get_num_cols()):
                values = cn_parser.__parse_row_values(row_index, coords)
                column_index_dict[fastnumbers.fast_int(values[1])] = values[0]

            select_columns = []
            for index, name in sorted(column_index_dict.items()):
                select_columns.append(name)

            # Invert the mapping so it is keyed by name, like the else branch.
            column_index_dict = {name: index for index, name in enumerate(select_columns)}
    else:
        select_columns = [x.encode() for x in select_columns]
        column_names_file_path = f"{self.data_file_path}.cn"

        with f4py.IndexHelper._get_index_parser(column_names_file_path) as index_parser:
            column_index_dict = {name: self._get_column_index_from_name(index_parser, name.decode()) for name in fltr.get_column_name_set() | set(select_columns)}

    # Look up the type of every column used for filtering or selecting.
    type_columns = fltr.get_column_name_set() | set(select_columns)
    filter_column_type_dict = {}
    for column_name in type_columns:
        column_index = column_index_dict[column_name]
        filter_column_type_dict[column_index] = self.get_column_type(column_index)

    # Map each column index to its fixed-width start/end coordinates.
    column_indices = list(column_index_dict.values())
    column_coords = self._parse_data_coords(column_indices)
    column_coords_dict = {}
    for i in range(len(column_indices)):
        column_coords_dict[column_indices[i]] = column_coords[i]

    return select_columns, column_index_dict, filter_column_type_dict, column_coords_dict
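To illustrate what callers receive, here is a sketch assuming a hypothetical "sample.f4" file with a column named "Age"; the last three return values are keyed by encoded column name and integer column index, respectively:

    # Hypothetical: inspect the metadata for a single selected column.
    with f4py.Parser("sample.f4") as parser:
        cols, index_dict, type_dict, coords_dict = parser._get_column_meta(f4py.NoFilter(), ["Age"])
        # cols        -> [b"Age"]                    (encoded column names)
        # index_dict  -> {b"Age": <column index>}    (name -> index)
        # type_dict   -> keyed by index; value is the column's type code
        # coords_dict -> keyed by index; value is the column's fixed-width coordinates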
def _build_one_column_index(f4_file_path, index_column, verbose, custom_index_function):
    f4py.print_message(f"Saving index for {f4_file_path} and {index_column}.", verbose)

    with f4py.Parser(f4_file_path) as parser:
        f4py.print_message(f"Getting column meta information for {index_column} index for {f4_file_path}.", verbose)
        ignore, column_index_dict, column_type_dict, column_coords_dict = parser._get_column_meta(f4py.NoFilter(), [index_column])

        #TODO: Add logic to verify that index_column is valid.

        file_handle = parser.get_file_handle("")
        line_length = parser.get_stat(".ll")
        index_column_type = column_type_dict[column_index_dict[index_column.encode()]]
        coords = column_coords_dict[column_index_dict[index_column.encode()]]

        # Pair each value with its row position so the index can map
        # sorted values back to rows in the data file.
        values_positions = []
        f4py.print_message(f"Parsing values and positions for {index_column} index for {f4_file_path}.", verbose)
        for row_index in range(parser.get_num_rows()):
            value = parser._parse_row_value(row_index, coords, line_length, file_handle)
            values_positions.append([value, row_index])

        f4py.print_message(f"Building index file for {index_column} index for {f4_file_path}.", verbose)
        IndexHelper._customize_values_positions(values_positions, [index_column_type], f4py.sort_first_column, custom_index_function)

        index_file_path = IndexHelper._get_index_file_path(parser.data_file_path, index_column, custom_index_function)
        IndexHelper._save_index(values_positions, index_file_path)

        f4py.print_message(f"Done building index file for {index_column} index for {f4_file_path}.", verbose)
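A corresponding sketch for the single-column case, again assuming a hypothetical "sample.f4" with an "Age" column; f4py.do_nothing (used as a pass-through in the two-column builder above) serves as the custom index function when no transformation of the values is needed:

    # Hypothetical: build a plain (untransformed) index on one column.
    IndexHelper._build_one_column_index("sample.f4", "Age", verbose=True, custom_index_function=f4py.do_nothing)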
def filter_column_values(self, data_file_path, row_indices, column_index_dict, column_type_dict, column_coords_dict):
    with f4py.Parser(data_file_path, fixed_file_extensions=[""], stats_file_extensions=[".ll"]) as parser:
        line_length = parser.get_stat(".ll")
        coords = column_coords_dict[column_index_dict[self.column_name]]
        data_file_handle = parser.get_file_handle("")

        # Keep only the candidate rows whose value in this filter's column
        # satisfies the filter's predicate.
        passing_row_indices = set()
        for i in row_indices:
            if self.passes(parser._parse_row_value(i, coords, line_length, data_file_handle)):
                passing_row_indices.add(i)

        return passing_row_indices
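This method assumes it lives on a concrete filter class that defines self.column_name and self.passes(). A rough sketch of how such a filter narrows a candidate row set, where fltr is assumed to be an instance of some concrete f4py filter and "sample.f4" is a hypothetical file with at least 100 rows:

    # Hypothetical: fltr defines column_name, passes(), and get_column_name_set().
    with f4py.Parser("sample.f4") as parser:
        _, index_dict, type_dict, coords_dict = parser._get_column_meta(fltr, [])
    passing = fltr.filter_column_values("sample.f4", set(range(100)), index_dict, type_dict, coords_dict)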
def _get_num_rows(self, data_file_path):
    with f4py.Parser(data_file_path, fixed_file_extensions=[""], stats_file_extensions=[".nrow"]) as parser:
        return parser.get_num_rows()
def run_test(description, tall_or_wide, select_columns, discrete_filter1, discrete_filter2, float_filter, indexed, compression_level, use_training_dict, num_processes, lines_per_chunk, expected_size):
    # Skip non-indexed runs that specify a compression level.
    if not indexed and compression_level is not None:
        return

    f4_file_path = f"data/{tall_or_wide}_"
    f4_file_path += "indexed_" if indexed else "notindexed_"
    f4_file_path += f"{compression_level}_"
    f4_file_path += "cmpd.f4" if use_training_dict else "nocmpd.f4"

    out_file_path = f"/tmp/{os.path.basename(f4_file_path)}"

    start = time.time()

    fltr = f4py.AndFilter(f4py.OrFilter(discrete_filter1, discrete_filter2), float_filter)

    with f4py.Parser(f4_file_path) as parser:
        parser.query_and_save(fltr, select_columns, out_file_path, out_file_type="tsv", num_processes=num_processes, lines_per_chunk=lines_per_chunk)

    # Sanity-check the query output against the expected file size.
    file_size = os.path.getsize(out_file_path)
    if file_size != expected_size:
        print(f"ERROR: Size of {out_file_path} was {file_size}, but it was expected to be {expected_size}.")
        sys.exit()

    end = time.time()
    elapsed = f"{round(end - start, 3)}"

    # Emit one tab-delimited result row for this benchmark configuration.
    output = f"{description}\t{tall_or_wide}\t"
    output += "Yes\t" if indexed else "No\t"
    output += f"{compression_level}\t"
    output += "Yes\t" if use_training_dict else "No\t"
    output += f"{num_processes}\t{elapsed}"
    print(output)
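An illustrative invocation: the filter constructors shown are hypothetical placeholders for whatever discrete and float filters the benchmark actually uses, and the expected size is a made-up value:

    # Hypothetical benchmark run over a tall, indexed, uncompressed file.
    run_test("Baseline", "tall", ["ID"],
             f4py.StringEqualsFilter("Discrete1", "A"),  # hypothetical constructor
             f4py.StringEqualsFilter("Discrete2", "B"),  # hypothetical constructor
             f4py.FloatFilter("Numeric1", ">=", 0.5),    # hypothetical constructor
             indexed=True, compression_level=None, use_training_dict=False,
             num_processes=4, lines_per_chunk=10000, expected_size=1234)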
def _get_index_parser(index_file_path):
    return f4py.Parser(index_file_path, fixed_file_extensions=["", ".cc"], stats_file_extensions=[".ll", ".mccl"])
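Index files are read with the same fixed-width extensions ("", ".cc", ".ll", ".mccl") that _get_column_meta uses for the column-names file, so a plain f4py.Parser suffices. For example, assuming index_file_path points at an index produced by _save_index above:

    # Hypothetical: read the line-length stat from an existing index file.
    with f4py.IndexHelper._get_index_parser(index_file_path) as index_parser:
        line_length = index_parser.get_stat(".ll")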