def _identifier_at_beginning(path: str, identifier: str, have_header: bool, dialect: Dialect) -> bool:
    """ Return True when the identifier column sits in the first position of the database.

    An integer identifier is compared directly against index 0; a named
    identifier requires a header, which is compared by its first entry.

    Raises:
        NamedAttributeButNoHeader: a name was given but the file has no header.
        EmptyHeader: the header row exists but contains no columns.
    """
    if is_an_int(identifier):
        return int(identifier) == 0
    # A named identifier can only be located through the header row.
    if not have_header:
        raise NamedAttributeButNoHeader()
    header = get_header(path=path, dialect=dialect)
    if not header:
        raise EmptyHeader(path)
    return header[0] == identifier
def _class_at_end(path: str, class_name: str, have_header: bool, dialect: Dialect) -> bool:
    """ Return True when the class column sits in the last position of the database.

    An integer class is last when it is -1 or equals the final column index;
    a named class requires a header, which is compared by its last entry.

    Raises:
        NamedAttributeButNoHeader: a name was given but the file has no header.
        EmptyHeader: the header row exists but contains no columns.
    """
    if is_an_int(class_name):
        index = int(class_name)
        # Short-circuit on -1 so the column count is only read when needed.
        if index == -1:
            return True
        return index == get_number_of_columns(path, dialect=dialect) - 1
    if not have_header:
        raise NamedAttributeButNoHeader()
    header = get_header(path=path, dialect=dialect)
    if not header:
        raise EmptyHeader(path)
    return header[-1] == class_name
def append_column(input_path: str, output_path: str, column: Union[str, int], dialect: Dialect) -> None:
    """ Move the given column to the end of each row of input_path, then dump
    the resulting file into output_path.

    Args:
        input_path: path of the CSV file to read.
        output_path: path where the rewritten CSV file is dumped.
        column: column index (int or int-like string) or column name.
        dialect: CSV dialect used for both reading and writing.
    """
    if not is_an_int(column):
        # Named column: resolve the name to its index, then retry.
        index_column = find_index_with_class(path=input_path, class_name=column, dialect=dialect)
        append_column(input_path, output_path, index_column, dialect)
    else:
        # Bug fix: `column` may be an int-like string (is_an_int accepts it),
        # and list.pop() requires an actual int — convert explicitly, matching
        # the `class_name = int(class_name)` pattern used by keep_distribution.
        column = int(column)
        content = get_csv_content(path=input_path, skip_header=False, dialect=dialect)
        # Move the column to the end of every row (header line included).
        for line in content:
            line.append(line.pop(column))
        dump_csv_content(path=output_path, content=content, dialect=dialect)
def keep_distribution(input_reader, row_limit: int, out_writers, number_of_trees: int, class_name: Union[str, int], number_of_rows: int) -> List[int]:
    """ Splits a CSV file into multiple pieces with the `keep_distribution` method.
    The keep_distribution method regroup the database content into its class subgroup. then
    redistribute each instance with the same proportion as the initial content.

    Args:
        input_reader: CSV reader yielding the database rows.
        row_limit: number of rows targeted for each output piece.
        out_writers: CSV writers, one per output piece; the last one gets the remainder.
        number_of_trees: number of output pieces.
        class_name: class column, as an index or a column name.
        number_of_rows: total number of rows in the database.

    Returns:
        The number of rows written to each output piece.

    Raises:
        TooManyTreesToSplit: a class subgroup ran out of rows before every writer
            received its share.
    """
    rows_count = [0] * number_of_trees
    if is_an_int(class_name):
        class_name = int(class_name)

    # Group every row under its class value.
    groups = dict()
    for row in input_reader:
        groups.setdefault(row[class_name], []).append(row)

    # Hand each writer a share proportional to the subgroup's size.
    share = row_limit / number_of_rows
    for rows in groups.values():
        quota = int(round(len(rows) * share))
        for index, writer in enumerate(out_writers[:-1]):
            for _ in range(quota):
                try:
                    writer.writerow(rows.pop(0))
                    rows_count[index] += 1
                except IndexError:
                    raise TooManyTreesToSplit(number_of_trees, number_of_rows,
                                              share)
        # Whatever remains goes to the last writer.
        for row in rows:
            out_writers[-1].writerow(row)
            rows_count[-1] += 1
    return rows_count
def keep_distribution2(input_reader, row_limit, out_writer_train, out_writer_test, class_name: Union[str, int], number_of_rows: int) -> Tuple[int, int]:
    """ Splits a CSV file into two pieces with the `keep_distribution` method.
    The keep_distribution method regroup the database content into its class subgroup. then
    redistribute each instance with the same proportion as the initial content.

    Args:
        input_reader: CSV reader yielding the database rows.
        row_limit: number of rows targeted for the training piece.
        out_writer_train: CSV writer receiving the training share.
        out_writer_test: CSV writer receiving the remaining rows.
        class_name: class column, as an index or a column name.
        number_of_rows: total number of rows in the database.

    Returns:
        A (train_rows, test_rows) tuple with the number of rows written to each piece.
    """
    train_count, test_count = 0, 0
    if is_an_int(class_name):
        class_name = int(class_name)

    # Group every row under its class value.
    groups = dict()
    for row in input_reader:
        groups.setdefault(row[class_name], []).append(row)

    # The training set takes a share proportional to each subgroup's size.
    train_share = row_limit / number_of_rows
    for rows in groups.values():
        quota = int(round(len(rows) * train_share))
        train_count += quota
        for _ in range(quota):
            out_writer_train.writerow(rows.pop(0))
        # Whatever remains goes to the test set.
        for row in rows:
            out_writer_test.writerow(row)
            test_count += 1
    return train_count, test_count
def _clean_column_index_or_name(args: dict, param_name: str, column_name: str) -> None:
    """
    If the specified name value is a column name, convert it to its respective index.
    Otherwise, check if it's inbounds and convert it to an integer.

    Args:
        args: parsed parameter dictionary; `args[param_name]` is rewritten in place.
        param_name: key of the parameter to clean inside `args`.
        column_name: human-readable column name used in error reporting.

    Raises:
        IndexOutOfBounds: the given index does not fit the database's column count.
    """
    # Build the input dialect once instead of re-constructing the exact same
    # object up to three times below (DRY; the arguments were identical).
    input_dialect = Dialect(encoding=args[gpn.encoding_input()],
                            delimiter=args[gpn.delimiter_input()],
                            quoting=args[gpn.quoting_input()],
                            quote_char=args[gpn.quote_char_input()],
                            skip_initial_space=True)
    value = args[param_name]
    if (not is_an_int(value)) and isinstance(value, str):
        # User asked for a named class, we retrieve its index then change it.
        # (isinstance replaces the non-idiomatic `type(x) == str` check.)
        args[param_name] = find_index_with_class(path=args[gpn.database()],
                                                 class_name=value,
                                                 dialect=input_dialect)
    else:
        # User asked for an index, we convert it to int then check if it's inbound
        args[param_name] = int(value)
        if not index_in_bounds(input_path=args[gpn.database()],
                               index=args[param_name],
                               dialect=input_dialect):
            raise IndexOutOfBounds(index=args[param_name],
                                   column=column_name,
                                   length=get_number_of_columns(
                                       path=args[gpn.database()],
                                       dialect=input_dialect))