Example #1
def _identifier_at_beginning(path: str, identifier: str, have_header: bool, dialect: Dialect) -> bool:
    """ Check if the identifier column is at the beginning of the database. """
    if is_an_int(identifier):
        return int(identifier) == 0

    if not have_header:
        raise NamedAttributeButNoHeader()
    else:
        header = get_header(path=path, dialect=dialect)
        if len(header) == 0:
            raise EmptyHeader(path)
        return header[0] == identifier
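
Both checks in this example rely on an `is_an_int` helper that is not shown in the listing. A minimal sketch of the behaviour it presumably has (the body below is an assumption, not the project's actual implementation):

def is_an_int(value) -> bool:
    """ Assumed helper: return True if `value` can be parsed as an integer. """
    try:
        int(value)
        return True
    except (TypeError, ValueError):
        return False
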
Example #2
def _class_at_end(path: str, class_name: str, have_header: bool, dialect: Dialect) -> bool:
    """ Check if the class column is at the end of the database. """
    if is_an_int(class_name):
        return int(class_name) == -1 or \
               int(class_name) == get_number_of_columns(path, dialect=dialect) - 1

    if not have_header:
        raise NamedAttributeButNoHeader()
    else:
        header = get_header(path=path, dialect=dialect)
        if len(header) == 0:
            raise EmptyHeader(path)
        return header[-1] == class_name
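
`get_header` and `get_number_of_columns` are provided elsewhere in the project; the sketch below shows what they presumably do, assuming the project's `Dialect` object exposes the `encoding` and `delimiter` attributes it is constructed with in Example #6:

import csv

def get_header(path: str, dialect) -> list:
    """ Assumed helper: return the first row of the CSV file. """
    with open(path, newline='', encoding=dialect.encoding) as csv_file:
        reader = csv.reader(csv_file, delimiter=dialect.delimiter)
        return next(reader, [])

def get_number_of_columns(path: str, dialect) -> int:
    """ Assumed helper: number of columns in the first row of the file. """
    return len(get_header(path=path, dialect=dialect))
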
Example #3
def append_column(input_path: str, output_path: str, column: Union[str, int],
                  dialect: Dialect) -> None:
    """ Append a column into input_path, then dump the result file into output_path. """
    if not is_an_int(column):
        index_column = find_index_with_class(path=input_path,
                                             class_name=column,
                                             dialect=dialect)
        append_column(input_path, output_path, index_column, dialect)
    else:
        content = get_csv_content(path=input_path,
                                  skip_header=False,
                                  dialect=dialect)
        # Move the column to the end of each row
        for line in content:
            line.append(line.pop(column))
        dump_csv_content(path=output_path, content=content, dialect=dialect)
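
A quick illustration of what the loop above does to a single row (purely hypothetical values):

# The element at the given index is popped and re-appended at the end of the row.
row = ["id", "feature_a", "feature_b", "label"]
row.append(row.pop(0))
print(row)  # ['feature_a', 'feature_b', 'label', 'id']
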
Example #4
def keep_distribution(input_reader, row_limit: int, out_writers,
                      number_of_trees: int, class_name: Union[str, int],
                      number_of_rows: int) -> List[int]:
    """ Splits a CSV file into multiple pieces with the `keep_distribution` method.
    The keep_distribution method regroup the database content into its class subgroup. then redistribute each instance
    with the same proportion as the initial content.
    """
    rows_count = [0 for _ in range(number_of_trees)]

    if is_an_int(class_name):
        class_name = int(class_name)

    # We store rows into the distribution dictionary
    distribution_dictionary = dict()
    for row in input_reader:
        if row[class_name] in distribution_dictionary:
            distribution_dictionary[row[class_name]].append(row)
        else:
            distribution_dictionary[row[class_name]] = [row]

    # Then we distribute the rows proportionally
    percentage_per_db = row_limit / number_of_rows
    for class_name in distribution_dictionary.keys():
        rows_to_give = int(
            round(
                len(distribution_dictionary[class_name]) * percentage_per_db))
        for index, writer in enumerate(out_writers[:-1]):
            for _ in range(rows_to_give):
                try:
                    writer.writerow(distribution_dictionary[class_name].pop(0))
                    rows_count[index] += 1
                except IndexError:
                    raise TooManyTreesToSplit(number_of_trees, number_of_rows,
                                              percentage_per_db)

        # Then the rest to the last writer
        for row in distribution_dictionary[class_name]:
            out_writers[-1].writerow(row)
            rows_count[-1] += 1

    return rows_count
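
A small worked example of the proportional split (all numbers hypothetical): with 1000 rows in total, a row limit of 250 per piece, and a class that owns 400 of those rows, each of the first writers receives 100 rows of that class, and the last writer gets whatever is left.

number_of_rows = 1000   # total rows in the database (hypothetical)
row_limit = 250         # target rows per output piece (hypothetical)
class_size = 400        # rows belonging to one class (hypothetical)

percentage_per_db = row_limit / number_of_rows              # 0.25
rows_to_give = int(round(class_size * percentage_per_db))   # 100 rows per writer
print(percentage_per_db, rows_to_give)                      # 0.25 100
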
Example #5
def keep_distribution2(input_reader, row_limit, out_writer_train,
                       out_writer_test, class_name: Union[str, int],
                       number_of_rows: int) -> Tuple[int, int]:
    """ Splits a CSV file into two pieces with the `keep_distribution` method.
    The keep_distribution method regroup the database content into its class subgroup. then redistribute each instance
    with the same proportion as the initial content.
    """
    row_count_train, row_count_test = 0, 0

    if is_an_int(class_name):
        class_name = int(class_name)

    # We store rows into the distribution dictionary
    distribution_dictionary = dict()
    for row in input_reader:
        if row[class_name] in distribution_dictionary:
            distribution_dictionary[row[class_name]].append(row)
        else:
            distribution_dictionary[row[class_name]] = [row]

    # Then we distribute the rows proportionally
    percentage_train = row_limit / number_of_rows
    for class_name in distribution_dictionary.keys():
        # Distribute to train
        rows_to_give = int(
            round(len(distribution_dictionary[class_name]) * percentage_train))
        row_count_train += rows_to_give
        for _ in range(rows_to_give):
            row_to_write = distribution_dictionary[class_name].pop(0)
            out_writer_train.writerow(row_to_write)

        # Then the rest to test
        for row in distribution_dictionary[class_name]:
            out_writer_test.writerow(row)
            row_count_test += 1

    return row_count_train, row_count_test
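
A hypothetical usage of `keep_distribution2` with the standard `csv` module, assuming a header-less `data.csv` of 1000 rows whose class sits in the last column (the file names and counts are made up):

import csv

with open("data.csv", newline='') as source, \
        open("train.csv", "w", newline='') as train_file, \
        open("test.csv", "w", newline='') as test_file:
    n_train, n_test = keep_distribution2(
        input_reader=csv.reader(source),
        row_limit=800,                         # ~80% of the 1000 rows go to train
        out_writer_train=csv.writer(train_file),
        out_writer_test=csv.writer(test_file),
        class_name=-1,                         # class column is the last one
        number_of_rows=1000)
    print(n_train, n_test)
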
Example #6
def _clean_column_index_or_name(args: dict, param_name: str,
                                column_name: str) -> None:
    """ If the specified name value is a column name, convert it to it's respective index. Otherwise, check if it's
    inbounds and convert it to an integer.
    """
    if not is_an_int(args[param_name]) and isinstance(args[param_name], str):
        # User asked for a named class, we retrieve its index then change it
        args[param_name] = find_index_with_class(
            path=args[gpn.database()],
            class_name=args[param_name],
            dialect=Dialect(encoding=args[gpn.encoding_input()],
                            delimiter=args[gpn.delimiter_input()],
                            quoting=args[gpn.quoting_input()],
                            quote_char=args[gpn.quote_char_input()],
                            skip_initial_space=True))
    else:
        # User asked for an index, we convert it to int then check that it is in bounds
        args[param_name] = int(args[param_name])
        if not index_in_bounds(input_path=args[gpn.database()],
                               index=args[param_name],
                               dialect=Dialect(
                                   encoding=args[gpn.encoding_input()],
                                   delimiter=args[gpn.delimiter_input()],
                                   quoting=args[gpn.quoting_input()],
                                   quote_char=args[gpn.quote_char_input()],
                                   skip_initial_space=True)):
            raise IndexOutOfBounds(
                index=args[param_name],
                column=column_name,
                length=get_number_of_columns(
                    path=args[gpn.database()],
                    dialect=Dialect(encoding=args[gpn.encoding_input()],
                                    delimiter=args[gpn.delimiter_input()],
                                    quoting=args[gpn.quoting_input()],
                                    quote_char=args[gpn.quote_char_input()],
                                    skip_initial_space=True)))
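
`index_in_bounds` is another helper assumed by this example; a minimal sketch of the check it presumably performs, allowing the negative indices used elsewhere in these examples:

def index_in_bounds(input_path: str, index: int, dialect) -> bool:
    """ Assumed helper: True if `index` is a valid (possibly negative) column index. """
    number_of_columns = get_number_of_columns(path=input_path, dialect=dialect)
    return -number_of_columns <= index < number_of_columns
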