Example #1
0
def run(args):
    """Split the rows of the input file into one output file per named set.

    Reads ``args.input`` (with the delimiter chosen from its filename),
    optionally shuffles the rows, splits them according to the proportions
    extracted from ``args`` and writes one file per set. If output files
    already exist, their previous content is first saved under a backup
    suffix.

    Args:
        args: parsed command line arguments. The attributes read here are
            ``input``, ``out``, ``no_header``, ``random``, ``fill`` and
            ``no_extend_existing``.
    """
    LOGGER.debug('args: %s', args)
    process_args(args)

    file_ext = get_ext(args.input)
    proportions = extract_proportions_from_args(args)
    set_names = [set_name for set_name, _ in proportions]
    output_filenames = output_filenames_for_names(
        set_names, args.out, file_ext)

    LOGGER.info('proportions: %s', proportions)
    LOGGER.info('output_filenames: %s', output_filenames)

    delimiter = csv_delimiter_by_filename(args.input)
    header_row, data_rows = read_csv_with_header(
        args.input, delimiter, args.no_header)
    LOGGER.info('number of rows: %d', len(data_rows))

    if args.random:
        shuffle(data_rows)

    # Loaded regardless of no_extend_existing: the previous content is
    # still needed for the backup written further down.
    existing_file_sets = load_file_sets_or_none(
        output_filenames, delimiter, args.no_header)

    set_proportions = [proportion for _, proportion in proportions]
    if args.no_extend_existing:
        split_to_extend = None
    else:
        split_to_extend = existing_file_sets
    data_rows_by_set = split_rows(
        data_rows, set_proportions,
        fill=args.fill, existing_split=split_to_extend)

    if existing_file_sets:
        # Keep a copy of what was there before the outputs are overwritten.
        backup_suffix = get_backup_file_suffix()
        backup_filenames = [
            filename + backup_suffix for filename in output_filenames
        ]
        save_file_sets(
            backup_filenames, delimiter, header_row, existing_file_sets)

    save_file_sets(output_filenames, delimiter, header_row, data_rows_by_set)
 def __init__(self, fp, filename=None, fields=None):
     """Wrap ``fp`` in a CSV writer and immediately write the header row.

     Args:
         fp: writable file object the rows are written to.
         filename: passed to ``csv_delimiter_by_filename`` to choose the
             delimiter.
         fields: column names for the header row; when falsy,
             ``DEFAULT_MATCH_DEBUG_COLUMNS`` is used instead.
     """
     self.fp = fp
     if fields:
         self.fields = fields
     else:
         self.fields = DEFAULT_MATCH_DEBUG_COLUMNS
     delimiter = csv_delimiter_by_filename(filename)
     self.writer = csv.writer(fp, delimiter=delimiter)
     # The header goes out as soon as the writer exists.
     self.writer.writerow(self.fields)
     # Starts at 1; presumably a running id for subsequently written rows
     # (its use is outside this method).
     self.id = 1
Example #3
0
def save_file_pairs_to_csv(output_path, source_xml_pairs):
    """Write source / xml url pairs to a CSV or TSV file.

    The parent directory of ``output_path`` is created if needed. The
    delimiter is chosen from the output filename and the file is opened with
    a matching mime type. A ``source_url`` / ``xml_url`` header row is
    written before the pairs.

    Args:
        output_path: destination path of the CSV / TSV file.
        source_xml_pairs: rows to write after the header row.
    """
    mkdirs_if_not_exists(dirname(output_path))
    delimiter = csv_delimiter_by_filename(output_path)
    if delimiter == '\t':
        mime_type = 'text/tsv'
    else:
        mime_type = 'text/csv'
    header_rows = [['source_url', 'xml_url']]
    with open_file(output_path, 'w', mime_type=mime_type) as output_file:
        csv_writer = csv.writer(output_file, delimiter=text_type(delimiter))
        # Header first, then the actual pairs.
        for rows in (header_rows, source_xml_pairs):
            write_csv_rows(csv_writer, rows)
    LOGGER.info('written results to %s', output_path)
Example #4
0
 def __init__(self, filename, header=True, limit=None):
     """Store the settings for reading ``filename`` as dict rows.

     Args:
         filename: path of the CSV / TSV file; also used to choose the
             delimiter via ``csv_delimiter_by_filename``.
         header: must be truthy, files without a header are not supported.
         limit: stored as-is on the instance; how it is applied is outside
             this method.

     Raises:
         RuntimeError: if ``header`` is falsy.
     """
     super(ReadDictCsv, self).__init__()
     if not header:
         raise RuntimeError('header required')

     # Where and how to read.
     self.filename = filename
     self.delimiter = csv_delimiter_by_filename(filename)
     self.limit = limit

     # Initial state: no columns known yet, row counter at zero.
     self.columns = None
     self.row_num = 0
Example #5
0
def load_csv_or_tsv_file_list(file_list_path, column, header=True, limit=None):
    """Load the values of a single column from a CSV or TSV file.

    Args:
        file_list_path: path of the file to read; the delimiter is chosen
            from it via ``csv_delimiter_by_filename``.
        column: either an integer column index, or a column name that is
            looked up in the header row. Must be an integer when ``header``
            is falsy, because there is no header row to resolve a name.
        header: whether the first row is a header row. When truthy the first
            row is always consumed and never returned as data.
        limit: maximum number of values to return. Any falsy value
            (``None`` or ``0``) means no limit.

    Returns:
        list: the value of the selected column for each (remaining) row.

    Raises:
        TypeError: if ``header`` is falsy and ``column`` is not an integer.
        ValueError: if ``column`` is a name that is not in the header row.
    """
    # Explicit check instead of ``assert``: assertions are removed when
    # Python runs with -O, which would let an invalid column through.
    # Done before opening the file so bad arguments fail without any I/O.
    if not header and not isinstance(column, int):
        raise TypeError(
            'column must be an integer index when there is no header,'
            ' got: %r' % (column,))
    delimiter = csv_delimiter_by_filename(file_list_path)
    with open_file(file_list_path, 'r') as f:
        reader = csv.reader(f, delimiter=text_type(delimiter))
        if not header:
            column_index = column
        else:
            header_row = next(reader)
            if isinstance(column, int):
                column_index = column
            else:
                try:
                    column_index = header_row.index(column)
                except ValueError as exc:
                    raise ValueError(
                        'column %s not found, available columns: %s' %
                        (column, header_row)) from exc
        lines = (x[column_index] for x in reader)
        if limit:
            lines = islice(lines, 0, limit)
        # Materialize while the file is still open; the generator is lazy.
        return list(lines)
Example #6
0
 def __init__(self, path, columns, file_name_suffix=None):
     """Store the settings for writing dict rows to a CSV / TSV file.

     Args:
         path: output path (without the suffix).
         columns: stored as-is on the instance.
         file_name_suffix: optional suffix of the output filename; stored
             unchanged (including ``None``) on the instance.
     """
     super(WriteDictCsv, self).__init__()
     self.path = path
     self.columns = columns
     self.file_name_suffix = file_name_suffix
     # The suffix defaults to None; concatenating None to the path would
     # raise a TypeError, so a missing suffix is treated as empty when
     # choosing the delimiter.
     self.delimiter = csv_delimiter_by_filename(
         path + (file_name_suffix or ''))