Example #1
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        rows = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)
        output = agate.writer(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for row in rows:
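            # Pad short rows with None so every selected column yields a cell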
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            if self.args.delete_empty:
                # out_row may contain None for padded cells, which ''.join() would reject
                if ''.join(c or '' for c in out_row) == '':
                    continue

            output.writerow(out_row)
Example #2
    def test_string_match(self):
        args = [
            '-c', '1', '-m', 'ILLINOIS',
            'examples/realdata/FY09_EDU_Recipients_by_State.csv'
        ]
        output_file = six.StringIO()
        utility = CSVGrep(args, output_file)

        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), [
            'State Name', 'State Abbreviate', 'Code',
            'Montgomery GI Bill-Active Duty',
            'Montgomery GI Bill- Selective Reserve',
            'Dependents\' Educational Assistance',
            'Reserve Educational Assistance Program',
            'Post-Vietnam Era Veteran\'s Educational Assistance Program',
            'TOTAL', ''
        ])
        self.assertEqual(next(reader), [
            'ILLINOIS', 'IL', '17', '15,659', '2,491', '2,025', '1,770', '19',
            '21,964', ''
        ])
Example #3
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.columns:
            self.argparser.error('You must specify at least one column to search using the -c option.')

        if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
            self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

        rows = agate.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
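            # Match by exact membership: a cell matches if it equals one of the file's lines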
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern

        patterns = dict((c, pattern) for c in column_ids)

        output = agate.writer(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for row in filter_reader:
            output.writerow(row)
Example #4
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if not self.input_files:
            self.argparser.error('You must specify at least one file to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.input_files]
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.input_files):
                self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
        else:
            groups = None

        group_name = self.args.group_name if self.args.group_name else 'group'

        output = agate.writer(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.input_files):
            rows = agate.reader(f, **self.reader_kwargs)

            # If we have header rows, use them
            if not self.args.no_header_row:
                headers = next(rows, [])

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)
            # If not, generate default column names based on the first row
            else:
                row = next(rows, [])

                headers = make_default_headers(len(row))

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)

                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            f.close()
Example #5
    def main(self):
        rows = agate.reader(self.input_file, **self.reader_kwargs)

        # Make a default header row if none exists
        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_names = list(column_names)

        # Prepend a 'line_number' column if the --linenumbers option was given
        if self.args.line_numbers:
            column_names.insert(0, 'line_number')
            rows = [
                list(itertools.chain([str(i + 1)], row))
                for i, row in enumerate(rows)
            ]

        # Convert to normal list of rows
        rows = list(rows)

        # Insert the column names at the top
        rows.insert(0, column_names)

        widths = []

        for row in rows:
            for i, v in enumerate(row):
                try:
                    if len(v) > widths[i]:
                        widths[i] = len(v)
                except IndexError:
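                    # First row to reach column i; record its width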
                    widths.append(len(v))

        # Dashes span each width with '+' character at intersection of
        # horizontal and vertical dividers.
        divider = '|--' + '-+-'.join('-' * w for w in widths) + '--|'

        self.output_file.write('%s\n' % divider)

        for i, row in enumerate(rows):
            output = []

            for j, d in enumerate(row):
                if d is None:
                    d = ''
                output.append(' %s ' % six.text_type(d).ljust(widths[j]))

            self.output_file.write('| %s |\n' % ('|'.join(output)))

            if (i == 0 or i == len(rows) - 1):
                self.output_file.write('%s\n' % divider)
Example #6
    def main(self):
        reader = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.dryrun:
            checker = RowChecker(reader)

            for row in checker.checked_rows():
                pass

            if checker.errors:
                for e in checker.errors:
                    self.output_file.write('Line %i: %s\n' %
                                           (e.line_number, e.msg))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write(
                    '%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n'
                    % (checker.rows_joined, checker.joins))
        else:
            base, ext = splitext(self.input_file.name)

            with open('%s_out.csv' % base, 'w') as f:
                clean_writer = agate.writer(f, **self.writer_kwargs)

                checker = RowChecker(reader)
                clean_writer.writerow(checker.column_names)

                for row in checker.checked_rows():
                    clean_writer.writerow(row)

            if checker.errors:
                error_filename = '%s_err.csv' % base

                with open(error_filename, 'w') as f:
                    error_writer = agate.writer(f, **self.writer_kwargs)

                    error_header = ['line_number', 'msg']
                    error_header.extend(checker.column_names)
                    error_writer.writerow(error_header)

                    error_count = len(checker.errors)

                    for e in checker.errors:
                        error_writer.writerow(self._format_error_row(e))

                self.output_file.write(
                    '%i error%s logged to %s\n' %
                    (error_count, '' if error_count == 1 else 's',
                     error_filename))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write(
                    '%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n'
                    % (checker.rows_joined, checker.joins))
Example #7
    def test_no_match(self):
        args = ['-c', '1', '-m', 'NO MATCH', 'examples/dummy.csv']
        output_file = six.StringIO()
        utility = CSVGrep(args, output_file)

        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), ['a', 'b', 'c'])
Example #8
    def __init__(self, schema):
        self.fields = []  # A list of FixedWidthFields

        schema_reader = agate.reader(schema)
        schema_decoder = SchemaDecoder(next(schema_reader))

        for i, row in enumerate(schema_reader):
            try:
                self.fields.append(schema_decoder(row))
            except Exception as e:
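                # "+ 2" converts the zero-based index to a one-based line number and skips the header row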
                raise ValueError("Error reading schema at line %i: %s" % (i + 2, e))
Example #9
    def main(self):
        reader = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.dryrun:
            checker = RowChecker(reader)

            for row in checker.checked_rows():
                pass

            if checker.errors:
                for e in checker.errors:
                    self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
        else:
            if self.input_file == sys.stdin:
                base = 'stdin'  # "<stdin>_out.csv" is invalid on Windows
            else:
                base = splitext(self.input_file.name)[0]

            with open('%s_out.csv' % base, 'w') as f:
                clean_writer = agate.writer(f, **self.writer_kwargs)

                checker = RowChecker(reader)
                clean_writer.writerow(checker.column_names)

                for row in checker.checked_rows():
                    clean_writer.writerow(row)

            if checker.errors:
                error_filename = '%s_err.csv' % base

                with open(error_filename, 'w') as f:
                    error_writer = agate.writer(f, **self.writer_kwargs)

                    error_header = ['line_number', 'msg']
                    error_header.extend(checker.column_names)
                    error_writer.writerow(error_header)

                    error_count = len(checker.errors)

                    for e in checker.errors:
                        error_writer.writerow(self._format_error_row(e))

                self.output_file.write('%i error%s logged to %s\n' % (error_count, '' if error_count == 1 else 's', error_filename))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
Example #10
    def test_exclude(self):
        args = ['-C', '1,3', 'examples/dummy.csv']
        output_file = six.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), ['b'])
        self.assertEqual(next(reader), ['2'])
Example #11
    def test_no_header_row(self):
        args = ['-c', '2', '--no-header-row', 'examples/no_header_row.csv']
        output_file = six.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), ['column2'])
        self.assertEqual(next(reader), ['2'])
Example #12
    def test_display_column_names(self):
        args = ['-n', 'examples/realdata/FY09_EDU_Recipients_by_State.csv']
        output_file = six.StringIO()

        utility = CSVGrep(args, output_file)
        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), ['  1: State Name'])
        self.assertEqual(next(reader), ['  2: State Abbreviate'])
Example #13
    def test_with_bzip2(self):
        args = ['-c', '1,3', 'examples/dummy.csv.bz2']
        output_file = six.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), ['a', 'c'])
        self.assertEqual(next(reader), ['1', '3'])
Example #14
    def test_unicode(self):
        args = ['-c', '1,3', 'examples/test_utf8.csv']
        output_file = six.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), ['a', 'c'])
        self.assertEqual(next(reader), ['1', '3'])
        self.assertEqual(next(reader), ['4', u'ʤ'])
Example #15
    def test_no_inference(self):
        args = ['--no-inference', '-c', '1', 'examples/test_literal_order.csv']
        output_file = six.StringIO()
        utility = CSVSort(args, output_file)

        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        test_order = [u'a', u'192', u'27', u'3']
        new_order = [six.text_type(r[0]) for r in reader]

        self.assertEqual(test_order, new_order)
Example #16
    def test_no_header_row(self):
        args = ['--no-header-row', '-c', '1', '-r', 'examples/no_header_row3.csv']
        output_file = six.StringIO()
        utility = CSVSort(args, output_file)

        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        test_order = ['column1', '4', '1']
        new_order = [six.text_type(r[0]) for r in reader]

        self.assertEqual(test_order, new_order)
Example #17
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.columns:
            self.argparser.error(
                'You must specify at least one column to search using the -c option.'
            )

        if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
            self.argparser.error(
                'One of -r, -m or -f must be specified, unless using the -n option.'
            )

        rows = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based)

        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern

        patterns = dict((c, pattern) for c in column_ids)

        output = agate.writer(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows,
                                           header=False,
                                           patterns=patterns,
                                           inverse=self.args.inverse)

        for row in filter_reader:
            output.writerow(row)
Example #18
File: cli.py Project: sukic/csvkit
    def get_rows_and_column_names_and_column_ids(self):
        rows = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            # Peek at a row to get the number of columns.
            row = next(rows)
            rows = itertools.chain([row], rows)
            column_names = make_default_headers(len(row))
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, getattr(self.args, 'not_columns', None))

        return rows, column_names, column_ids
Example #19
    def test_sort_date(self):
        args = ['-c', '2', 'examples/testxls_converted.csv']
        output_file = six.StringIO()
        utility = CSVSort(args, output_file)

        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        test_order = [u'text', u'This row has blanks', u'Unicode! Σ', u'Chicago Tribune', u'Chicago Sun-Times', u'Chicago Reader']
        new_order = [six.text_type(r[0]) for r in reader]

        self.assertEqual(test_order, new_order)
Example #20
    def test_sort_ints_and_nulls(self):
        args = ['-c', '2', 'examples/sort_ints_nulls.csv']

        output_file = six.StringIO()
        utility = CSVSort(args, output_file)

        utility.main()

        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        test_order = ['b', '', '1', '2']
        new_order = [six.text_type(r[1]) for r in reader]

        self.assertEqual(test_order, new_order)
Example #21
    def test_no_grouping(self):
        # stack two CSV files
        args = ["examples/dummy.csv", "examples/dummy2.csv"]
        output_file = six.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), ["a", "b", "c"])
        self.assertEqual(next(reader)[0], "1")
        self.assertEqual(next(reader)[0], "1")
Example #22
    def test_no_header_row(self):
        # stack two CSV files
        args = ["--no-header-row", "examples/no_header_row.csv", "examples/no_header_row2.csv"]
        output_file = six.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader)[0], "column1")
        self.assertEqual(next(reader)[0], "1")
        self.assertEqual(next(reader)[0], "4")
Example #23
    def test_filenames_grouping(self):
        # stack two CSV files
        args = ['--filenames', '-n', 'path', 'examples/dummy.csv', 'examples/dummy2.csv']
        output_file = six.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), ['path', 'a', 'b', 'c'])
        self.assertEqual(next(reader)[0], 'dummy.csv')
        self.assertEqual(next(reader)[0], 'dummy2.csv')
Example #24
    def test_single_file_stack(self):
        # stacking single file works fine
        args = ['examples/dummy.csv']

        output_file = six.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), ['a', 'b', 'c'])
        self.assertEqual(next(reader)[0], '1')
Example #25
    def test_multiple_file_stack(self):
        # stacking multiple files works fine
        args = ['examples/dummy.csv', 'examples/dummy2.csv']

        output_file = six.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = six.StringIO(output_file.getvalue())
        reader = agate.reader(input_file)

        self.assertEqual(next(reader), ['a', 'b', 'c'])
        self.assertEqual(next(reader)[0], '1')
        self.assertEqual(next(reader)[0], '1')
Example #26
    def get_rows_and_column_names_and_column_ids(self, **kwargs):
        rows = agate.reader(self.input_file, **kwargs)

        if self.args.no_header_row:
            # Peek at a row to get the number of columns.
            row = next(rows)
            rows = itertools.chain([row], rows)
            column_names = make_default_headers(len(row))
        else:
            column_names = next(rows)

        column_offset = self.get_column_offset()
        if self.args.line_numbers:
            column_offset -= 1

        column_ids = parse_column_identifiers(
            self.args.columns, column_names, column_offset,
            getattr(self.args, 'not_columns', None))

        return rows, column_names, column_ids
Example #27
File: cli.py Project: sukic/csvkit
    def print_column_names(self):
        """
        Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
        """
        if getattr(self.args, 'no_header_row', None):
            raise RequiredHeaderError('You cannot use --no-header-row with the -n or --names options.')

        f = self.input_file
        output = self.output_file

        try:
            zero_based = self.args.zero_based
        except AttributeError:
            zero_based = False

        rows = agate.reader(f, **self.reader_kwargs)
        column_names = next(rows)

        for i, c in enumerate(column_names):
            if not zero_based:
                i += 1
            output.write('%3i: %s\n' % (i, c))
Example #28
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if len(self.input_files) < 2:
            self.argparser.error('You must specify at least two files to join.')

        if self.args.columns:
            join_column_names = self._parse_join_column_names(self.args.columns)

            if len(join_column_names) == 1:
                join_column_names = join_column_names * len(self.input_files)

            if len(join_column_names) != len(self.input_files):
                self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

        if (self.args.left_join or self.args.right_join or self.args.outer_join) and not self.args.columns:
            self.argparser.error('You must provide join column names when performing an outer join.')

        if self.args.left_join and self.args.right_join:
            self.argparser.error('It is not valid to specify both a left and a right join.')

        tables = []

        for f in self.input_files:
            tables.append(list(agate.reader(f, **self.reader_kwargs)))
            f.close()

        join_column_ids = []

        if self.args.columns:
            for i, t in enumerate(tables):
                join_column_ids.append(match_column_identifier(t[0], join_column_names[i]))

        jointab = []

        if self.args.left_join:
            # Left outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        elif self.args.right_join:
            # Right outer join
            jointab = tables[-1]

            remaining_tables = tables[:-1]
            remaining_tables.reverse()

            for i, t in enumerate(remaining_tables):
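                # remaining_tables is reversed, so t at index i pairs with join_column_ids[-(i + 2)]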
                jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1])
        elif self.args.outer_join:
            # Full outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        else:
            if self.args.columns:
                # Inner join
                jointab = tables[0]

                for i, t in enumerate(tables[1:]):
                    jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
            else:
                jointab = tables[0]

                # Sequential join
                for t in tables[1:]:
                    jointab = join.sequential_join(jointab, t)

        output = agate.writer(self.output_file, **self.writer_kwargs)

        for row in jointab:
            output.writerow(row)
Example #29
    def main(self):
        operations = [
            op for op in OPERATIONS if getattr(self.args, op + '_only')
        ]

        if len(operations) > 1:
            self.argparser.error(
                'Only one statistic argument may be specified (mean, median, etc).'
            )

        if operations and self.args.count_only:
            self.argparser.error(
                'You may not specify --count and a statistical argument at the same time.'
            )

        if self.args.count_only:
            count = len(list(agate.reader(self.input_file)))

            if not self.args.no_header_row:
                count -= 1

            self.output_file.write('Row count: %i\n' % count)

            return

        tab = table.Table.from_csv(self.input_file,
                                   snifflimit=self.args.snifflimit,
                                   column_ids=self.args.columns,
                                   zero_based=self.args.zero_based,
                                   no_header_row=self.args.no_header_row,
                                   **self.reader_kwargs)

        for c in tab:
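            # Ignore null cells when computing statistics for this column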
            values = sorted(filter(lambda i: i is not None, c))

            stats = {}

            # Output a single stat
            if len(operations) == 1:
                op = operations[0]
                stat = getattr(self, 'get_%s' % op)(c, values, {})

                # Formatting
                if op == 'unique':
                    stat = len(stat)
                elif op == 'freq':
                    stat = ', '.join([('"%s": %s' % (six.text_type(k), count))
                                      for k, count in stat])
                    stat = '{ %s }' % stat

                if len(tab) == 1:
                    self.output_file.write(six.text_type(stat))
                else:
                    self.output_file.write('%3i. %s: %s\n' %
                                           (c.order + 1, c.name, stat))
            # Output all stats
            else:
                for op in OPERATIONS:
                    stats[op] = getattr(self, 'get_%s' % op)(c, values, stats)

                self.output_file.write(('%3i. %s\n' % (c.order + 1, c.name)))

                if c.type is None:
                    self.output_file.write('\tEmpty column\n')
                    continue

                self.output_file.write('\t%s\n' % c.type)
                self.output_file.write('\tNulls: %s\n' % stats['nulls'])

                if len(stats['unique']) <= MAX_UNIQUE and c.type is not bool:
                    uniques = [six.text_type(u) for u in list(stats['unique'])]
                    data = u'\tValues: %s\n' % ', '.join(uniques)
                    self.output_file.write(data)
                else:
                    if c.type not in [six.text_type, bool]:
                        self.output_file.write('\tMin: %s\n' % stats['min'])
                        self.output_file.write('\tMax: %s\n' % stats['max'])

                        if c.type in [int, float]:
                            self.output_file.write('\tSum: %s\n' %
                                                   stats['sum'])
                            self.output_file.write('\tMean: %s\n' %
                                                   stats['mean'])
                            self.output_file.write('\tMedian: %s\n' %
                                                   stats['median'])
                            self.output_file.write(
                                '\tStandard Deviation: %s\n' % stats['stdev'])

                    self.output_file.write('\tUnique values: %i\n' %
                                           len(stats['unique']))

                    if len(stats['unique']) != len(values):
                        self.output_file.write('\t%i most frequent values:\n' %
                                               MAX_FREQ)
                        for value, count in stats['freq']:
                            self.output_file.write(
                                ('\t\t%s:\t%s\n' %
                                 (six.text_type(value), count)))

                    if c.type == six.text_type:
                        self.output_file.write('\tMax length: %i\n' %
                                               stats['len'])

        if not operations:
            self.output_file.write('\n')
            self.output_file.write('Row count: %s\n' % tab.count_rows())
Example #30
    def main(self):
        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' %
                                     self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            # Streaming CSV mustn't set sniff_limit, but non-streaming should.
            if not self.args.no_inference:
                kwargs['sniff_limit'] = self.args.sniff_limit
            if self.args.no_header_row:
                kwargs['header'] = False
        elif self.args.no_inference:
            # Streaming CSV mustn't set column_types, but other formats should.
            kwargs['column_types'] = agate.TypeTester(limit=0)

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference:
            reader = agate.reader(self.input_file, **self.reader_kwargs)
            writer = agate.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(
                fixed2csv(self.input_file,
                          schema,
                          output=self.output_file,
                          **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              newline=True,
                                              **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file,
                                             sheet=kwargs.get('sheet', None))
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file,
                                              sheet=kwargs.get('sheet', None))
            elif filetype == 'dbf':
                if not hasattr(self.input_file, 'name'):
                    raise ValueError(
                        'DBF files can not be converted from stdin. You must pass a filename.'
                    )
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)
Example #31
    def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table).
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # snifflimit == 0 means do not sniff
        if snifflimit is None:
            kwargs['dialect'] = sniff_dialect(contents)
        elif snifflimit > 0:
            kwargs['dialect'] = sniff_dialect(contents[:snifflimit])

        f = six.StringIO(contents)
        rows = agate.reader(f, **kwargs)

        try:
            if no_header_row:
                # Peek at a row to infer column names from, and put it back on top
                row = next(rows)
                rows = itertools.chain([row], rows)
                headers = make_default_headers(len(row))
            else:
                headers = next(rows)
        except StopIteration:
            # The input is empty (e.g. /dev/null).
            headers = []

        if no_header_row or column_ids:
            column_ids = parse_column_identifiers(column_ids, headers, zero_based)
            headers = [headers[c] for c in column_ids]
        else:
            column_ids = range(len(headers))

        data_columns = [[] for c in headers]
        width = len(data_columns)

        for i, row in enumerate(rows):
            j = 0

            for j, d in enumerate(row):
                try:
                    data_columns[j].append(row[column_ids[j]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

            j += 1

            # Populate remaining columns with None
            while j < width:
                data_columns[j].append(None)

                j += 1

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types))

        return Table(columns, name=name)
Example #32
    def get_output_as_reader(self, args):
        return agate.reader(self.get_output_as_io(args))
Example #33
    def main(self):
        reader = agate.reader(self.input_file, **self.reader_kwargs)

        writer = agate.writer(self.output_file, **self.writer_kwargs)

        writer.writerows(reader)
Example #34
    def main(self):
        if six.PY2:
            stream = codecs.getwriter('utf-8')(self.output_file)
        else:
            stream = self.output_file

        json_kwargs = {
            'ensure_ascii': False,
            'indent': self.args.indent,
        }

        if six.PY2:
            json_kwargs['encoding'] = 'utf-8'

        def dump_json(data, newline=False):
            json.dump(data, stream, **json_kwargs)
            if newline:
                stream.write("\n")

        """
        Convert CSV to JSON.
        """
        if self.args.lat and not self.args.lon:
            self.argparser.error('--lon is required whenever --lat is specified.')

        if self.args.lon and not self.args.lat:
            self.argparser.error('--lat is required whenever --lon is specified.')

        if self.args.crs and not self.args.lat:
            self.argparser.error('--crs is only allowed when --lat and --lon are also specified.')

        if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key):
            self.argparser.error('--stream is only allowed if --lat, --lon and --key are not specified.')

        # GeoJSON
        if self.args.lat and self.args.lon:
            rows = agate.reader(self.input_file, **self.reader_kwargs)
            column_names = next(rows)

            features = []
            min_lon = None
            min_lat = None
            max_lon = None
            max_lat = None

            lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based)
            lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based)

            if self.args.key:
                id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based)
            else:
                id_column = None

            for row in rows:
                feature = OrderedDict()
                feature['type'] = 'Feature'
                properties = OrderedDict()
                geoid = None
                lat = None
                lon = None

                for i, c in enumerate(row):
                    if i == lat_column:
                        try:
                            lat = float(c)
                        except ValueError:
                            lat = None

                        if min_lat is None or lat < min_lat:
                            min_lat = lat

                        if max_lat is None or lat > max_lat:
                            max_lat = lat
                    elif i == lon_column:
                        try:
                            lon = float(c)
                        except ValueError:
                            lon = None

                        if min_lon is None or lon < min_lon:
                            min_lon = lon

                        if max_lon is None or lon > max_lon:
                            max_lon = lon
                    elif id_column is not None and i == id_column:
                        geoid = c
                    else:
                        properties[column_names[i]] = c

                if id_column is not None:
                    feature['id'] = geoid

                feature['geometry'] = OrderedDict([
                    ('type', 'Point'),
                    ('coordinates', [lon, lat])
                ])

                feature['properties'] = properties

                features.append(feature)

            output = OrderedDict([
                ('type', 'FeatureCollection'),
                ('bbox', [min_lon, min_lat, max_lon, max_lat]),
                ('features', features)
            ])

            if self.args.crs:
                output['crs'] = OrderedDict([
                    ('type', 'name'),
                    ('properties', {
                        'name': self.args.crs
                    })
                ])
            dump_json(output)
        elif self.args.streamOutput and self.args.no_inference:
            rows = agate.reader(self.input_file, **self.reader_kwargs)
            column_names = next(rows)

            for row in rows:
                data = OrderedDict()
                for i, column in enumerate(column_names):
                    try:
                        data[column] = row[i]
                    except IndexError:
                        data[column] = None
                dump_json(data, newline=True)
        else:
            table = agate.Table.from_csv(self.input_file, sniff_limit=self.args.sniff_limit, column_types=self.get_column_types())
            table.to_json(stream, key=self.args.key, newline=self.args.streamOutput, indent=self.args.indent)
Example #35
    def main(self):
        if six.PY2:
            stream = codecs.getwriter('utf-8')(self.output_file)
        else:
            stream = self.output_file

        json_kwargs = {
            'ensure_ascii': False,
            'indent': self.args.indent,
        }

        if six.PY2:
            json_kwargs['encoding'] = 'utf-8'

        def dump_json(data, newline=False):
            json.dump(data, stream, **json_kwargs)
            if newline:
                stream.write("\n")

        """
        Convert CSV to JSON.
        """
        if self.args.lat and not self.args.lon:
            self.argparser.error(
                '--lon is required whenever --lat is specified.')

        if self.args.lon and not self.args.lat:
            self.argparser.error(
                '--lat is required whenever --lon is specified.')

        if self.args.crs and not self.args.lat:
            self.argparser.error(
                '--crs is only allowed when --lat and --lon are also specified.'
            )

        if self.args.streamOutput and (self.args.lat or self.args.lon
                                       or self.args.key):
            self.argparser.error(
                '--stream is only allowed if --lat, --lon and --key are not specified.'
            )

        rows = agate.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        # GeoJSON
        if self.args.lat and self.args.lon:
            features = []
            min_lon = None
            min_lat = None
            max_lon = None
            max_lat = None

            lat_column = match_column_identifier(column_names, self.args.lat,
                                                 self.args.zero_based)
            lon_column = match_column_identifier(column_names, self.args.lon,
                                                 self.args.zero_based)

            if self.args.key:
                id_column = match_column_identifier(column_names,
                                                    self.args.key,
                                                    self.args.zero_based)
            else:
                id_column = None

            for row in rows:
                feature = OrderedDict()
                feature['type'] = 'Feature'
                properties = OrderedDict()
                geoid = None
                lat = None
                lon = None

                for i, c in enumerate(row):
                    if i == lat_column:
                        try:
                            lat = float(c)
                        except ValueError:
                            lat = None

                        # Only widen the bbox when the value parsed;
                        # comparing None with a float raises TypeError.
                        if lat is not None:
                            if min_lat is None or lat < min_lat:
                                min_lat = lat

                            if max_lat is None or lat > max_lat:
                                max_lat = lat
                    elif i == lon_column:
                        try:
                            lon = float(c)
                        except ValueError:
                            lon = None

                        if lon is not None:
                            if min_lon is None or lon < min_lon:
                                min_lon = lon

                            if max_lon is None or lon > max_lon:
                                max_lon = lon
                    elif id_column is not None and i == id_column:
                        geoid = c
                    else:
                        properties[column_names[i]] = c

                if id_column is not None:
                    feature['id'] = geoid

                feature['geometry'] = OrderedDict([
                    ('type', 'Point'),
                    ('coordinates', [lon, lat])
                ])

                feature['properties'] = properties

                features.append(feature)

            output = OrderedDict([
                ('type', 'FeatureCollection'),
                ('bbox', [min_lon, min_lat, max_lon, max_lat]),
                ('features', features)
            ])

            if self.args.crs:
                output['crs'] = OrderedDict([
                    ('type', 'name'),
                    ('properties', {
                        'name': self.args.crs
                    })
                ])
            dump_json(output)
        # Keyed JSON
        elif self.args.key:
            output = OrderedDict()

            for row in rows:
                data = OrderedDict()

                for i, column in enumerate(column_names):
                    try:
                        data[column] = row[i]
                    except IndexError:
                        data[column] = None

                k = data[self.args.key]

                if k in output:
                    raise NonUniqueKeyColumnException(
                        'Value %s is not unique in the key column.' %
                        six.text_type(k))

                output[k] = data
            dump_json(output)
        # Boring JSON
        else:
            output = []
            for row in rows:
                data = OrderedDict()

                for i, column in enumerate(column_names):
                    try:
                        data[column] = row[i]
                    except IndexError:
                        data[column] = None
                if self.args.streamOutput:
                    dump_json(data, newline=True)
                else:
                    output.append(data)
            if not self.args.streamOutput:
                dump_json(output)
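Stripped of argument handling, the GeoJSON branch does two things per row: build a Point feature and widen a running bounding box. A compact sketch of that core, with hypothetical helper names:

    from collections import OrderedDict

    def point_feature(lon, lat, properties, geoid=None):
        # A single GeoJSON Feature with Point geometry, mirroring the
        # structure assembled in the loop above.
        feature = OrderedDict([('type', 'Feature')])
        if geoid is not None:
            feature['id'] = geoid
        feature['geometry'] = OrderedDict([
            ('type', 'Point'),
            ('coordinates', [lon, lat])
        ])
        feature['properties'] = properties
        return feature

    def widen_bbox(bbox, lon, lat):
        # bbox is [min_lon, min_lat, max_lon, max_lat]; None means "unset".
        if lon is not None:
            bbox[0] = lon if bbox[0] is None else min(bbox[0], lon)
            bbox[2] = lon if bbox[2] is None else max(bbox[2], lon)
        if lat is not None:
            bbox[1] = lat if bbox[1] is None else min(bbox[1], lat)
            bbox[3] = lat if bbox[3] is None else max(bbox[3], lat)
        return bbox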
Example #54
0
    def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table).
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # snifflimit == 0 means do not sniff
        if snifflimit is None:
            kwargs['dialect'] = sniffer.sniff_dialect(contents)
        elif snifflimit > 0:
            kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit])

        f = six.StringIO(contents)
        rows = agate.reader(f, **kwargs)

        try:
            if no_header_row:
                # Peek at a row to infer column names from, and put it back on top
                row = next(rows)
                rows = itertools.chain([row], rows)
                headers = make_default_headers(len(row))
            else:
                headers = next(rows)
        except StopIteration:
            # The input is empty (e.g. reading from `/dev/null`).
            headers = []

        if no_header_row or column_ids:
            column_ids = parse_column_identifiers(column_ids, headers, zero_based)
            headers = [headers[c] for c in column_ids]
        else:
            column_ids = range(len(headers))

        data_columns = [[] for c in headers]
        width = len(data_columns)

        for row in rows:
            for j in range(width):
                try:
                    data_columns[j].append(row[column_ids[j]].strip())
                except IndexError:
                    # Short rows are padded with None; extra cells in
                    # long rows are ignored.
                    data_columns[j].append(None)

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types))

        return Table(columns, name=name)
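Because stdin is not seekable, from_csv buffers the entire stream before sniffing the dialect. The same buffer-then-sniff idea in a standalone sketch, with the standard library's csv.Sniffer standing in for csvkit's sniffer (Python 3 assumed):

    import csv
    import io
    import sys

    def buffered_reader(f, snifflimit=4096):
        # Read everything up front so a non-seekable stream (e.g. stdin)
        # can be sniffed and then re-read from the top.
        contents = f.read()
        try:
            dialect = csv.Sniffer().sniff(contents[:snifflimit])
        except csv.Error:
            dialect = csv.excel  # fall back to the default dialect
        return csv.reader(io.StringIO(contents), dialect=dialect)

    # rows = buffered_reader(sys.stdin)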
Example #55
0
    def main(self):
        reader = agate.reader(self.input_file, **self.reader_kwargs)
        writer = agate.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
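Example #55 is the pure pass-through pattern: parse rows with the reader's dialect and immediately re-emit them with the writer's. The same idea with the standard library, e.g. rewriting a tab-delimited file as a conventional CSV (file names are hypothetical):

    import csv

    # Round-trip every row: parse as TSV, re-emit as CSV.
    with open('data.tsv', newline='') as src, \
            open('data.csv', 'w', newline='') as dst:
        reader = csv.reader(src, delimiter='\t')
        writer = csv.writer(dst)
        writer.writerows(reader)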
Example #56
0
    def main(self):
        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            # Streaming CSV mustn't set sniff_limit, but non-streaming should.
            if not self.args.no_inference:
                kwargs['sniff_limit'] = self.args.sniff_limit
            if self.args.no_header_row:
                kwargs['header'] = False
        elif self.args.no_inference:
            # Streaming CSV mustn't set column_types, but other formats should.
            kwargs['column_types'] = agate.TypeTester(limit=0)

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference:
            reader = agate.reader(self.input_file, **self.reader_kwargs)
            writer = agate.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet', None))
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet', None))
            elif filetype == 'dbf':
                if not hasattr(self.input_file, 'name'):
                    raise ValueError('DBF files cannot be converted from stdin. You must pass a filename.')
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)
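The dispatch above leans on convert.guess_format to map a file name to one of the supported formats. A plausible extension-based sketch, assuming the same format names used in the example (not csvkit's actual implementation):

    import os

    SUPPORTED_FORMATS = ('csv', 'dbf', 'fixed', 'geojson', 'json',
                         'ndjson', 'xls', 'xlsx')

    def guess_format(path):
        # Return a supported format name based on the file extension,
        # or None when the extension is missing or unrecognized.
        ext = os.path.splitext(path)[1].lstrip('.').lower()
        return ext if ext in SUPPORTED_FORMATS else None

    # guess_format('accounts.xlsx')  -> 'xlsx'
    # guess_format('README')         -> None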
Example #57
0
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        operations = [op for op in OPERATIONS if getattr(self.args, op + '_only')]

        if len(operations) > 1:
            self.argparser.error('Only one statistic argument may be specified (mean, median, etc.).')

        if operations and self.args.count_only:
            self.argparser.error('You may not specify --count and a statistical argument at the same time.')

        if self.args.count_only:
            count = len(list(agate.reader(self.input_file)))

            if not self.args.no_header_row:
                count -= 1

            self.output_file.write('Row count: %i\n' % count)

            return

        tab = table.Table.from_csv(
            self.input_file,
            sniff_limit=self.args.sniff_limit,
            column_ids=self.args.columns,
            zero_based=self.args.zero_based,
            no_header_row=self.args.no_header_row,
            **self.reader_kwargs
        )

        for c in tab:
            values = sorted(filter(lambda i: i is not None, c))

            stats = {}

            # Output a single stat
            if len(operations) == 1:
                op = operations[0]
                stat = getattr(self, 'get_%s' % op)(c, values, {})

                # Formatting
                if op == 'unique':
                    stat = len(stat)
                elif op == 'freq':
                    stat = ', '.join([('"%s": %s' % (six.text_type(k), count)) for k, count in stat])
                    stat = '{ %s }' % stat

                if len(tab) == 1:
                    self.output_file.write(six.text_type(stat))
                else:
                    self.output_file.write('%3i. %s: %s\n' % (c.order + 1, c.name, stat))
            # Output all stats
            else:
                for op in OPERATIONS:
                    stats[op] = getattr(self, 'get_%s' % op)(c, values, stats)

                self.output_file.write(('%3i. %s\n' % (c.order + 1, c.name)))

                if c.type is None:
                    self.output_file.write('\tEmpty column\n')
                    continue

                self.output_file.write('\t%s\n' % c.type)
                self.output_file.write('\tNulls: %s\n' % stats['nulls'])

                if len(stats['unique']) <= MAX_UNIQUE and c.type is not bool:
                    uniques = [six.text_type(u) for u in list(stats['unique'])]
                    data = u'\tValues: %s\n' % ', '.join(uniques)
                    self.output_file.write(data)
                else:
                    if c.type not in [six.text_type, bool]:
                        self.output_file.write('\tMin: %s\n' % stats['min'])
                        self.output_file.write('\tMax: %s\n' % stats['max'])

                        if c.type in [int, float]:
                            self.output_file.write('\tSum: %s\n' % stats['sum'])
                            self.output_file.write('\tMean: %s\n' % stats['mean'])
                            self.output_file.write('\tMedian: %s\n' % stats['median'])
                            self.output_file.write('\tStandard Deviation: %s\n' % stats['stdev'])

                    self.output_file.write('\tUnique values: %i\n' % len(stats['unique']))

                    if len(stats['unique']) != len(values):
                        self.output_file.write('\t%i most frequent values:\n' % MAX_FREQ)
                        for value, count in stats['freq']:
                            self.output_file.write(('\t\t%s:\t%s\n' % (six.text_type(value), count)))

                    if c.type == six.text_type:
                        self.output_file.write('\tMax length: %i\n' % stats['len'])

        if not operations:
            self.output_file.write('\n')
            self.output_file.write('Row count: %s\n' % tab.count_rows())
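The per-column statistics printed above reduce to a handful of passes over the sorted non-null values. A self-contained sketch of the numeric core (min, max, sum, mean, median, population standard deviation, most-frequent values), independent of csvkit's Table:

    import math
    from collections import Counter

    def column_stats(values, max_freq=5):
        # values: the non-null numbers for one column (assumed non-empty),
        # as produced by the filter step in the loop above.
        values = sorted(values)
        n = len(values)
        total = sum(values)
        mean = total / n
        mid = n // 2
        median = values[mid] if n % 2 else (values[mid - 1] + values[mid]) / 2
        stdev = math.sqrt(sum((v - mean) ** 2 for v in values) / n)
        return {
            'min': values[0], 'max': values[-1], 'sum': total,
            'mean': mean, 'median': median, 'stdev': stdev,
            'freq': Counter(values).most_common(max_freq),
        }

    # column_stats([1, 2, 2, 3])['median']  -> 2.0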