Example #1
def convert(f, format, schema=None, key=None, **kwargs):
    """
    Convert a file of a specified format to CSV.
    """
    if not f:
        raise ValueError('f must not be None')

    if not format:
        raise ValueError('format must not be None')

    if format == 'fixed':
        if not schema:
            raise ValueError('schema must not be null when format is "fixed"')

        return fixed2csv(f, schema, **kwargs)
    elif format == 'xls':
        return xls2csv(f, **kwargs)
    elif format == 'xlsx':
        return xlsx2csv(f, **kwargs)
    elif format == 'json':
        return json2csv(f, key, **kwargs)
    elif format == 'ndjson':
        return ndjson2csv(f, **kwargs)
    elif format == 'geojson':
        return geojson2csv(f, **kwargs)
    elif format == 'csv':
        return csv2csv(f, **kwargs)
    elif format == 'dbf':
        if six.PY3:
            raise ValueError('format "dbf" is not supported for this version of Python.')
        return dbf2csv(f, **kwargs)
    else:
        raise ValueError('format "%s" is not supported' % format)
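
A minimal usage sketch for the convert() dispatcher above; the csvkit.convert import path and the file names are assumptions for illustration, not part of the example itself.

# Hypothetical usage of the convert() dispatcher shown above.
from csvkit.convert import convert

# An Excel workbook is dispatched to xlsx2csv and returned as CSV text.
with open('data.xlsx', 'rb') as f:
    print(convert(f, 'xlsx'))

# Fixed-width input additionally requires a schema file describing the columns.
with open('data.fixed') as f, open('schema.csv') as schema:
    print(convert(f, 'fixed', schema=schema))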
Example #2
def convert(f, format, schema=None, key=None, **kwargs):
    """
    Convert a file of a specified format to CSV.
    """
    if not f:
        raise ValueError('f must not be None')

    if not format:
        raise ValueError('format must not be None')

    if format == 'fixed':
        if not schema:
            raise ValueError('schema must not be null when format is "fixed"')

        return fixed2csv(f, schema, **kwargs)
    elif format == 'xls':
        return xls2csv(f, **kwargs)
    elif format == 'xlsx':
        return xlsx2csv(f, **kwargs)
    elif format == 'json':
        return json2csv(f, key, **kwargs)
    elif format == 'ndjson':
        return ndjson2csv(f, **kwargs)
    elif format == 'geojson':
        return geojson2csv(f, **kwargs)
    elif format == 'csv':
        return csv2csv(f, **kwargs)
    elif format == 'dbf':
        if six.PY3:
            raise ValueError(
                'format "dbf" is not supported forthis version of Python.')
        return dbf2csv(f, **kwargs)
    else:
        raise ValueError('format "%s" is not supported' % format)
Example #3
    def test_geojson(self):
        with open("examples/test_geojson.json", "rt") as f:
            output = geojs.geojson2csv(f)

        self.assertIn("id,prop0,prop1,geojson", output)
        self.assertIn('""coordinates"": [102.0, 0.5]', output)
        self.assertIn('""coordinates"": [[102.0, 0.0], [103.0, 1.0], [104.0, 0.0], [105.0, 1.0]]', output)
Example #4
def convert(f, format, schema=None, key=None, output=None, **kwargs):
    """
    Convert a file of a specified format to CSV.
    """
    if format == 'fixed':
        if not schema:
            raise ValueError('schema must not be null when format is "fixed"')
        output.write(fixed2csv(f, schema, output=output, **kwargs))
    elif format == 'geojson':
        output.write(geojson2csv(f, **kwargs))
    elif format in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        if format == 'csv':
            table = agate.Table.from_csv(f, **kwargs)
        elif format == 'json':
            table = agate.Table.from_json(f, key=key, **kwargs)
        elif format == 'ndjson':
            table = agate.Table.from_json(f, key=key, newline=True, **kwargs)
        elif format == 'xls':
            table = agate.Table.from_xls(f, sheet=kwargs.get('sheet', None))
        elif format == 'xlsx':
            table = agate.Table.from_xlsx(f, sheet=kwargs.get('sheet', None))
        elif format == 'dbf':
            with dbf.Table(f.name) as db:
                column_names = db.field_names
                table = agate.Table(db, column_names)
        table.to_csv(output)
    else:
        raise ValueError('format "%s" is not supported' % format)
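
This variant writes directly to the supplied output stream instead of returning a string. A small sketch of driving it with an in-memory buffer follows; the JSON file name is an assumption.

import io

# Collect the CSV produced from a JSON input into a string buffer
# (assumes the convert() function defined in the example above).
with open('data.json') as f:
    buf = io.StringIO()
    convert(f, 'json', output=buf)
    print(buf.getvalue())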
Example #5
    def test_geojson(self):
        with open('examples/test_geojson.json', 'rt') as f:
            output = geojs.geojson2csv(f)

        self.assertIn('id,prop0,prop1,geojson', output)
        self.assertIn('""coordinates"": [102.0, 0.5]', output)
        self.assertIn(
            '""coordinates"": [[102.0, 0.0], [103.0, 1.0], [104.0, 0.0], [105.0, 1.0]]',
            output)
Example #6
    def main(self):
        path = self.args.input_path

        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if not path or path == '-':
                self.argparser.error(
                    'You must specify a format when providing input as piped data via STDIN.'
                )
            filetype = convert.guess_format(path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying '
                    'a format with --format.')

        if self.args.names_only:
            if filetype in ('xls', 'xlsx'):
                sheets = self.sheet_names(path, filetype)
                for sheet in sheets:
                    self.output_file.write('%s\n' % sheet)
            else:
                self.argparser.error(
                    'You cannot use the -n or --names options with non-Excel files.'
                )
            return

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = self.open_excel_input_file(path)
        else:
            self.input_file = self._open_input_file(path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            kwargs['sniff_limit'] = self.args.sniff_limit

        if filetype in ('xls', 'xlsx'):
            kwargs['header'] = not self.args.no_header_row

        if filetype not in ('dbf', 'geojson', 'json',
                            'ndjson'):  # csv, fixed, xls, xlsx
            kwargs['skip_lines'] = self.args.skip_lines

        if filetype != 'dbf':
            kwargs['column_types'] = self.get_column_types()

        # Convert the file.
        if (filetype == 'csv' and self.args.no_inference
                and not self.args.no_header_row and not self.args.skip_lines
                and self.args.sniff_limit == 0):
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(
                fixed2csv(self.input_file,
                          schema,
                          output=self.output_file,
                          **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              newline=True,
                                              **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(
                    self.input_file,
                    sheet=self.args.sheet,
                    encoding_override=self.args.encoding_xls,
                    **kwargs)
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file,
                                              sheet=self.args.sheet,
                                              **kwargs)
            elif filetype == 'dbf':
                if not hasattr(self.input_file, 'name'):
                    raise ValueError(
                        'DBF files can not be converted from stdin. You must pass a filename.'
                    )
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file, **self.writer_kwargs)

        if self.args.write_sheets:
            # Close and re-open the file, as the file object has been mutated or closed.
            self.input_file.close()

            self.input_file = self.open_excel_input_file(path)

            if self.args.write_sheets == '-':
                sheets = self.sheet_names(path, filetype)
            else:
                sheets = [
                    int(sheet) if sheet.isdigit() else sheet
                    for sheet in self.args.write_sheets.split(',')
                ]

            if filetype == 'xls':
                tables = agate.Table.from_xls(
                    self.input_file,
                    sheet=sheets,
                    encoding_override=self.args.encoding_xls,
                    **kwargs)
            elif filetype == 'xlsx':
                tables = agate.Table.from_xlsx(self.input_file,
                                               sheet=sheets,
                                               **kwargs)

            base = splitext(self.input_file.name)[0]
            for i, table in enumerate(tables.values()):
                with open('%s_%d.csv' % (base, i), 'w') as f:
                    table.to_csv(f, **self.writer_kwargs)

        self.input_file.close()

        if self.args.schema:
            schema.close()
Example #7
    def main(self):
        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' %
                                     self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        self.buffers_input = filetype == 'csv' or not self.args.no_inference

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            # Streaming CSV mustn't set sniff_limit, but non-streaming should.
            if not self.args.no_inference:
                kwargs['sniff_limit'] = self.args.sniff_limit
            if self.args.no_header_row:
                kwargs['header'] = False
        elif self.args.no_inference:
            # Streaming CSV mustn't set column_types, but other formats should.
            kwargs['column_types'] = agate.TypeTester(limit=0)

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference:
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(
                fixed2csv(self.input_file,
                          schema,
                          output=self.output_file,
                          **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              newline=True,
                                              **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file,
                                             sheet=kwargs.get('sheet'))
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file,
                                              sheet=kwargs.get('sheet'))
            elif filetype == 'dbf':
                if not hasattr(self.input_file, 'name'):
                    raise ValueError(
                        'DBF files can not be converted from stdin. You must pass a filename.'
                    )
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)

        self.input_file.close()

        if self.args.schema:
            schema.close()
Example #8
    def main(self):
        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            # Streaming CSV mustn't set sniff_limit, but non-streaming should.
            if not self.args.no_inference:
                kwargs['sniff_limit'] = self.args.sniff_limit
            if self.args.no_header_row:
                kwargs['header'] = False
        elif self.args.no_inference:
            # Streaming CSV mustn't set column_types, but other formats should.
            kwargs['column_types'] = agate.TypeTester(limit=0)

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference:
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet', None))
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet', None))
            elif filetype == 'dbf':
                if not hasattr(self.input_file, 'name'):
                    raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)
Example #9
    def test_dbf(self):
        with open('examples/test_geojson.json', 'rb') as f:
            output = geojs.geojson2csv(f)

        with open('examples/test_geojson.csv', 'r') as f:
            self.assertEqual(f.read(), output)
Example #10
    def main(self):
        path = self.args.input_path

        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if not path or path == '-':
                self.argparser.error('You must specify a format when providing input as piped data via STDIN.')
            filetype = convert.guess_format(path)
            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        if self.args.names_only:
            sheets = self.sheet_names(path, filetype)
            if sheets:
                for sheet in sheets:
                    self.output_file.write('%s\n' % sheet)
            else:
                self.argparser.error('You cannot use the -n or --names options with non-Excel files.')
            return

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = self.open_excel_input_file(path)
        else:
            self.input_file = self._open_input_file(path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            kwargs['sniff_limit'] = self.args.sniff_limit

        if filetype in ('xls', 'xlsx'):
            kwargs['header'] = not self.args.no_header_row

        if filetype not in ('dbf', 'geojson', 'json', 'ndjson'):  # csv, fixed, xls, xlsx
            kwargs['skip_lines'] = self.args.skip_lines

        if filetype != 'dbf':
            kwargs['column_types'] = self.get_column_types()

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference and not self.args.no_header_row and not self.args.skip_lines and self.args.sniff_limit == 0:
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file, sheet=self.args.sheet, encoding_override=self.args.encoding_xls, **kwargs)
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file, sheet=self.args.sheet, **kwargs)
            elif filetype == 'dbf':
                if not hasattr(self.input_file, 'name'):
                    raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file, **self.writer_kwargs)

        if self.args.write_sheets:
            # Close and re-open the file, as the file object has been mutated or closed.
            self.input_file.close()

            self.input_file = self.open_excel_input_file(path)

            if self.args.write_sheets == '-':
                sheets = self.sheet_names(path, filetype)
            else:
                sheets = [int(sheet) if sheet.isdigit() else sheet for sheet in self.args.write_sheets.split(',')]

            if filetype == 'xls':
                tables = agate.Table.from_xls(self.input_file, sheet=sheets, encoding_override=self.args.encoding_xls, **kwargs)
            elif filetype == 'xlsx':
                tables = agate.Table.from_xlsx(self.input_file, sheet=sheets, **kwargs)

            base = splitext(self.input_file.name)[0]
            for i, table in enumerate(tables.values()):
                with open('%s_%d.csv' % (base, i), 'w') as f:
                    table.to_csv(f, **self.writer_kwargs)

        self.input_file.close()

        if self.args.schema:
            schema.close()
Example #11
    def test_dbf(self):
        with open("examples/test_geojson.json", "rb") as f:
            output = geojs.geojson2csv(f)

        with open("examples/test_geojson.csv", "r") as f:
            self.assertEqual(f.read(), output)
Example #12
    def main(self):
        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' %
                                     self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        # Buffer standard input if the input file is in CSV format or if performing type inference.
        self.buffers_input = filetype == 'csv' or not self.args.no_inference

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        if self.args.names_only:
            sheet_names = None
            if filetype == 'xls':
                sheet_names = xlrd.open_workbook(
                    file_contents=self.input_file.read()).sheet_names()
            elif filetype == 'xlsx':
                sheet_names = openpyxl.load_workbook(self.input_file,
                                                     read_only=True,
                                                     data_only=True).sheetnames
            if sheet_names:
                for name in sheet_names:
                    self.output_file.write('%s\n' % name)
            else:
                self.argparser.error(
                    'You cannot use the -n or --names options with non-Excel files.'
                )
            self.input_file.close()
            return

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            kwargs['sniff_limit'] = self.args.sniff_limit

        if filetype not in ('dbf', 'geojson', 'json', 'ndjson'):
            kwargs['skip_lines'] = self.args.skip_lines

        if filetype != 'dbf':
            kwargs['column_types'] = self.get_column_types()

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference and not self.args.skip_lines:
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(
                fixed2csv(self.input_file,
                          schema,
                          output=self.output_file,
                          **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              newline=True,
                                              **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file, **kwargs)
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file, **kwargs)
            elif filetype == 'dbf':
                if not hasattr(self.input_file, 'name'):
                    raise ValueError(
                        'DBF files can not be converted from stdin. You must pass a filename.'
                    )
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)

        self.input_file.close()

        if self.args.schema:
            schema.close()