Example #1
    def main(self):
        reader = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.dryrun:
            checker = RowChecker(reader)

            for row in checker.checked_rows():
                pass

            if checker.errors:
                for e in checker.errors:
                    self.output_file.write('Line %i: %s\n' %
                                           (e.line_number, e.msg))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write(
                    '%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n'
                    % (checker.rows_joined, checker.joins))
        else:
            base, ext = splitext(self.input_file.name)

            with open('%s_out.csv' % base, 'w') as f:
                clean_writer = agate.writer(f, **self.writer_kwargs)

                checker = RowChecker(reader)
                clean_writer.writerow(checker.column_names)

                for row in checker.checked_rows():
                    clean_writer.writerow(row)

            if checker.errors:
                error_filename = '%s_err.csv' % base

                with open(error_filename, 'w') as f:
                    error_writer = agate.writer(f, **self.writer_kwargs)

                    error_header = ['line_number', 'msg']
                    error_header.extend(checker.column_names)
                    error_writer.writerow(error_header)

                    error_count = len(checker.errors)

                    for e in checker.errors:
                        error_writer.writerow(self._format_error_row(e))

                self.output_file.write(
                    '%i error%s logged to %s\n' %
                    (error_count, '' if error_count == 1 else 's',
                     error_filename))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write(
                    '%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n'
                    % (checker.rows_joined, checker.joins))
Example #2
    def main(self):
        reader = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.dryrun:
            checker = RowChecker(reader)

            for row in checker.checked_rows():
                pass

            if checker.errors:
                for e in checker.errors:
                    self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
        else:
            if self.input_file == sys.stdin:
                base = 'stdin'  # "<stdin>_out.csv" is invalid on Windows
            else:
                base = splitext(self.input_file.name)[0]

            with open('%s_out.csv' % base, 'w') as f:
                clean_writer = agate.writer(f, **self.writer_kwargs)

                checker = RowChecker(reader)
                clean_writer.writerow(checker.column_names)

                for row in checker.checked_rows():
                    clean_writer.writerow(row)

            if checker.errors:
                error_filename = '%s_err.csv' % base

                with open(error_filename, 'w') as f:
                    error_writer = agate.writer(f, **self.writer_kwargs)

                    error_header = ['line_number', 'msg']
                    error_header.extend(checker.column_names)
                    error_writer.writerow(error_header)

                    error_count = len(checker.errors)

                    for e in checker.errors:
                        error_writer.writerow(self._format_error_row(e))

                self.output_file.write('%i error%s logged to %s\n' % (error_count, '' if error_count == 1 else 's', error_filename))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
Example #3
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if self.input_file.name != '<stdin>':
            # Use filename as table name
            table_name = os.path.splitext(
                os.path.split(self.input_file.name)[1])[0]
        else:
            table_name = 'csvsql_table'

        tab = table.Table.from_csv(self.input_file,
                                   name=table_name,
                                   snifflimit=self.args.snifflimit,
                                   no_header_row=self.args.no_header_row,
                                   infer_types=(not self.args.no_inference),
                                   **self.reader_kwargs)

        column_ids = parse_column_identifiers(self.args.columns, tab.headers(),
                                              self.args.zero_based)

        rows = tab.to_rows(serialize_dates=True)
        sorter = lambda r: [(r[c] is not None, r[c]) for c in column_ids]
        rows.sort(key=sorter, reverse=self.args.reverse)

        rows.insert(0, tab.headers())

        output = agate.writer(self.output_file, **self.writer_kwargs)

        for row in rows:
            output.writerow(row)
Example #4
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        rows = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
        output = agate.writer(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for row in rows:
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            if self.args.delete_empty:
                if ''.join(out_row) == '':
                    continue

            output.writerow(out_row)
Example #5
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.columns:
            self.argparser.error('You must specify at least one column to search using the -c option.')

        if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
            self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

        reader_kwargs = self.reader_kwargs
        writer_kwargs = self.writer_kwargs
        if writer_kwargs.pop('line_numbers', False):
            reader_kwargs = {'line_numbers': True}

        rows, column_names, column_ids = self.get_rows_and_column_names_and_column_ids(**reader_kwargs)

        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern

        patterns = dict((column_id, pattern) for column_id in column_ids)
        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        output = agate.writer(self.output_file, **writer_kwargs)
        output.writerow(column_names)

        for row in filter_reader:
            output.writerow(row)
Example #6
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        rows = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)
        output = agate.writer(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for row in rows:
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            if self.args.delete_empty:
                if ''.join(out_row) == '':
                    continue

            output.writerow(out_row)
Example #7
    def main(self):
        try:
            engine, metadata = sql.get_connection(self.args.connection_string)
        except ImportError:
            raise ImportError('You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

        conn = engine.connect()

        if self.args.query:
            query = self.args.query.strip()
        else:
            query = ""

            for line in self.args.file:
                query += line

        rows = conn.execute(query)
        output = agate.writer(self.output_file, **self.writer_kwargs)

        if not self.args.no_header_row:
            output.writerow(rows._metadata.keys)

        for row in rows:
            output.writerow(row)

        conn.close()
Example #8
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.columns:
            self.argparser.error('You must specify at least one column to search using the -c option.')

        if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
            self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

        rows = agate.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern

        patterns = dict((c, pattern) for c in column_ids)

        output = agate.writer(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for row in filter_reader:
            output.writerow(row)
Example #9
    def main(self):
        try:
            engine, metadata = sql.get_connection(self.args.connection_string)
        except ImportError:
            raise ImportError(
                'You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n'
            )

        conn = engine.connect()

        if self.args.query:
            query = self.args.query.strip()
        else:
            query = ""

            for line in self.args.file:
                query += line

        # Must escape '%'.
        # @see https://github.com/onyxfish/csvkit/issues/440
        # @see https://bitbucket.org/zzzeek/sqlalchemy/commits/5bc1f17cb53248e7cea609693a3b2a9bb702545b
        rows = conn.execute(query.replace('%', '%%'))
        output = agate.writer(self.output_file, **self.writer_kwargs)

        if rows.returns_rows:
            if not self.args.no_header_row:
                output.writerow(rows._metadata.keys)

            for row in rows:
                output.writerow(row)

        conn.close()
Example #10
    def main(self):
        try:
            engine, metadata = sql.get_connection(self.args.connection_string)
        except ImportError:
            raise ImportError('You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

        conn = engine.connect()

        if self.args.query:
            query = self.args.query.strip()
        else:
            query = ""

            for line in self.args.file:
                query += line

        # Must escape '%'.
        # @see https://github.com/onyxfish/csvkit/issues/440
        # @see https://bitbucket.org/zzzeek/sqlalchemy/commits/5bc1f17cb53248e7cea609693a3b2a9bb702545b
        rows = conn.execute(query.replace('%', '%%'))
        output = agate.writer(self.output_file, **self.writer_kwargs)

        if rows.returns_rows:
            if not self.args.no_header_row:
                output.writerow(rows._metadata.keys)

            for row in rows:
                output.writerow(row)

        conn.close()
Example #11
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if self.input_file.name != '<stdin>':
            # Use filename as table name
            table_name = os.path.splitext(os.path.split(self.input_file.name)[1])[0]
        else:
            table_name = 'csvsql_table'

        tab = table.Table.from_csv(
            self.input_file,
            name=table_name,
            snifflimit=self.args.snifflimit,
            no_header_row=self.args.no_header_row,
            infer_types=(not self.args.no_inference),
            **self.reader_kwargs
        )

        column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based)

        rows = tab.to_rows(serialize_dates=True)
        sorter = lambda r: [(r[c] is not None, r[c]) for c in column_ids]
        rows.sort(key=sorter, reverse=self.args.reverse)

        rows.insert(0, tab.headers())

        output = agate.writer(self.output_file, **self.writer_kwargs)

        for row in rows:
            output.writerow(row)
Example #12
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if not self.input_files:
            self.argparser.error('You must specify at least one file to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.input_files]
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.input_files):
                self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
        else:
            groups = None

        group_name = self.args.group_name if self.args.group_name else 'group'

        output = agate.writer(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.input_files):
            rows = agate.reader(f, **self.reader_kwargs)

            # If we have header rows, use them
            if not self.args.no_header_row:
                headers = next(rows, [])

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)
            # If we don't, generate simple column names based on the first row
            else:
                row = next(rows, [])

                headers = make_default_headers(len(row))

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)

                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            f.close()
Example #13
def geojson2csv(f, key=None, **kwargs):
    """
    Convert a GeoJSON document into CSV format.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if not isinstance(js, dict):
        raise TypeError('JSON document is not valid GeoJSON: Root element is not an object.')

    if 'type' not in js:
        raise TypeError('JSON document is not valid GeoJSON: No top-level "type" key.')

    if js['type'] != 'FeatureCollection':
        raise TypeError('Only GeoJSON with root FeatureCollection type is supported. Not %s' % js['type'])

    if 'features' not in js:
        raise TypeError('JSON document is not a valid FeatureCollection: No top-level "features" key.')

    features = js['features']

    features_parsed = []    # tuples in the format (id, properties, geometry)
    property_fields = []

    for feature in features:
        geoid = feature.get('id', None)

        properties = feature.get('properties') or {}

        for prop in properties.keys():
            if prop not in property_fields:
                property_fields.append(prop)

        geometry = json.dumps(feature['geometry'])

        features_parsed.append((geoid, properties, geometry))

    header = ['id']
    header.extend(property_fields)
    header.append('geojson')

    o = six.StringIO()
    writer = agate.writer(o)

    writer.writerow(header)

    for geoid, properties, geometry in features_parsed:
        row = [geoid]

        for field in property_fields:
            row.append(properties.get(field, None))

        row.append(geometry)

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
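A minimal usage sketch of the converter above, assuming geojson2csv and the module that defines it are importable; the FeatureCollection literal is made up for illustration. The result is a CSV string with an id column, one column per property field, and a trailing geojson column.

import io
import json

doc = {
    "type": "FeatureCollection",
    "features": [
        {"id": 1,
         "properties": {"name": "A"},
         "geometry": {"type": "Point", "coordinates": [0, 0]}},
    ],
}

# geojson2csv expects a file-like object and returns the CSV as a string.
print(geojson2csv(io.StringIO(json.dumps(doc))))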
Example #14
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other converters, this one allows output columns to contain mixed data types.
    Blank headers are also possible.
    """
    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    writer = agate.writer(output)

    book = load_workbook(f, use_iterators=True, data_only=True)

    if 'sheet' in kwargs:
        sheet = book.get_sheet_by_name(kwargs['sheet'])
    else:
        sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                    value = value.time()

                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
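A short usage sketch, assuming the converter above is importable and that a workbook named report.xlsx with a sheet named Sheet1 exists (both names are hypothetical). When no output object is passed, the CSV comes back as a string.

# Both the file name and the sheet name are made up for illustration.
with open('report.xlsx', 'rb') as f:
    csv_data = xlsx2csv(f, sheet='Sheet1')

print(csv_data)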
Example #15
    def to_csv(self, output, **kwargs):
        """
        Serializes the table to CSV and writes it to any file-like object.
        """
        rows = self.to_rows(serialize_dates=True)

        # Insert header row
        rows.insert(0, self.headers())

        csv_writer = agate.writer(output, **kwargs)
        csv_writer.writerows(rows)
Example #16
    def to_csv(self, output, **kwargs):
        """
        Serializes the table to CSV and writes it to any file-like object.
        """
        rows = self.to_rows(serialize_dates=True)

        # Insert header row
        rows.insert(0, self.headers())

        csv_writer = agate.writer(output, **kwargs)
        csv_writer.writerows(rows)
Example #17
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        rows, column_names, column_ids = self.get_rows_and_column_names_and_column_ids(**self.reader_kwargs)

        output = agate.writer(self.output_file, **self.writer_kwargs)
        output.writerow([column_names[column_id] for column_id in column_ids])

        for row in rows:
            out_row = [row[column_id] if column_id < len(row) else None for column_id in column_ids]
            if not self.args.delete_empty or ''.join(out_row):
                output.writerow(out_row)
Example #18
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.columns:
            self.argparser.error(
                'You must specify at least one column to search using the -c option.'
            )

        if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
            self.argparser.error(
                'One of -r, -m or -f must be specified, unless using the -n option.'
            )

        rows = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based)

        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern

        patterns = dict((c, pattern) for c in column_ids)

        output = agate.writer(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows,
                                           header=False,
                                           patterns=patterns,
                                           inverse=self.args.inverse)

        for row in filter_reader:
            output.writerow(row)
Example #19
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided that identifies the item of the dictionary which contains a list.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(js, dict):
        if not key:
            raise TypeError(
                'When converting a JSON document with a top-level dictionary element, a key must be specified.'
            )

        js = js[key]

    fields = []
    flat = []

    for obj in js:
        parsed_object = parse_object(obj)
        flat.append(parsed_object)

        for key in parsed_object.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = agate.writer(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
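A minimal sketch of the two accepted top-level shapes, assuming json2csv is importable from the module shown above; the JSON snippets are invented for illustration.

import io

# Top-level list: no key is needed.
print(json2csv(io.StringIO('[{"a": 1, "b": 2}, {"a": 3}]')))

# Top-level dictionary: key must name the item that holds the list.
print(json2csv(io.StringIO('{"rows": [{"a": 1}, {"a": 2}]}'), key='rows'))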
Example #20
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.columns:
            self.argparser.error(
                'You must specify at least one column to search using the -c option.'
            )

        if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
            self.argparser.error(
                'One of -r, -m or -f must be specified, unless using the -n option.'
            )

        reader_kwargs = self.reader_kwargs
        writer_kwargs = self.writer_kwargs
        if writer_kwargs.pop('line_numbers', False):
            reader_kwargs = {'line_numbers': True}

        rows, column_names, column_ids = self.get_rows_and_column_names_and_column_ids(
            **reader_kwargs)

        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern

        patterns = dict((column_id, pattern) for column_id in column_ids)
        filter_reader = FilteringCSVReader(rows,
                                           header=False,
                                           patterns=patterns,
                                           inverse=self.args.inverse)

        output = agate.writer(self.output_file, **writer_kwargs)
        output.writerow(column_names)

        for row in filter_reader:
            output.writerow(row)
Example #21
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided that identifies the item of the dictionary which contains a list.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(js, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        js = js[key]

    fields = []
    flat = []

    for obj in js:
        parsed_object = parse_object(obj)
        flat.append(parsed_object)

        for key in parsed_object.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = agate.writer(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
Example #22
def ndjson2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    Supports both JSON and "Newline-delimited JSON".

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided that identifies the item of the dictionary which contains a list.
    """
    first_line = f.readline()

    first_row = json.loads(first_line, object_pairs_hook=OrderedDict)
    js = itertools.chain(
        (first_row, ),
        (json.loads(l, object_pairs_hook=OrderedDict) for l in f))

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = agate.writer(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
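A minimal sketch, assuming ndjson2csv is importable as above; each input line is a standalone JSON object with made-up values.

import io

ndjson = '{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n'
print(ndjson2csv(io.StringIO(ndjson)))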
Example #23
def ndjson2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    Supports both JSON and "Newline-delimited JSON".

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided that identifies the item of the dictionary which contains a list.
    """
    first_line = f.readline()

    first_row = json.loads(first_line, object_pairs_hook=OrderedDict)
    js = itertools.chain((first_row, ), (json.loads(l, object_pairs_hook=OrderedDict) for l in f))

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = agate.writer(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
Example #24
def fixed2csv(f, schema, output=None, **kwargs):
    """
    Convert a fixed-width file to csv using a CSV-formatted schema description.

    A schema CSV must start with a header row with (at least) columns labeled
    "column","start", and "length". (Other columns will be ignored.) For each
    subsequent row, therefore, those columns will be used to identify a column
    name, the starting index of the column (an integer), and the length of the
    column (also an integer).

    Values in the 'start' column are assumed to be zero-based, unless the first
    value for 'start' is 1, in which case all values are assumed to be
    one-based.

    If output is specified, rows will be written to that object, otherwise the
    complete data will be returned.
    """
    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    try:
        encoding = kwargs['encoding']
    except KeyError:
        encoding = None

    writer = agate.writer(output)

    reader = FixedWidthReader(f, schema, encoding=encoding)
    writer.writerows(reader)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
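A minimal sketch of the schema format described in the docstring, using in-memory files; the column names and widths are invented, and it assumes FixedWidthReader strips padding and emits the schema's column names as a header row.

import io

# Schema: column name, zero-based start index, field length.
schema = io.StringIO(
    'column,start,length\n'
    'name,0,6\n'
    'age,6,2\n'
)
data = io.StringIO(
    'alice 42\n'
    'bob   37\n'
)

# No output object is given, so the full CSV is returned as a string.
print(fixed2csv(data, schema))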
Example #25
    def main(self):
        connection_string = self.args.connection_string
        do_insert = self.args.insert
        query = self.args.query

        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if self.args.table_names:
            table_names = self.args.table_names.split(',')
        else:
            table_names = []

        # If one or more filenames are specified, we need to add stdin ourselves (if available)
        if sys.stdin not in self.input_files:
            try:
                if not sys.stdin.isatty():
                    self.input_files.insert(0, sys.stdin)
            except:
                pass

        # Create an SQLite database in memory if no connection string is specified
        if query and not connection_string:
            connection_string = "sqlite:///:memory:"
            do_insert = True

        if self.args.dialect and connection_string:
            self.argparser.error('The --dialect option is only valid when --db is not specified.')

        if do_insert and not connection_string:
            self.argparser.error('The --insert option is only valid when --db is also specified.')

        if self.args.no_create and not do_insert:
            self.argparser.error('The --no-create option is only valid when --insert is also specified.')

        # Establish database validity before reading CSV files
        if connection_string:
            try:
                engine, metadata = sql.get_connection(connection_string)
            except ImportError:
                raise ImportError('You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')
            conn = engine.connect()
            trans = conn.begin()

        for f in self.input_files:
            try:
                # Try to use name specified via --table
                table_name = table_names.pop(0)
            except IndexError:
                if f == sys.stdin:
                    table_name = "stdin"
                else:
                    # Use filename as table name
                    table_name = os.path.splitext(os.path.split(f.name)[1])[0]

            csv_table = table.Table.from_csv(
                f,
                name=table_name,
                snifflimit=self.args.snifflimit,
                blanks_as_nulls=(not self.args.blanks),
                infer_types=(not self.args.no_inference),
                no_header_row=self.args.no_header_row,
                **self.reader_kwargs
            )

            f.close()

            if connection_string:
                sql_table = sql.make_table(
                    csv_table,
                    table_name,
                    self.args.no_constraints,
                    self.args.db_schema,
                    metadata
                )

                # Create table
                if not self.args.no_create:
                    sql_table.create()

                # Insert data
                if do_insert and csv_table.count_rows() > 0:
                    insert = sql_table.insert()
                    headers = csv_table.headers()
                    conn.execute(insert, [dict(zip(headers, row)) for row in csv_table.to_rows()])

            # Output SQL statements
            else:
                sql_table = sql.make_table(csv_table, table_name, self.args.no_constraints)
                self.output_file.write('%s\n' % sql.make_create_table_statement(sql_table, dialect=self.args.dialect))

        if connection_string:
            if query:
                # Execute specified SQL queries
                queries = query.split(';')
                rows = None

                for q in queries:
                    if q:
                        rows = conn.execute(q)

                # Output result of last query as CSV
                try:
                    output = agate.writer(self.output_file, **self.writer_kwargs)
                    if not self.args.no_header_row:
                        output.writerow(rows._metadata.keys)
                    for row in rows:
                        output.writerow(row)
                except AttributeError:
                    pass

            trans.commit()
            conn.close()
Example #26
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if not self.input_files:
            self.argparser.error(
                'You must specify at least one file to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.input_files]
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.input_files):
                self.argparser.error(
                    'The number of grouping values must be equal to the number of CSV files being stacked.'
                )
        else:
            groups = None

        group_name = self.args.group_name if self.args.group_name else 'group'

        output = agate.writer(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.input_files):
            rows = agate.reader(f, **self.reader_kwargs)

            # If we have header rows, use them
            if not self.args.no_header_row:
                headers = next(rows, [])

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)
            # If we don't, generate simple column names based on the first row
            else:
                row = next(rows, [])

                headers = make_default_headers(len(row))

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)

                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            f.close()
Example #27
    def main(self):
        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' %
                                     self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            # Streaming CSV mustn't set sniff_limit, but non-streaming should.
            if not self.args.no_inference:
                kwargs['sniff_limit'] = self.args.sniff_limit
            if self.args.no_header_row:
                kwargs['header'] = False
        elif self.args.no_inference:
            # Streaming CSV mustn't set column_types, but other formats should.
            kwargs['column_types'] = agate.TypeTester(limit=0)

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference:
            reader = agate.reader(self.input_file, **self.reader_kwargs)
            writer = agate.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(
                fixed2csv(self.input_file,
                          schema,
                          output=self.output_file,
                          **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              newline=True,
                                              **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file,
                                             sheet=kwargs.get('sheet', None))
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file,
                                              sheet=kwargs.get('sheet', None))
            elif filetype == 'dbf':
                if not hasattr(self.input_file, 'name'):
                    raise ValueError(
                        'DBF files cannot be converted from stdin. You must pass a filename.'
                    )
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)
Example #28
    def main(self):
        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            # Streaming CSV mustn't set sniff_limit, but non-streaming should.
            if not self.args.no_inference:
                kwargs['sniff_limit'] = self.args.sniff_limit
            if self.args.no_header_row:
                kwargs['header'] = False
        elif self.args.no_inference:
            # Streaming CSV mustn't set column_types, but other formats should.
            kwargs['column_types'] = agate.TypeTester(limit=0)

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference:
            reader = agate.reader(self.input_file, **self.reader_kwargs)
            writer = agate.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet', None))
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet', None))
            elif filetype == 'dbf':
                if not hasattr(self.input_file, 'name'):
                    raise ValueError('DBF files cannot be converted from stdin. You must pass a filename.')
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)
Example #29
    def main(self):
        reader = agate.reader(self.input_file, **self.reader_kwargs)
        writer = agate.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
Example #30
    def main(self):
        reader = agate.reader(self.input_file, **self.reader_kwargs)

        writer = agate.writer(self.output_file, **self.writer_kwargs)

        writer.writerows(reader)
Example #31
    def main(self):
        connection_string = self.args.connection_string
        do_insert = self.args.insert
        query = self.args.query

        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if self.args.table_names:
            table_names = self.args.table_names.split(',')
        else:
            table_names = []

        # If one or more filenames are specified, we need to add stdin ourselves (if available)
        if sys.stdin not in self.input_files:
            try:
                if not sys.stdin.isatty():
                    self.input_files.insert(0, sys.stdin)
            except:
                pass

        # Create an SQLite database in memory if no connection string is specified
        if query and not connection_string:
            connection_string = "sqlite:///:memory:"
            do_insert = True

        if self.args.dialect and connection_string:
            self.argparser.error(
                'The --dialect option is only valid when --db is not specified.'
            )

        if do_insert and not connection_string:
            self.argparser.error(
                'The --insert option is only valid when --db is also specified.'
            )

        if self.args.no_create and not do_insert:
            self.argparser.error(
                'The --no-create option is only valid when --insert is also specified.'
            )

        # Establish database validity before reading CSV files
        if connection_string:
            try:
                engine, metadata = sql.get_connection(connection_string)
            except ImportError:
                raise ImportError(
                    'You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n'
                )
            conn = engine.connect()
            trans = conn.begin()

        for f in self.input_files:
            try:
                # Try to use name specified via --table
                table_name = table_names.pop(0)
            except IndexError:
                if f == sys.stdin:
                    table_name = "stdin"
                else:
                    # Use filename as table name
                    table_name = os.path.splitext(os.path.split(f.name)[1])[0]

            csv_table = table.Table.from_csv(
                f,
                name=table_name,
                sniff_limit=self.args.sniff_limit,
                blanks_as_nulls=(not self.args.blanks),
                infer_types=(not self.args.no_inference),
                no_header_row=self.args.no_header_row,
                **self.reader_kwargs)

            f.close()

            if csv_table:
                if connection_string:
                    sql_table = sql.make_table(csv_table, table_name,
                                               self.args.no_constraints,
                                               self.args.db_schema, metadata)

                    # Create table
                    if not self.args.no_create:
                        sql_table.create()

                    # Insert data
                    if do_insert and csv_table.count_rows() > 0:
                        insert = sql_table.insert()
                        headers = csv_table.headers()
                        conn.execute(insert, [
                            dict(zip(headers, row))
                            for row in csv_table.to_rows()
                        ])

                # Output SQL statements
                else:
                    sql_table = sql.make_table(csv_table, table_name,
                                               self.args.no_constraints)
                    self.output_file.write(
                        '%s\n' % sql.make_create_table_statement(
                            sql_table, dialect=self.args.dialect))

        if connection_string:
            if query:
                # Execute specified SQL queries
                queries = query.split(';')
                rows = None

                for q in queries:
                    if q:
                        rows = conn.execute(q)

                # Output result of last query as CSV
                try:
                    output = agate.writer(self.output_file,
                                          **self.writer_kwargs)
                    if not self.args.no_header_row:
                        output.writerow(rows._metadata.keys)
                    for row in rows:
                        output.writerow(row)
                except AttributeError:
                    pass

            trans.commit()
            conn.close()
Example #32
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if len(self.input_files) < 2:
            self.argparser.error('You must specify at least two files to join.')

        if self.args.columns:
            join_column_names = self._parse_join_column_names(self.args.columns)

            if len(join_column_names) == 1:
                join_column_names = join_column_names * len(self.input_files)

            if len(join_column_names) != len(self.input_files):
                self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

        if (self.args.left_join or self.args.right_join or self.args.outer_join) and not self.args.columns:
            self.argparser.error('You must provide join column names when performing an outer join.')

        if self.args.left_join and self.args.right_join:
            self.argparser.error('It is not valid to specify both a left and a right join.')

        tables = []

        for f in self.input_files:
            tables.append(list(agate.reader(f, **self.reader_kwargs)))
            f.close()

        join_column_ids = []

        if self.args.columns:
            for i, t in enumerate(tables):
                join_column_ids.append(match_column_identifier(t[0], join_column_names[i]))

        jointab = []

        if self.args.left_join:
            # Left outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        elif self.args.right_join:
            # Right outer join
            jointab = tables[-1]

            remaining_tables = tables[:-1]
            remaining_tables.reverse()

            for i, t in enumerate(remaining_tables):
                jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1])
        elif self.args.outer_join:
            # Full outer join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        else:
            if self.args.columns:
                # Inner join
                jointab = tables[0]

                for i, t in enumerate(tables[1:]):
                    jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
            else:
                jointab = tables[0]

                # Sequential join
                for t in tables[1:]:
                    jointab = join.sequential_join(jointab, t)

        output = agate.writer(self.output_file, **self.writer_kwargs)

        for row in jointab:
            output.writerow(row)