def main(self):
    reader = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        for row in checker.checked_rows():
            pass

        if checker.errors:
            for e in checker.errors:
                self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
    else:
        base, ext = splitext(self.input_file.name)

        with open('%s_out.csv' % base, 'w') as f:
            clean_writer = agate.writer(f, **self.writer_kwargs)

            checker = RowChecker(reader)
            clean_writer.writerow(checker.column_names)

            for row in checker.checked_rows():
                clean_writer.writerow(row)

        if checker.errors:
            error_filename = '%s_err.csv' % base

            with open(error_filename, 'w') as f:
                error_writer = agate.writer(f, **self.writer_kwargs)

                error_header = ['line_number', 'msg']
                error_header.extend(checker.column_names)
                error_writer.writerow(error_header)

                error_count = len(checker.errors)

                for e in checker.errors:
                    error_writer.writerow(self._format_error_row(e))

            self.output_file.write('%i error%s logged to %s\n' % (error_count, '' if error_count == 1 else 's', error_filename))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
def main(self):
    reader = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        for row in checker.checked_rows():
            pass

        if checker.errors:
            for e in checker.errors:
                self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
    else:
        if self.input_file == sys.stdin:
            base = 'stdin'  # "<stdin>_out.csv" is invalid on Windows
        else:
            base = splitext(self.input_file.name)[0]

        with open('%s_out.csv' % base, 'w') as f:
            clean_writer = agate.writer(f, **self.writer_kwargs)

            checker = RowChecker(reader)
            clean_writer.writerow(checker.column_names)

            for row in checker.checked_rows():
                clean_writer.writerow(row)

        if checker.errors:
            error_filename = '%s_err.csv' % base

            with open(error_filename, 'w') as f:
                error_writer = agate.writer(f, **self.writer_kwargs)

                error_header = ['line_number', 'msg']
                error_header.extend(checker.column_names)
                error_writer.writerow(error_header)

                error_count = len(checker.errors)

                for e in checker.errors:
                    error_writer.writerow(self._format_error_row(e))

            self.output_file.write('%i error%s logged to %s\n' % (error_count, '' if error_count == 1 else 's', error_filename))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if self.input_file.name != '<stdin>':
        # Use filename as table name
        table_name = os.path.splitext(os.path.split(self.input_file.name)[1])[0]
    else:
        table_name = 'csvsql_table'

    tab = table.Table.from_csv(
        self.input_file,
        name=table_name,
        snifflimit=self.args.snifflimit,
        no_header_row=self.args.no_header_row,
        infer_types=(not self.args.no_inference),
        **self.reader_kwargs
    )

    column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based)

    rows = tab.to_rows(serialize_dates=True)

    sorter = lambda r: [(r[c] is not None, r[c]) for c in column_ids]
    rows.sort(key=sorter, reverse=self.args.reverse)

    rows.insert(0, tab.headers())

    output = agate.writer(self.output_file, **self.writer_kwargs)

    for row in rows:
        output.writerow(row)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    rows = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)

        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

    output = agate.writer(self.output_file, **self.writer_kwargs)

    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            if ''.join(out_row) == '':
                continue

        output.writerow(out_row)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    reader_kwargs = self.reader_kwargs
    writer_kwargs = self.writer_kwargs

    if writer_kwargs.pop('line_numbers', False):
        reader_kwargs = {'line_numbers': True}

    rows, column_names, column_ids = self.get_rows_and_column_names_and_column_ids(**reader_kwargs)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = dict((column_id, pattern) for column_id in column_ids)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    output = agate.writer(self.output_file, **writer_kwargs)
    output.writerow(column_names)

    for row in filter_reader:
        output.writerow(row)
def main(self):
    try:
        engine, metadata = sql.get_connection(self.args.connection_string)
    except ImportError:
        raise ImportError('You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at:\n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

    conn = engine.connect()

    if self.args.query:
        query = self.args.query.strip()
    else:
        query = ""

        for line in self.args.file:
            query += line

    rows = conn.execute(query)

    output = agate.writer(self.output_file, **self.writer_kwargs)

    if not self.args.no_header_row:
        output.writerow(rows._metadata.keys)

    for row in rows:
        output.writerow(row)

    conn.close()
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    rows = agate.reader(self.input_file, **self.reader_kwargs)
    column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = dict((c, pattern) for c in column_ids)

    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for row in filter_reader:
        output.writerow(row)
def main(self):
    try:
        engine, metadata = sql.get_connection(self.args.connection_string)
    except ImportError:
        raise ImportError('You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at:\n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

    conn = engine.connect()

    if self.args.query:
        query = self.args.query.strip()
    else:
        query = ""

        for line in self.args.file:
            query += line

    # Must escape '%'.
    # @see https://github.com/onyxfish/csvkit/issues/440
    # @see https://bitbucket.org/zzzeek/sqlalchemy/commits/5bc1f17cb53248e7cea609693a3b2a9bb702545b
    rows = conn.execute(query.replace('%', '%%'))

    output = agate.writer(self.output_file, **self.writer_kwargs)

    if rows.returns_rows:
        if not self.args.no_header_row:
            output.writerow(rows._metadata.keys)

        for row in rows:
            output.writerow(row)

    conn.close()
def main(self):
    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if not self.input_files:
        self.argparser.error('You must specify at least one file to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.input_files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.input_files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'

    output = agate.writer(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.input_files):
        rows = agate.reader(f, **self.reader_kwargs)

        # If we have header rows, use them
        if not self.args.no_header_row:
            headers = next(rows, [])

            if i == 0:
                if groups:
                    headers.insert(0, group_name)

                output.writerow(headers)
        # If we don't, generate simple column names based on the first row
        else:
            row = next(rows, [])

            headers = make_default_headers(len(row))

            if i == 0:
                if groups:
                    headers.insert(0, group_name)

                output.writerow(headers)

            if groups:
                row.insert(0, groups[i])

            output.writerow(row)

        for row in rows:
            if groups:
                row.insert(0, groups[i])

            output.writerow(row)

        f.close()
def geojson2csv(f, key=None, **kwargs):
    """
    Convert a GeoJSON document into CSV format.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if not isinstance(js, dict):
        raise TypeError('JSON document is not valid GeoJSON: Root element is not an object.')

    if 'type' not in js:
        raise TypeError('JSON document is not valid GeoJSON: No top-level "type" key.')

    if js['type'] != 'FeatureCollection':
        raise TypeError('Only GeoJSON with root FeatureCollection type is supported. Not %s' % js['type'])

    if 'features' not in js:
        raise TypeError('JSON document is not a valid FeatureCollection: No top-level "features" key.')

    features = js['features']

    features_parsed = []    # tuples in the format (id, properties, geometry)
    property_fields = []

    for feature in features:
        geoid = feature.get('id', None)

        properties = feature.get('properties') or {}

        for prop in properties.keys():
            if prop not in property_fields:
                property_fields.append(prop)

        geometry = json.dumps(feature['geometry'])

        features_parsed.append((geoid, properties, geometry))

    header = ['id']
    header.extend(property_fields)
    header.append('geojson')

    o = six.StringIO()
    writer = agate.writer(o)

    writer.writerow(header)

    for geoid, properties, geometry in features_parsed:
        row = [geoid]

        for field in property_fields:
            row.append(properties.get(field, None))

        row.append(geometry)

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
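# Usage sketch (illustrative only; the feature contents below are hypothetical).
# geojson2csv accepts any file-like object holding a FeatureCollection.
import json
import six

_doc = {
    'type': 'FeatureCollection',
    'features': [
        {'id': 1, 'properties': {'name': 'A'}, 'geometry': {'type': 'Point', 'coordinates': [0, 0]}},
        {'id': 2, 'properties': {'name': 'B'}, 'geometry': {'type': 'Point', 'coordinates': [1, 1]}},
    ],
}

# Expected header: id, name, geojson (the geometry serialized back to a JSON string).
print(geojson2csv(six.StringIO(json.dumps(_doc))))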
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other converters, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    writer = agate.writer(output)

    book = load_workbook(f, use_iterators=True, data_only=True)

    if 'sheet' in kwargs:
        sheet = book.get_sheet_by_name(kwargs['sheet'])
    else:
        sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                    value = value.time()

                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
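# Usage sketch (illustrative only; 'report.xlsx' and 'Sheet1' are hypothetical names).
# xlsx2csv reads the active sheet unless a 'sheet' keyword is given.
with open('report.xlsx', 'rb') as _f:
    print(xlsx2csv(_f, sheet='Sheet1'))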
def to_csv(self, output, **kwargs):
    """
    Serializes the table to CSV and writes it to any file-like object.
    """
    rows = self.to_rows(serialize_dates=True)

    # Insert header row
    rows.insert(0, self.headers())

    csv_writer = agate.writer(output, **kwargs)
    csv_writer.writerows(rows)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    rows, column_names, column_ids = self.get_rows_and_column_names_and_column_ids(**self.reader_kwargs)

    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[column_id] for column_id in column_ids])

    for row in rows:
        out_row = [row[column_id] if column_id < len(row) else None for column_id in column_ids]

        if not self.args.delete_empty or ''.join(out_row):
            output.writerow(out_row)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    rows = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)

        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = dict((c, pattern) for c in column_ids)

    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for row in filter_reader:
        output.writerow(row)
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary.
    If it is a dictionary, a key must be provided which is an item of
    the dictionary which contains a list.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(js, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        js = js[key]

    fields = []
    flat = []

    for obj in js:
        parsed_object = parse_object(obj)
        flat.append(parsed_object)

        for key in parsed_object.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = agate.writer(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
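# Usage sketch (illustrative only; the payload is hypothetical). With a top-level
# dictionary, 'key' must name the item that holds the list of objects.
import json
import six

_payload = {'results': [{'a': 1, 'b': 2}, {'a': 3, 'c': 4}]}

# Columns are the union of keys in order of first appearance: a, b, c.
print(json2csv(six.StringIO(json.dumps(_payload)), key='results'))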
def ndjson2csv(f, key=None, **kwargs):
    """
    Convert a "newline-delimited JSON" document into CSV format.

    Each line of the input is parsed as a separate JSON object; the union of
    their keys, in order of first appearance, becomes the CSV header.
    """
    first_line = f.readline()
    first_row = json.loads(first_line, object_pairs_hook=OrderedDict)

    js = itertools.chain((first_row, ), (json.loads(l, object_pairs_hook=OrderedDict) for l in f))

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = agate.writer(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
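# Usage sketch (illustrative only; the two-line input is hypothetical). Each input
# line is an independent JSON object.
import six

_ndjson = '{"a": 1, "b": 2}\n{"a": 3, "c": 4}\n'

# Header: a, b, c; keys missing from a given object are written as empty cells.
print(ndjson2csv(six.StringIO(_ndjson)))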
def fixed2csv(f, schema, output=None, **kwargs):
    """
    Convert a fixed-width file to csv using a CSV-formatted schema description.

    A schema CSV must start with a header row with (at least) columns labeled
    "column", "start", and "length". (Other columns will be ignored.) For each
    subsequent row, those columns will be used to identify a column name, the
    starting index of the column (an integer), and the length of the column
    (also an integer).

    Values in the 'start' column are assumed to be zero-based, unless the first
    value for 'start' is 1, in which case all values are assumed to be one-based.

    If output is specified, rows will be written to that object, otherwise the
    complete data will be returned.
    """
    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    try:
        encoding = kwargs['encoding']
    except KeyError:
        encoding = None

    writer = agate.writer(output)
    reader = FixedWidthReader(f, schema, encoding=encoding)
    writer.writerows(reader)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
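# Usage sketch (illustrative only; the schema and data are hypothetical). The schema
# follows the "column","start","length" layout described in the docstring above.
import six

_schema = six.StringIO('column,start,length\nname,0,5\nage,5,3\n')
_data = six.StringIO('alice 42\nbob    7\n')

# Expected rows (assuming FixedWidthReader strips the padding): name,age / alice,42 / bob,7
print(fixed2csv(_data, _schema))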
def main(self):
    connection_string = self.args.connection_string
    do_insert = self.args.insert
    query = self.args.query

    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if self.args.table_names:
        table_names = self.args.table_names.split(',')
    else:
        table_names = []

    # If one or more filenames are specified, we need to add stdin ourselves (if available)
    if sys.stdin not in self.input_files:
        try:
            if not sys.stdin.isatty():
                self.input_files.insert(0, sys.stdin)
        except:
            pass

    # Create an SQLite database in memory if no connection string is specified
    if query and not connection_string:
        connection_string = "sqlite:///:memory:"
        do_insert = True

    if self.args.dialect and connection_string:
        self.argparser.error('The --dialect option is only valid when --db is not specified.')

    if do_insert and not connection_string:
        self.argparser.error('The --insert option is only valid when --db is also specified.')

    if self.args.no_create and not do_insert:
        self.argparser.error('The --no-create option is only valid if --insert is also specified.')

    # Establish database validity before reading CSV files
    if connection_string:
        try:
            engine, metadata = sql.get_connection(connection_string)
        except ImportError:
            raise ImportError('You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at:\n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

        conn = engine.connect()
        trans = conn.begin()

    for f in self.input_files:
        try:
            # Try to use name specified via --table
            table_name = table_names.pop(0)
        except IndexError:
            if f == sys.stdin:
                table_name = "stdin"
            else:
                # Use filename as table name
                table_name = os.path.splitext(os.path.split(f.name)[1])[0]

        csv_table = table.Table.from_csv(
            f,
            name=table_name,
            snifflimit=self.args.snifflimit,
            blanks_as_nulls=(not self.args.blanks),
            infer_types=(not self.args.no_inference),
            no_header_row=self.args.no_header_row,
            **self.reader_kwargs
        )

        f.close()

        if connection_string:
            sql_table = sql.make_table(
                csv_table,
                table_name,
                self.args.no_constraints,
                self.args.db_schema,
                metadata
            )

            # Create table
            if not self.args.no_create:
                sql_table.create()

            # Insert data
            if do_insert and csv_table.count_rows() > 0:
                insert = sql_table.insert()
                headers = csv_table.headers()
                conn.execute(insert, [dict(zip(headers, row)) for row in csv_table.to_rows()])
        # Output SQL statements
        else:
            sql_table = sql.make_table(csv_table, table_name, self.args.no_constraints)
            self.output_file.write('%s\n' % sql.make_create_table_statement(sql_table, dialect=self.args.dialect))

    if connection_string:
        if query:
            # Execute specified SQL queries
            queries = query.split(';')
            rows = None

            for q in queries:
                if q:
                    rows = conn.execute(q)

            # Output result of last query as CSV
            try:
                output = agate.writer(self.output_file, **self.writer_kwargs)

                if not self.args.no_header_row:
                    output.writerow(rows._metadata.keys)

                for row in rows:
                    output.writerow(row)
            except AttributeError:
                pass

        trans.commit()
        conn.close()
def main(self):
    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype

        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        filetype = 'fixed'
    elif self.args.key:
        filetype = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')

        filetype = convert.guess_format(self.args.input_path)

        if not filetype:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)

    # Set the reader's arguments.
    kwargs = {}

    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        raise ValueError('schema must not be null when format is "fixed"')

    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet

    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)

        # Streaming CSV mustn't set sniff_limit, but non-streaming should.
        if not self.args.no_inference:
            kwargs['sniff_limit'] = self.args.sniff_limit

        if self.args.no_header_row:
            kwargs['header'] = False
    elif self.args.no_inference:
        # Streaming CSV mustn't set column_types, but other formats should.
        kwargs['column_types'] = agate.TypeTester(limit=0)

    # Convert the file.
    if filetype == 'csv' and self.args.no_inference:
        reader = agate.reader(self.input_file, **self.reader_kwargs)
        writer = agate.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet', None))
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet', None))
        elif filetype == 'dbf':
            if not hasattr(self.input_file, 'name'):
                raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')

            table = agate.Table.from_dbf(self.input_file.name, **kwargs)

        table.to_csv(self.output_file)
def main(self):
    reader = agate.reader(self.input_file, **self.reader_kwargs)
    writer = agate.writer(self.output_file, **self.writer_kwargs)
    writer.writerows(reader)
def main(self):
    connection_string = self.args.connection_string
    do_insert = self.args.insert
    query = self.args.query

    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if self.args.table_names:
        table_names = self.args.table_names.split(',')
    else:
        table_names = []

    # If one or more filenames are specified, we need to add stdin ourselves (if available)
    if sys.stdin not in self.input_files:
        try:
            if not sys.stdin.isatty():
                self.input_files.insert(0, sys.stdin)
        except:
            pass

    # Create an SQLite database in memory if no connection string is specified
    if query and not connection_string:
        connection_string = "sqlite:///:memory:"
        do_insert = True

    if self.args.dialect and connection_string:
        self.argparser.error('The --dialect option is only valid when --db is not specified.')

    if do_insert and not connection_string:
        self.argparser.error('The --insert option is only valid when --db is also specified.')

    if self.args.no_create and not do_insert:
        self.argparser.error('The --no-create option is only valid if --insert is also specified.')

    # Establish database validity before reading CSV files
    if connection_string:
        try:
            engine, metadata = sql.get_connection(connection_string)
        except ImportError:
            raise ImportError('You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at:\n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

        conn = engine.connect()
        trans = conn.begin()

    for f in self.input_files:
        try:
            # Try to use name specified via --table
            table_name = table_names.pop(0)
        except IndexError:
            if f == sys.stdin:
                table_name = "stdin"
            else:
                # Use filename as table name
                table_name = os.path.splitext(os.path.split(f.name)[1])[0]

        csv_table = table.Table.from_csv(
            f,
            name=table_name,
            sniff_limit=self.args.sniff_limit,
            blanks_as_nulls=(not self.args.blanks),
            infer_types=(not self.args.no_inference),
            no_header_row=self.args.no_header_row,
            **self.reader_kwargs
        )

        f.close()

        if csv_table:
            if connection_string:
                sql_table = sql.make_table(
                    csv_table,
                    table_name,
                    self.args.no_constraints,
                    self.args.db_schema,
                    metadata
                )

                # Create table
                if not self.args.no_create:
                    sql_table.create()

                # Insert data
                if do_insert and csv_table.count_rows() > 0:
                    insert = sql_table.insert()
                    headers = csv_table.headers()
                    conn.execute(insert, [dict(zip(headers, row)) for row in csv_table.to_rows()])
            # Output SQL statements
            else:
                sql_table = sql.make_table(csv_table, table_name, self.args.no_constraints)
                self.output_file.write('%s\n' % sql.make_create_table_statement(sql_table, dialect=self.args.dialect))

    if connection_string:
        if query:
            # Execute specified SQL queries
            queries = query.split(';')
            rows = None

            for q in queries:
                if q:
                    rows = conn.execute(q)

            # Output result of last query as CSV
            try:
                output = agate.writer(self.output_file, **self.writer_kwargs)

                if not self.args.no_header_row:
                    output.writerow(rows._metadata.keys)

                for row in rows:
                    output.writerow(row)
            except AttributeError:
                pass

        trans.commit()
        conn.close()
def main(self):
    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if len(self.input_files) < 2:
        self.argparser.error('You must specify at least two files to join.')

    if self.args.columns:
        join_column_names = self._parse_join_column_names(self.args.columns)

        if len(join_column_names) == 1:
            join_column_names = join_column_names * len(self.input_files)

        if len(join_column_names) != len(self.input_files):
            self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.')

    if (self.args.left_join or self.args.right_join or self.args.outer_join) and not self.args.columns:
        self.argparser.error('You must provide join column names when performing an outer join.')

    if self.args.left_join and self.args.right_join:
        self.argparser.error('It is not valid to specify both a left and a right join.')

    tables = []

    for f in self.input_files:
        tables.append(list(agate.reader(f, **self.reader_kwargs)))
        f.close()

    join_column_ids = []

    if self.args.columns:
        for i, t in enumerate(tables):
            join_column_ids.append(match_column_identifier(t[0], join_column_names[i]))

    jointab = []

    if self.args.left_join:
        # Left outer join
        jointab = tables[0]

        for i, t in enumerate(tables[1:]):
            jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
    elif self.args.right_join:
        # Right outer join
        jointab = tables[-1]

        remaining_tables = tables[:-1]
        remaining_tables.reverse()

        for i, t in enumerate(remaining_tables):
            jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1])
    elif self.args.outer_join:
        # Full outer join
        jointab = tables[0]

        for i, t in enumerate(tables[1:]):
            jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
    else:
        if self.args.columns:
            # Inner join
            jointab = tables[0]

            for i, t in enumerate(tables[1:]):
                jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
        else:
            jointab = tables[0]

            # Sequential join
            for t in tables[1:]:
                jointab = join.sequential_join(jointab, t)

    output = agate.writer(self.output_file, **self.writer_kwargs)

    for row in jointab:
        output.writerow(row)