def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other converters, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    streaming = True if output else False

    book = load_workbook(f, use_iterators=True, data_only=True)
    sheets = book.get_sheet_names()
    fname = os.path.splitext(f.name)[0]

    for name in sheets:
        outputfname = fname + "_" + name.replace(" ", "") + ".csv"
        of = open(outputfname, 'w')
        sheet = book.get_sheet_by_name(name)

        if not streaming:
            output = six.StringIO()

        writer = CSVKitWriter(output)

        for i, row in enumerate(sheet.iter_rows()):
            if i == 0:
                writer.writerow([c.value for c in row])
                continue

            out_row = []

            for c in row:
                value = c.value

                if value.__class__ is datetime.datetime:
                    # Handle default XLSX date as 00:00 time
                    if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                        value = value.time()
                        value = normalize_datetime(value)
                    elif value.time() == NULL_TIME:
                        value = value.date()
                    else:
                        value = normalize_datetime(value)
                elif value.__class__ is float:
                    if value % 1 == 0:
                        value = int(value)

                if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                    value = value.isoformat()

                out_row.append(value)

            writer.writerow(out_row)

        if not streaming:
            data = output.getvalue()
            of.write(data)
            of.close()

    # Return empty string when streaming
    return ''
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            if ''.join(out_row) == '':
                continue

        output.writerow(out_row)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    # Read in header and rows
    reader = CSVKitReader(self.input_file, **self.reader_kwargs)
    column_names = reader.next()

    # Determine columns to group by, default to all columns
    if self.args.columns is None:
        grouped_columns_ids = []
    else:
        grouped_columns_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    aggregations = []
    try:
        for (fun, cols) in map(lambda (f, cols): (f, parse_column_identifiers(cols, column_names, self.args.zero_based)), self.args.aggregations):
            for col in cols:
                aggregations.append(aggregate_functions[fun](col))
    except KeyError:
        self.argparser.error("Wrong aggregator function. Available: " + ', '.join(aggregate_functions.keys()))

    # Write the output
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    for row in group_rows(column_names, reader, grouped_columns_ids, aggregations):
        output.writerow(row)
def log_errors(self, rows):
    """
    Log any errors to a CSV file
    """
    # Make sure the log directory exists
    os.path.exists(self.log_dir) or os.makedirs(self.log_dir)

    # Log writer
    log_path = os.path.join(
        self.log_dir,
        self.file_name.lower().replace("tsv", "errors.csv")
    )
    log_file = open(log_path, 'w')
    log_writer = CSVKitWriter(log_file, quoting=csv.QUOTE_ALL)

    # Add the headers
    log_writer.writerow([
        'Line number',
        'Headers len',
        'Fields len',
        'Line value'
    ])

    # Log out the rows
    log_writer.writerows(rows)

    # Shut it down
    log_file.close()
def main(self):
    try:
        engine, metadata = sql.get_connection(self.args.connection_string)
    except ImportError:
        raise ImportError('You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at:\n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

    conn = engine.connect()

    if self.args.query:
        query = self.args.query.strip()
    else:
        query = ""

        for line in self.args.file:
            query += line

    rows = conn.execute(query)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    if not self.args.no_header_row:
        output.writerow(rows._metadata.keys)

    for row in rows:
        output.writerow(row)

    conn.close()
def main(self):
    if self.args.names_only:
        print_column_names(self.args.file, self.output_file, **self.reader_kwargs)
        return

    if self.args.file.name != '<stdin>':
        # Use filename as table name
        table_name = os.path.splitext(os.path.split(self.args.file.name)[1])[0]
    else:
        table_name = 'csvsql_table'

    tab = table.Table.from_csv(self.args.file, name=table_name, snifflimit=self.args.snifflimit, **self.reader_kwargs)

    column_ids = parse_column_identifiers(self.args.columns, tab.headers())

    rows = tab.to_rows(serialize_dates=True)
    rows.sort(key=lambda r: [r[c] for c in column_ids], reverse=self.args.reverse)

    rows.insert(0, tab.headers())

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in rows:
        output.writerow(row)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if self.input_file.name != '<stdin>':
        # Use filename as table name
        table_name = os.path.splitext(os.path.split(self.input_file.name)[1])[0]
    else:
        table_name = 'csvsql_table'

    tab = table.Table.from_csv(
        self.input_file,
        name=table_name,
        snifflimit=self.args.snifflimit,
        no_header_row=self.args.no_header_row,
        infer_types=(not self.args.no_inference),
        **self.reader_kwargs
    )

    column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based)

    rows = tab.to_rows(serialize_dates=True)

    sorter = lambda r: [r[c] if r[c] is not None else '' for c in column_ids]
    rows.sort(key=sorter, reverse=self.args.reverse)

    rows.insert(0, tab.headers())

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in rows:
        output.writerow(row)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.regex and not self.args.pattern and not self.args.matchfile:
        self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    column_names = rows.next()

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = dict((c, pattern) for c in column_ids)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for i, row in enumerate(filter_reader):
        output.writerow(row)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if self.input_file.name != '<stdin>':
        # Use filename as table name
        table_name = os.path.splitext(os.path.split(self.input_file.name)[1])[0]
    else:
        table_name = 'csvsql_table'

    tab = table.Table.from_csv(
        self.input_file,
        name=table_name,
        snifflimit=self.args.snifflimit,
        no_header_row=self.args.no_header_row,
        infer_types=(not self.args.no_inference),
        **self.reader_kwargs
    )

    column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based)

    rows = tab.to_rows(serialize_dates=True)

    sorter = lambda r: [r[c] if r[c] is not None else '' for c in column_ids]
    rows.sort(key=sorter, reverse=self.args.reverse)

    rows.insert(0, tab.headers())

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in rows:
        output.writerow(row)
def fixed2csv(f, schema, output=None, **kwargs):
    """
    Convert a fixed-width file to csv using a CSV-formatted schema description.

    A schema CSV must start with a header row with (at least) columns labeled
    "column", "start", and "length". (Other columns will be ignored.) For each
    subsequent row, therefore, those columns will be used to identify a column
    name, the starting index of the column (an integer), and the length of the
    column (also an integer).

    Values in the 'start' column are assumed to be zero-based, unless the
    first value for 'start' is 1, in which case all values are assumed to be
    one-based.

    If output is specified, rows will be written to that object, otherwise
    the complete data will be returned.
    """
    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    try:
        encoding = kwargs['encoding']
    except KeyError:
        encoding = None

    writer = CSVKitWriter(output)

    reader = FixedWidthReader(f, schema, encoding=encoding)
    writer.writerows(reader)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
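# Usage sketch for fixed2csv above (illustrative; 'data.fixed' and
# 'schema.csv' are hypothetical file names, and fixed2csv is assumed to be
# importable along with its dependencies). A minimal schema CSV might look
# like:
#
#   column,start,length
#   name,0,10
#   amount,10,8
#
# With no output argument the CSV text is returned; passing a file-like
# object streams rows to it and returns an empty string.
with open('schema.csv') as schema, open('data.fixed') as f:
    csv_text = fixed2csv(f, schema, encoding='utf-8')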
def main(self):
    if len(self.args.files) < 2:
        self.argparser.error('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.args.files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.args.files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.args.files):
        rows = CSVKitReader(f, **self.reader_kwargs)
        headers = rows.next()

        if i == 0:
            if groups:
                headers.insert(0, group_name)
            output.writerow(headers)

        for row in rows:
            if groups:
                row.insert(0, groups[i])
            output.writerow(row)
def main(self):
    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if len(self.input_files) < 2:
        self.argparser.error('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.input_files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.input_files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.input_files):
        rows = CSVKitReader(f, **self.reader_kwargs)

        # If we have header rows, use them
        if not self.args.no_header_row:
            headers = next(rows, [])

            if i == 0:
                if groups:
                    headers.insert(0, group_name)
                output.writerow(headers)
        # If we don't, generate simple column names based on the first row
        else:
            row = next(rows, [])
            headers = make_default_headers(len(row))

            if i == 0:
                if groups:
                    headers.insert(0, group_name)
                output.writerow(headers)

            if groups:
                row.insert(0, groups[i])
            output.writerow(row)

        for row in rows:
            if groups:
                row.insert(0, groups[i])
            output.writerow(row)

        f.close()
def main(self):
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        for row in checker.checked_rows():
            pass

        if checker.errors:
            for e in checker.errors:
                self.output_file.write("Line %i: %s\n" % (e.line_number, e.msg))
        else:
            self.output_file.write("No errors.\n")

        if checker.joins:
            self.output_file.write("%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n" % (checker.rows_joined, checker.joins))
    else:
        base, ext = splitext(self.args.file.name)

        with open("%s_out.csv" % base, "w") as f:
            clean_writer = CSVKitWriter(f, **self.writer_kwargs)

            checker = RowChecker(reader)
            clean_writer.writerow(checker.column_names)

            for row in checker.checked_rows():
                clean_writer.writerow(row)

        if checker.errors:
            error_filename = "%s_err.csv" % base

            with open(error_filename, "w") as f:
                error_writer = CSVKitWriter(f, **self.writer_kwargs)

                error_header = ["line_number", "msg"]
                error_header.extend(checker.column_names)
                error_writer.writerow(error_header)

                error_count = len(checker.errors)

                for e in checker.errors:
                    error_writer.writerow(self._format_error_row(e))

            self.output_file.write("%i error%s logged to %s\n" % (error_count, "" if error_count == 1 else "s", error_filename))
        else:
            self.output_file.write("No errors.\n")

        if checker.joins:
            self.output_file.write("%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n" % (checker.rows_joined, checker.joins))
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other converters, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    streaming = True if output else False

    if not streaming:
        output = StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True, data_only=True)

    if 'sheet' in kwargs:
        sheet = book.get_sheet_by_name(kwargs['sheet'])
    else:
        sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                    value = value.time()
                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
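# Usage sketch for the xlsx2csv variant above (hypothetical file and sheet
# names; assumes xlsx2csv and its openpyxl-based dependencies are
# importable). The optional 'sheet' keyword selects a worksheet by name;
# otherwise the active sheet is converted. Buffered and streaming calls:
import sys

with open('report.xlsx', 'rb') as f:
    csv_text = xlsx2csv(f, sheet='Sheet1')  # returns the CSV text

with open('report.xlsx', 'rb') as f:
    xlsx2csv(f, output=sys.stdout)  # streams rows, returns ''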
def main(self):
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)
    cnames = reader.next()
    cids = parse_column_identifiers(self.args.columns, cnames, self.args.zero_based)
    mods = {idx: self.args.expr for idx in cids}

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    reader = sed.CsvFilter(reader, mods, header=False)

    output.writerow(cnames)
    for row in reader:
        output.writerow(row)
def main(self):
    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if len(self.input_files) < 2:
        self.argparser.error('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.input_files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.input_files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.input_files):
        rows = CSVKitReader(f, **self.reader_kwargs)
        #headers = next(rows, [])

        # If we have header rows, use them
        if not self.args.no_header_row:
            headers = next(rows, [])

            if i == 0:
                if groups:
                    headers.insert(0, group_name)
                output.writerow(headers)
        # If we don't, generate simple column names based on the first row
        else:
            row = next(rows, [])
            headers = make_default_headers(len(row))

            if i == 0:
                if groups:
                    headers.insert(0, group_name)
                output.writerow(headers)

            if groups:
                row.insert(0, groups[i])
            output.writerow(row)

        for row in rows:
            if groups:
                row.insert(0, groups[i])
            output.writerow(row)

        f.close()
def to_csv(self, output, **kwargs):
    """
    Serializes the table to CSV and writes it to any file-like object.
    """
    rows = self.to_rows(serialize_dates=True)

    # Insert header row
    rows.insert(0, self.headers())

    writer = CSVKitWriter(output, **kwargs)
    writer.writerows(rows)
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other converters, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    streaming = True if output else False

    if not streaming:
        output = StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True)
    sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.internal_value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.internal_value

            if value.__class__ is datetime.datetime:
                if value.time() != NULL_TIME:
                    value = normalize_datetime(value)
                else:
                    value = value.date()
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            if value == "\0":
                continue

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ""
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.script:
        self.argparser.error("At least one script -s must be defined.")

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    script_reader = ScriptCSVReader(rows, scripts=self.args.script, zero_based=self.args.zero_based)

    for i, row in enumerate(script_reader):
        output.writerow(row)
def main(self):
    rows = CSVKitReader(self.args.file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = rows.next()
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = rows.next()

    column_names = self.args.columns.split(',')

    part_count = 0
    output = CSVKitWriter(open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w'), **self.writer_kwargs)
    output.writerow(column_names)

    count = 0
    for row in rows:
        if (self.args.lines > 0) and (count == self.args.lines):
            part_count += 1
            count = 0

            # couldn't find a better way to close the file
            del output

            output = CSVKitWriter(open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w'), **self.writer_kwargs)
            output.writerow(column_names)

        output.writerow(row)
        count += 1
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the
    dictionary which contains a list.
    """
    document = f.read()
    js = json.loads(document)

    if isinstance(js, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        js = js[key]

    if not isinstance(js, list):
        raise TypeError('Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).')

    field_set = set()
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

    for obj in flat:
        field_set.update(obj.keys())

    fields = sorted(list(field_set))

    o = six.StringIO()
    writer = CSVKitWriter(o, **kwargs)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            if field in i:
                row.append(i[field])
            else:
                row.append(None)

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
def main(self):
    tabname = os.path.splitext(os.path.basename(self.args.file._lazy_args[0]))[0]
    tab = table.Table.from_csv(self.args.file, name=tabname, **self.reader_kwargs)
    stmt = make_create_table_statement(make_table(tab), dialect='sqlite')

    conn = sqlite3.connect(':memory:')
    c = conn.cursor()
    c.execute(stmt)

    for row in tab.to_rows():
        vals = ','.join(['?'] * len(row))
        prepared = "INSERT INTO %s VALUES(%s)" % (tab.name, vals)
        c.execute(prepared, row)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for row in c.execute(self.args.query):
        output.writerow(row)
def sheet2csv(sheet, output=None):
    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    writer = CSVKitWriter(output)

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                    value = value.time()
                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the
    dictionary which contains a list.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(js, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        js = js[key]

    if not isinstance(js, list):
        raise TypeError('Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).')

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

    for obj in js:
        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
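# Usage sketch for json2csv above (illustrative input; assumes json2csv and
# parse_object are importable). A top-level list converts directly; a
# top-level dict needs `key` to name the member that holds the list. Fields
# are collected in first-seen order and missing values come out empty, so
# this input should yield roughly "a,b,c\n1,2,\n3,,4\n" (exact quoting
# depends on CSVKitWriter defaults).
import io

doc = io.StringIO('{"results": [{"a": 1, "b": 2}, {"a": 3, "c": 4}]}')
csv_text = json2csv(doc, key='results')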
def main(self):
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    # write header
    output.writerow([column_names[c] for c in column_ids])

    def float_or_else(x):
        try:
            return float(x)
        except ValueError:
            return x

    if self.args.filter_expr:
        for row in rows:
            d = {i: float_or_else(j) for i, j in zip(column_names, row)}
            if eval(self.args.filter_expr, d):
                out_row = [row[c] if c < len(row) else None for c in column_ids]
                output.writerow(out_row)
    elif self.args.not_filter_expr:
        for row in rows:
            d = {i: float_or_else(j) for i, j in zip(column_names, row)}
            if not eval(self.args.not_filter_expr, d):
                out_row = [row[c] if c < len(row) else None for c in column_ids]
                output.writerow(out_row)
def json2csv(f, field=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the
    dictionary which contains a list.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(js, dict):
        if not field:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        js = js[field]

    fields = []
    flats = []

    for obj in js:
        (flat_fields, flat_key_values_dict) = parse_object(obj)
        flats.append(flat_key_values_dict)

        for field in flat_fields:
            if field not in fields:
                fields.append(field)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flats:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
def download_to_csv(event_id):
    f = StringIO()
    writer = CSVKitWriter(f)

    # List headers
    event = Event.query.get(event_id)
    headers = []
    for question in event.questions:
        headers.append(question.label)
    writer.writerow(headers)

    # List entries for each registration
    for registration in event.registrations:
        data = []
        for answer in registration.answers:
            data.append(answer.value)
        writer.writerow(data)

    return Response(f.getvalue(), mimetype='text/csv')
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    column_names = rows.next()

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for i, row in enumerate(rows):
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            if ''.join(out_row) == '':
                continue

        output.writerow(out_row)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)
    column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = dict((c, pattern) for c in column_ids)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for row in filter_reader:
        output.writerow(row)
def main(self):
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)
    uniq_column_id = parse_column_identifiers(self.args.uniq_column, column_names, self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    d = set()  # cache for used rows
    # use tuple as keys for cache
    cache_key = lambda row: tuple([row[i] for i in uniq_column_id])

    for row in rows:
        key = cache_key(row)

        if key in d:
            continue

        d.add(key)

        out_row = [row[c] if c < len(row) else None for c in column_ids]
        output.writerow(out_row)
def main(self):
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    # strip leading and trailing whitespace from a value
    drop_white = lambda i: re.sub(r'\s+$', '', re.sub(r'^\s+', '', i))

    for row in rows:
        out_row = [drop_white(row[c]) if c < len(row) else None for c in column_ids]
        output.writerow(out_row)
def ndjson2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    Supports both JSON and "Newline-delimited JSON".

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the
    dictionary which contains a list.
    """
    first_line = f.readline()
    first_row = json.loads(first_line, object_pairs_hook=OrderedDict)
    js = itertools.chain((first_row, ), (json.loads(l, object_pairs_hook=OrderedDict) for l in f))

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
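# Usage sketch for ndjson2csv above (illustrative input; assumes ndjson2csv
# and parse_object are importable): one JSON object per line. The first line
# is parsed eagerly, then the rest are chained in lazily, so large inputs
# are consumed line by line rather than loaded whole.
import io

lines = io.StringIO('{"a": 1, "b": 2}\n{"a": 3, "c": 4}\n')
csv_text = ndjson2csv(lines)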
def main(self):
    if len(self.args.files) < 2:
        sys.exit('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.args.files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.args.files):
            sys.exit('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.args.files):
        rows = CSVKitReader(f, **self.reader_kwargs)
        headers = rows.next()

        if i == 0:
            if groups:
                headers.insert(0, group_name)
            output.writerow(headers)

        for row in rows:
            if groups:
                row.insert(0, groups[i])
            output.writerow(row)
def main(self):
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        for row in checker.checked_rows():
            pass

        if checker.errs:
            for e in checker.errs:
                self.output_file.write("Line %i: %s\n" % (e.line_number, e.msg))
        else:
            self.output_file.write("No errors.\n")

        if checker.joins:
            self.output_file.write("%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n" % (checker.rows_joined, checker.joins))
    else:
        base, ext = splitext(self.args.file.name)

        # should we preserve delimiters and other dialect args from CLI?
        cleaned_file = CSVKitWriter(open("%s_out.csv" % base, "w"), **self.writer_kwargs)

        checker = RowChecker(reader)

        cleaned_file.writerow(checker.column_names)

        for row in checker.checked_rows():
            cleaned_file.writerow(row)

        if checker.errs:
            # should we preserve delimiters and other dialect args from CLI?
            err_filename = "%s_err.csv" % base
            err_file = CSVKitWriter(open(err_filename, "w"), **self.writer_kwargs)

            err_header = ['line_number', 'msg']
            err_header.extend(checker.column_names)

            err_file.writerow(err_header)
            for e in checker.errs:
                err_file.writerow(self._format_error_row(e))

            err_count = len(checker.errs)
            self.output_file.write("%i error%s logged to %s\n" % (err_count, "" if err_count == 1 else "s", err_filename))
        else:
            self.output_file.write("No errors.\n")

        if checker.joins:
            self.output_file.write("%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n" % (checker.rows_joined, checker.joins))
def main(self):
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    all_column_ids = parse_column_identifiers(None, column_names, self.args.zero_based, self.args.not_columns)
    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in all_column_ids])

    d = {}  # namespace dict for map_expr
    exec "def f(x): return %s" % (self.args.map_expr) in d

    for row in rows:
        out_row = []

        for c in all_column_ids:
            if c in column_ids:
                out_row.append(d['f'](row[c]) if c < len(row) else None)
            else:
                out_row.append(row[c] if c < len(row) else None)

        output.writerow(out_row)
def main(self):
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.regex and not self.args.pattern and not self.args.matchfile:
        self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    column_names = rows.next()

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = [line.rstrip() for line in self.args.matchfile]
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = dict((c, pattern) for c in column_ids)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for i, row in enumerate(filter_reader):
        output.writerow(row)
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other converters, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True, data_only=True)

    if 'sheet' in kwargs:
        sheet = book.get_sheet_by_name(kwargs['sheet'])
    else:
        sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                    value = value.time()
                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
def run(*args):
    avibase_index = 'http://avibase.bsc-eoc.org/checklist.jsp?list=eBird'

    list_name = args[0]
    language = synonyms[args[1]]

    response = urllib2.urlopen(avibase_index)
    html = lxml.html.fromstring(response.read())
    links = html.cssselect('.AVBregions td a')

    names = []

    for link in links:
        if link.text == list_name:
            checklist_url = 'http://avibase.bsc-eoc.org/' + link.attrib['href']
            if language != 'EN':
                checklist_url += '&synlang=%s' % language

            response = urllib2.urlopen(checklist_url)
            html = lxml.html.fromstring(response.read())

            for row in html.cssselect('.AVBlist tr.highlight1'):
                cells = row.cssselect('td')
                scientific_name = cells[1].cssselect('i')[0].text

                if language == 'EN':
                    common_name = cells[0].text
                else:
                    common_name = cells[2].text

                if common_name:
                    common_name = common_name.encode('latin_1').decode('utf-8')
                else:
                    common_name = scientific_name

                names.append((scientific_name, common_name))

    suffix = language.lower()

    with open('species_names_%s.csv' % suffix, 'wb') as fp:
        writer = CSVKitWriter(fp)
        writer.writerow(('scientific_name', 'common_name_%s' % suffix))
        for name in names:
            writer.writerow(name)
def main(self):
    try:
        engine, metadata = sql.get_connection(self.args.connection_string)
    except ImportError:
        raise ImportError('You don\'t appear to have the necessary database backend installed for the connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at:\n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

    conn = engine.connect()

    if self.args.query:
        query = self.args.query.strip()
    else:
        query = ""

        for line in self.args.file:
            query += line

    rows = conn.execute(query)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    if not self.args.no_header_row:
        output.writerow(rows._metadata.keys)

    for row in rows:
        output.writerow(row)

    conn.close()
def handle(self, *args, **options):
    self.cursor = connection.cursor()

    sql = """
        SELECT DISTINCT
            o.name,
            o.seat,
            f.filer_id_raw,
            f.xref_filer_id,
            f.name,
            f.party
        FROM %(candidate)s as c
        INNER JOIN %(office)s as o
        ON c.office_id = o.id
        INNER JOIN %(filer)s as f
        ON c.filer_id = f.id
    """ % dict(
        candidate=models.Candidate._meta.db_table,
        office=models.Office._meta.db_table,
        filer=models.Filer._meta.db_table,
    )
    self.cursor.execute(sql)

    writer = CSVKitWriter(open("./candidates.csv", 'wb'))
    writer.writerow([
        'office_name',
        'office_seat',
        'filer_id',
        'xref_filer_id',
        'name',
        'party'
    ])
    writer.writerows(self.cursor.fetchall())
def geojson2csv(f, key=None, **kwargs):
    """
    Convert a GeoJSON document into CSV format.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if not isinstance(js, dict):
        raise TypeError('JSON document is not valid GeoJSON: Root element is not an object.')

    if 'type' not in js:
        raise TypeError('JSON document is not valid GeoJSON: No top-level "type" key.')

    if js['type'] != 'FeatureCollection':
        raise TypeError('Only GeoJSON with root FeatureCollection type is supported. Not %s' % js['type'])

    if 'features' not in js:
        raise TypeError('JSON document is not a valid FeatureCollection: No top-level "features" key.')

    features = js['features']

    features_parsed = []  # tuples in the format (id, properties, geometry)
    property_fields = []

    for feature in features:
        geoid = feature.get('id', None)

        properties = feature.get('properties') or {}

        for prop in properties.keys():
            if prop not in property_fields:
                property_fields.append(prop)

        geometry = json.dumps(feature['geometry'])

        features_parsed.append((geoid, properties, geometry))

    header = ['id']
    header.extend(property_fields)
    header.append('geojson')

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(header)

    for geoid, properties, geometry in features_parsed:
        row = [geoid]

        for field in property_fields:
            row.append(properties.get(field, None))

        row.append(geometry)

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
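# Usage sketch for geojson2csv above (illustrative input; assumes
# geojson2csv is importable). Only a root FeatureCollection is accepted;
# each feature becomes a row holding its id, its properties, and the
# geometry re-serialized as JSON in a "geojson" column. The header for this
# input would be id,name,geojson.
import io
import json

fc = io.StringIO(json.dumps({
    "type": "FeatureCollection",
    "features": [{
        "type": "Feature",
        "id": "f1",
        "properties": {"name": "Point A"},
        "geometry": {"type": "Point", "coordinates": [0.0, 0.0]},
    }],
}))
csv_text = geojson2csv(fc)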
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other converters, this one allows output columns to contain
    mixed data types. Blank headers are also possible.
    """
    streaming = True if output else False

    if not streaming:
        output = StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True)
    sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.internal_value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.internal_value

            if value.__class__ is datetime.datetime:
                if value.time() != NULL_TIME:
                    value = normalize_datetime(value)
                else:
                    value = value.date()
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            if value == "\0":
                continue

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the
    dictionary which contains a list.
    """
    document = f.read()
    js = json.loads(document)

    if isinstance(js, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        js = js[key]

    if not isinstance(js, list):
        raise TypeError('Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).')

    field_set = set()
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

    for obj in flat:
        field_set.update(obj.keys())

    fields = sorted(list(field_set))

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            if field in i:
                row.append(i[field])
            else:
                row.append(None)

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the
    dictionary which contains a list.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(js, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        js = js[key]

    if not isinstance(js, list):
        raise TypeError('Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).')

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

    for obj in js:
        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
def main(self):
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))

        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    # write header
    output.writerow([column_names[c] for c in column_ids])

    def float_or_else(x):
        try:
            return float(x)
        except ValueError:
            return x

    if self.args.filter_expr:
        for row in rows:
            d = {i: float_or_else(j) for i, j in zip(column_names, row)}
            if eval(self.args.filter_expr, d):
                out_row = [row[c] if c < len(row) else None for c in column_ids]
                output.writerow(out_row)
    elif self.args.not_filter_expr:
        for row in rows:
            d = {i: float_or_else(j) for i, j in zip(column_names, row)}
            if not eval(self.args.not_filter_expr, d):
                out_row = [row[c] if c < len(row) else None for c in column_ids]
                output.writerow(out_row)
def writeCsv(self, filename=None):
    rows = self.getRows()

    if filename is None:
        filename = self.name + '.csv'

    # Output result of last query as CSV
    row_count = 0
    with open(filename, 'wb') as out:
        output = CSVKitWriter(out)
        output.writerow(rows._metadata.keys)

        for row in rows:
            output.writerow(row)
            row_count += 1

    tool.VERBOSE('wrote {} row(s) to csv {}', row_count, filename)
def ndjson2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    Supports both JSON and "Newline-delimited JSON".

    The top-level element of the input must be a list or a dictionary. If it
    is a dictionary, a key must be provided which is an item of the
    dictionary which contains a list.
    """
    first_line = f.readline()
    first_row = json.loads(first_line, object_pairs_hook=OrderedDict)
    js = itertools.chain((first_row, ), (json.loads(l, object_pairs_hook=OrderedDict) for l in f))

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
def download_to_csv(event_id):
    """Downloads the registration data to a CSV"""
    f = StringIO()
    writer = CSVKitWriter(f)

    # List headers
    event = Event.query.get(event_id)
    headers = []
    for question in event.questions:
        headers.append(question.label)
    writer.writerow(headers)

    # List entries for each registration
    for registration in event.registrations:
        data = []
        sorted_answers = sorted(registration.answers, key=lambda a: a.question.ordinal)
        for answer in sorted_answers:
            data.append(answer.value)
        writer.writerow(data)

    return Response(f.getvalue(), mimetype='text/csv')
def clean(self, name):
    """
    Cleans the provided source TSV file and writes it out in CSV format
    """
    if self.verbosity > 2:
        self.log(" Cleaning %s" % name)

    # Up the CSV data limit
    csv.field_size_limit(1000000000)

    # Input and output paths
    tsv_path = os.path.join(self.tsv_dir, name)
    csv_path = os.path.join(self.csv_dir, name.lower().replace("tsv", "csv"))

    # Writer
    csv_file = open(csv_path, 'w')
    csv_writer = CSVKitWriter(csv_file, quoting=csv.QUOTE_ALL)

    # Reader
    tsv_file = open(tsv_path, 'rb')

    # Pull and clean the headers
    try:
        headers = tsv_file.readline()
    except StopIteration:
        return
    headers = headers.decode("ascii", "replace")
    headers_csv = CSVKitReader(StringIO(headers), delimiter=str('\t'))
    try:
        headers_list = next(headers_csv)
    except StopIteration:
        return
    headers_count = len(headers_list)
    csv_writer.writerow(headers_list)

    log_rows = []

    # Loop through the rest of the data
    line_number = 1
    for tsv_line in tsv_file:
        # Goofing around with the encoding while we're in there.
        tsv_line = tsv_line.decode("ascii", "replace")
        if six.PY2:
            tsv_line = tsv_line.replace('\ufffd', '?')

        # Nuke any null bytes
        null_bytes = tsv_line.count('\x00')
        if null_bytes:
            tsv_line = tsv_line.replace('\x00', ' ')

        # Nuke ASCII 26 char, the "substitute character"
        # or chr(26) in python
        sub_char = tsv_line.count('\x1a')
        if sub_char:
            tsv_line = tsv_line.replace('\x1a', '')

        # Split on tabs so we can later spit it back out as CSV
        # and remove extra newlines while we are there.
        csv_field_list = tsv_line.replace("\r\n", "").split("\t")

        # Check if our values line up with our headers
        # and if not, see if CSVkit can sort out the problems
        if not len(csv_field_list) == headers_count:
            csv_field_list = next(CSVKitReader(StringIO(tsv_line), delimiter=str('\t')))

            if not len(csv_field_list) == headers_count:
                if self.verbosity > 2:
                    msg = ' Bad parse of line %s (%s headers, %s values)'
                    self.failure(msg % (line_number, len(headers_list), len(csv_field_list)))
                log_rows.append([
                    line_number,
                    len(headers_list),
                    len(csv_field_list),
                    ','.join(csv_field_list)
                ])
                continue

        # Write out the row
        csv_writer.writerow(csv_field_list)
        line_number += 1

    # Log errors if there are any
    if log_rows:
        if self.verbosity > 1:
            msg = ' %s errors'
            self.failure(msg % (len(log_rows) - 1))
        self.log_errors(name, log_rows)

    # Shut it down
    tsv_file.close()
    csv_file.close()
def main(self):
    reader = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        for row in checker.checked_rows():
            pass

        if checker.errors:
            for e in checker.errors:
                self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
    else:
        base, ext = splitext(self.input_file.name)

        with open('%s_out.csv' % base, 'w') as f:
            clean_writer = CSVKitWriter(f, **self.writer_kwargs)

            checker = RowChecker(reader)
            clean_writer.writerow(checker.column_names)

            for row in checker.checked_rows():
                clean_writer.writerow(row)

        if checker.errors:
            error_filename = '%s_err.csv' % base

            with open(error_filename, 'w') as f:
                error_writer = CSVKitWriter(f, **self.writer_kwargs)

                error_header = ['line_number', 'msg']
                error_header.extend(checker.column_names)
                error_writer.writerow(error_header)

                error_count = len(checker.errors)

                for e in checker.errors:
                    error_writer.writerow(self._format_error_row(e))

            self.output_file.write('%i error%s logged to %s\n' % (error_count, '' if error_count == 1 else 's', error_filename))
        else:
            self.output_file.write('No errors.\n')

        if checker.joins:
            self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins))
def main(self):
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)

    if self.args.dryrun:
        checker = RowChecker(reader)

        for row in checker.checked_rows():
            pass

        if checker.errs:
            for e in checker.errs:
                self.output_file.write("Line %i: %s\n" % (e.line_number, e.msg))
        else:
            self.output_file.write("No errors.\n")

        if checker.joins:
            self.output_file.write("%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n" % (checker.rows_joined, checker.joins))
    else:
        base, ext = splitext(self.args.file.name)

        # should we preserve delimiters and other dialect args from CLI?
        cleaned_file = CSVKitWriter(open("%s_out.csv" % base, "w"), **self.writer_kwargs)

        checker = RowChecker(reader)

        cleaned_file.writerow(checker.column_names)

        for row in checker.checked_rows():
            cleaned_file.writerow(row)

        if checker.errs:
            # should we preserve delimiters and other dialect args from CLI?
            err_filename = "%s_err.csv" % base
            err_file = CSVKitWriter(open(err_filename, "w"), **self.writer_kwargs)

            err_header = ['line_number', 'msg']
            err_header.extend(checker.column_names)

            err_file.writerow(err_header)
            for e in checker.errs:
                err_file.writerow(self._format_error_row(e))

            err_count = len(checker.errs)
            self.output_file.write("%i error%s logged to %s\n" % (err_count, "" if err_count == 1 else "s", err_filename))
        else:
            self.output_file.write("No errors.\n")

        if checker.joins:
            self.output_file.write("%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n" % (checker.rows_joined, checker.joins))