def main(self):
    """Group rows by the requested columns and write aggregated output.

    Reads the header, resolves the grouping columns and the requested
    (function, columns) aggregations, then streams grouped results to the
    output writer.
    """
    if self.args.names_only:
        self.print_column_names()
        return

    # Read in header and rows
    reader = CSVKitReader(self.input_file, **self.reader_kwargs)
    # Built-in next() works on both Python 2 and 3; .next() is Py2-only.
    column_names = next(reader)

    # Determine columns to group by, default to no grouping.
    if self.args.columns is None:
        grouped_columns_ids = []
    else:
        grouped_columns_ids = parse_column_identifiers(self.args.columns,
                                                       column_names,
                                                       self.args.zero_based)

    # Build one aggregator instance per (function, column) pair.  The
    # original used a Py2-only tuple-unpacking lambda, which is a
    # SyntaxError under Python 3.
    aggregations = []
    try:
        for fun, cols in self.args.aggregations:
            col_ids = parse_column_identifiers(cols, column_names,
                                               self.args.zero_based)
            for col in col_ids:
                aggregations.append(aggregate_functions[fun](col))
    except KeyError:
        self.argparser.error("Wrong aggregator function. Available: " +
                             ', '.join(aggregate_functions.keys()))

    # Write the output
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    for row in group_rows(column_names, reader, grouped_columns_ids,
                          aggregations):
        output.writerow(row)
def main(self):
    """Apply a Python expression to the selected columns of every row.

    The user-supplied --map-expr is compiled into a function f(x); cells in
    the selected columns are replaced by f(cell), other columns pass
    through unchanged.
    """
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    all_column_ids = parse_column_identifiers(None, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)
    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based,
                                          self.args.not_columns)
    # Set for O(1) per-cell membership tests in the row loop below.
    mapped_ids = set(column_ids)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in all_column_ids])

    d = {}  # namespace dict for map_expr
    # exec() function form works on Python 2 and 3; the original
    # `exec "..." in d` statement is a SyntaxError under Python 3.
    # NOTE: this executes a user-supplied expression -- acceptable for a
    # local CLI tool, never for untrusted input.
    exec("def f(x): return %s" % (self.args.map_expr), d)

    for row in rows:
        out_row = []
        for c in all_column_ids:
            if c in mapped_ids:
                out_row.append(d['f'](row[c]) if c < len(row) else None)
            else:
                out_row.append(row[c] if c < len(row) else None)
        output.writerow(out_row)
def main(self):
    """Emit each row only once per distinct key of the uniq columns."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names,
                                          self.args.zero_based)
    uniq_column_id = parse_column_identifiers(self.args.uniq_column,
                                              column_names,
                                              self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    seen = set()  # cache of keys for rows already emitted

    for row in rows:
        # Build the tuple key once per row (the original computed it
        # twice), and use set.add() rather than update([...]).
        key = tuple(row[i] for i in uniq_column_id)
        if key in seen:
            continue
        seen.add(key)
        out_row = [row[c] if c < len(row) else None for c in column_ids]
        output.writerow(out_row)
def test_range_notation_open_ended(self):
    """Open-ended ranges expand to the remaining column indices."""
    self.assertEqual([0, 1, 2], parse_column_identifiers(':3', self.headers))
    # protect against devs adding to self.headers -- build the expected
    # list dynamically.  list() is required: a Py3 range object has no
    # insert() method and never compares equal to a list.
    target = list(range(3, len(self.headers)))
    target.insert(0, 0)
    self.assertEqual(target, parse_column_identifiers('1,4:', self.headers))
    self.assertEqual(list(range(0, len(self.headers))),
                     parse_column_identifiers('1:', self.headers))
def test_parse_column_identifiers(self):
    """Mixed name/number identifiers resolve; column_offset=0 shifts numbers."""
    ident = 'i_work_here,1,name'
    self.assertEqual([2, 0, 1],
                     parse_column_identifiers(ident, self.headers))
    self.assertEqual([2, 1, 1],
                     parse_column_identifiers(ident, self.headers,
                                              column_offset=0))
def test_parse_column_identifiers(self):
    """Padded identifiers resolve the same with and without zero_based."""
    raw = ' i_work_here, 1,name '
    for expected, kwargs in (([2, 0, 1], {}),
                             ([2, 1, 1], {'zero_based': True})):
        self.assertEqual(expected,
                         parse_column_identifiers(raw, self.headers,
                                                  **kwargs))
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = six.StringIO(contents) rows = CSVKitReader(f, **kwargs) if no_header_row: # Peek at a row to infer column names from row = next(rows) headers = make_default_headers(len(row)) column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] data_columns = [[] for c in headers] # Put row back on top rows = itertools.chain([row], rows) else: headers = next(rows) if column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] for i, row in enumerate(rows): for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name)
def test_range_notation_open_ended(self):
    """Open-ended ranges expand to trailing column indices."""
    self.assertEqual([0, 1, 2], parse_column_identifiers(':3', self.headers))
    # protect against devs adding to self.headers.
    # list() is required: a Py3 range object has no insert() method and
    # never compares equal to a list.
    # NOTE(review): the `- 1` drops the last header from the expected
    # result -- confirm this is intended and not an off-by-one (a sibling
    # version of this test uses len(self.headers) with no - 1).
    target = list(range(3, len(self.headers) - 1))
    target.insert(0, 0)
    self.assertEqual(target, parse_column_identifiers('1,4:', self.headers))
def test_range_notation(self):
    """Closed ranges and mixed name/range identifiers resolve correctly."""
    cases = [
        ([0, 1, 2], '1:3'),
        ([1, 2, 3], '2-4'),
        ([0, 1, 2, 3], '1,2:4'),
        ([4, 2, 5], 'more-header-values,3,stuff'),
    ]
    for expected, ident in cases:
        self.assertEqual(expected,
                         parse_column_identifiers(ident, self.headers))
def main(self):
    """Write the selected columns, optionally dropping all-empty rows."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based,
                                          self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]
        if self.args.delete_empty:
            # Short rows yield None placeholders, which made the old
            # ''.join(out_row) test raise TypeError; any() treats None
            # and '' alike.
            if not any(out_row):
                continue
        output.writerow(out_row)
def main(self):
    """Sort the input CSV by the selected columns and write it out."""
    if self.args.names_only:
        print_column_names(self.args.file, self.output_file,
                           **self.reader_kwargs)
        return

    # Derive the table name from the filename, unless reading stdin.
    if self.args.file.name != '<stdin>':
        filename = os.path.split(self.args.file.name)[1]
        table_name = os.path.splitext(filename)[0]
    else:
        table_name = 'csvsql_table'

    tab = table.Table.from_csv(self.args.file, name=table_name,
                               snifflimit=self.args.snifflimit,
                               **self.reader_kwargs)
    sort_ids = parse_column_identifiers(self.args.columns, tab.headers())

    rows = tab.to_rows(serialize_dates=True)
    rows.sort(key=lambda record: [record[c] for c in sort_ids],
              reverse=self.args.reverse)
    rows.insert(0, tab.headers())

    writer = CSVKitWriter(self.output_file, **self.writer_kwargs)
    for record in rows:
        writer.writerow(record)
def main(self):
    """Sort rows by the selected columns, grouping None values first."""
    if self.args.names_only:
        self.print_column_names()
        return

    # Derive a table name from the input filename when not reading stdin.
    if self.input_file.name != '<stdin>':
        base = os.path.split(self.input_file.name)[1]
        table_name = os.path.splitext(base)[0]
    else:
        table_name = 'csvsql_table'

    tab = table.Table.from_csv(
        self.input_file,
        name=table_name,
        snifflimit=self.args.snifflimit,
        no_header_row=self.args.no_header_row,
        infer_types=(not self.args.no_inference),
        **self.reader_kwargs
    )
    column_ids = parse_column_identifiers(self.args.columns, tab.headers(),
                                          self.args.zero_based)

    def sort_key(r):
        # (False, None) sorts ahead of (True, value), so missing values
        # group before real ones.
        return [(r[c] is not None, r[c]) for c in column_ids]

    rows = tab.to_rows(serialize_dates=True)
    rows.sort(key=sort_key, reverse=self.args.reverse)
    rows.insert(0, tab.headers())

    output = agate.writer(self.output_file, **self.writer_kwargs)
    for row in rows:
        output.writerow(row)
def main(self):
    """Filter rows whose selected columns match a pattern.

    The predicate is a compiled regex (-r), membership in a match file
    (-f), or a literal pattern (-m), applied via FilteringCSVReader.
    """
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.regex and not self.args.pattern and not self.args.matchfile:
        self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    # Built-in next() instead of the Python-2-only .next() method.
    column_names = next(rows)
    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = dict((c, pattern) for c in column_ids)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns,
                                       inverse=self.args.inverse)

    # The enumerate() index in the original loop was never used.
    for row in filter_reader:
        output.writerow(row)
def main(self):
    """Emit only rows whose selected columns satisfy the match predicate."""
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)
    column_names = next(rows)
    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based)

    # Choose the match predicate: compiled regex, match-file membership,
    # or literal pattern string.
    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = {c: pattern for c in column_ids}

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filtered = FilteringCSVReader(rows, header=False, patterns=patterns,
                                  inverse=self.args.inverse)
    for row in filtered:
        output.writerow(row)
def main(self):
    """Sort rows by selected columns, treating None as the empty string."""
    if self.args.names_only:
        self.print_column_names()
        return

    if self.input_file.name != '<stdin>':
        # Use filename as table name
        table_name = os.path.splitext(
            os.path.split(self.input_file.name)[1])[0]
    else:
        table_name = 'csvsql_table'

    tab = table.Table.from_csv(self.input_file,
                               name=table_name,
                               snifflimit=self.args.snifflimit,
                               no_header_row=self.args.no_header_row,
                               infer_types=(not self.args.no_inference),
                               **self.reader_kwargs)
    key_ids = parse_column_identifiers(self.args.columns, tab.headers(),
                                       self.args.zero_based)

    def sort_key(record):
        # Substitute '' for None so rows with missing values still compare.
        return [record[c] if record[c] is not None else '' for c in key_ids]

    rows = tab.to_rows(serialize_dates=True)
    rows.sort(key=sort_key, reverse=self.args.reverse)
    rows.insert(0, tab.headers())

    writer = CSVKitWriter(self.output_file, **self.writer_kwargs)
    for record in rows:
        writer.writerow(record)
def main(self):
    """Filter rows by matching the selected columns against a predicate."""
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    rows = agate.reader(self.input_file, **self.reader_kwargs)
    column_names = next(rows)
    search_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based)

    # Build the predicate: compiled regex, membership in a match file,
    # or a literal pattern string.
    if self.args.regex:
        predicate = re.compile(self.args.regex)
    elif self.args.matchfile:
        wanted = set(line.rstrip() for line in self.args.matchfile)
        predicate = lambda x: x in wanted
    else:
        predicate = self.args.pattern

    patterns = {c: predicate for c in search_ids}

    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    for row in FilteringCSVReader(rows, header=False, patterns=patterns,
                                  inverse=self.args.inverse):
        output.writerow(row)
def main(self):
    """Filter rows whose selected columns match the given pattern."""
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.regex and not self.args.pattern and not self.args.matchfile:
        self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    # Built-in next() instead of the Python-2-only .next() method.
    column_names = next(rows)
    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based)

    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        # A set gives O(1) membership per cell; the original list was
        # O(n) per test.
        lines = set(line.rstrip() for line in self.args.matchfile)
        pattern = lambda x: x in lines
    else:
        pattern = self.args.pattern

    patterns = dict((c, pattern) for c in column_ids)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns,
                                       inverse=self.args.inverse)

    # The enumerate() index in the original loop was never used.
    for row in filter_reader:
        output.writerow(row)
def main(self):
    """Write selected columns, optionally skipping rows with no data."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based,
                                          self.args.not_columns)
    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]
        if self.args.delete_empty:
            # out_row may contain None for short rows; the old
            # ''.join(out_row) test raised TypeError on None, so test
            # emptiness with any() instead.
            if not any(out_row):
                continue
        output.writerow(out_row)
def main(self):
    """Keep (or drop) rows for which a Python expression evaluates true."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        first = next(rows)
        column_names = make_default_headers(len(first))
        # Put the peeked row back on top.
        rows = itertools.chain([first], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names,
                                          self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    # write header
    output.writerow([column_names[c] for c in column_ids])

    def float_or_else(value):
        # Coerce to float when possible so numeric comparisons work.
        try:
            return float(value)
        except ValueError:
            return value

    def namespace(row):
        return {name: float_or_else(cell)
                for name, cell in zip(column_names, row)}

    def projected(row):
        return [row[c] if c < len(row) else None for c in column_ids]

    # NOTE: eval() of a user-supplied expression -- fine for a local CLI
    # tool, never for untrusted input.
    if self.args.filter_expr:
        for row in rows:
            if eval(self.args.filter_expr, namespace(row)):
                output.writerow(projected(row))
    elif self.args.not_filter_expr:
        for row in rows:
            if not eval(self.args.not_filter_expr, namespace(row)):
                output.writerow(projected(row))
def main(self):
    """Strip leading and trailing whitespace from every selected cell."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based,
                                          self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    # Raw strings: '\s' in a plain literal is an invalid escape sequence
    # (DeprecationWarning today, an error in future Python versions).
    drop_white = lambda i: re.sub(r'\s+$', '', re.sub(r'^\s+', '', i))

    for row in rows:
        out_row = [drop_white(row[c]) if c < len(row) else None
                   for c in column_ids]
        output.writerow(out_row)
def test_range_notation(self):
    """Range identifiers honor the zero_based flag."""
    cases = [
        ('1:3', [0, 1, 2], [1, 2, 3]),
        ('2-4', [1, 2, 3], [2, 3, 4]),
        ('1,2:4', [0, 1, 2, 3], [1, 2, 3, 4]),
        ('more-header-values,3,stuff', [4, 2, 5], [4, 3, 5]),
    ]
    for ident, one_based, zero_based in cases:
        self.assertEqual(one_based,
                         parse_column_identifiers(ident, self.headers))
        self.assertEqual(zero_based,
                         parse_column_identifiers(ident, self.headers,
                                                  zero_based=True))
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() if snifflimit: sample = contents[:snifflimit] else: sample = contents dialect = sniffer.sniff_dialect(sample) f = StringIO(contents) reader = CSVKitReader(f, dialect=dialect, **kwargs) headers = reader.next() if column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] for row in reader: for i, d in enumerate(row): try: data_columns[i].append(row[column_ids[i]].strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append( Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls)) return Table(columns, name=name)
def main(self):
    """Sort the table by the selected columns and write the result."""
    if self.args.names_only:
        self.print_column_names()
        return

    table = agate.Table.from_csv(self.input_file,
                                 sniff_limit=self.args.sniff_limit,
                                 header=not self.args.no_header_row,
                                 column_types=self.get_column_types(),
                                 **self.reader_kwargs)

    key_ids = parse_column_identifiers(self.args.columns,
                                       table.column_names,
                                       column_offset=self.get_column_offset())
    ordered = table.order_by(key_ids, reverse=self.args.reverse)
    ordered.to_csv(self.output_file, **self.writer_kwargs)
def main(self):
    """Apply a sed-style expression to the selected columns of each row."""
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)
    # Built-in next() works on Python 2 and 3; .next() is Py2-only.
    cnames = next(reader)
    cids = parse_column_identifiers(self.args.columns, cnames,
                                    self.args.zero_based)
    # Map every selected column index to the same expression.
    mods = {idx: self.args.expr for idx in cids}
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    reader = sed.CsvFilter(reader, mods, header=False)
    output.writerow(cnames)
    for row in reader:
        output.writerow(row)
def main(self):
    """Run the sed-style expression over each selected column."""
    reader = CSVKitReader(self.args.file, **self.reader_kwargs)
    # next() replaces the Python-2-only .next() method.
    cnames = next(reader)
    cids = parse_column_identifiers(self.args.columns, cnames,
                                    self.args.zero_based)
    # One modification expression per selected column index.
    mods = {idx: self.args.expr for idx in cids}
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    reader = sed.CsvFilter(reader, mods, header=False)
    output.writerow(cnames)
    for row in reader:
        output.writerow(row)
def main(self):
    """Drop rows that duplicate a previously seen key of the uniq columns."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names,
                                          self.args.zero_based)
    uniq_column_id = parse_column_identifiers(self.args.uniq_column,
                                              column_names,
                                              self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    emitted = set()  # keys of rows already written

    for row in rows:
        # Compute the tuple key once per row (the original built it twice)
        # and use set.add() instead of update([...]).
        key = tuple(row[i] for i in uniq_column_id)
        if key in emitted:
            continue
        emitted.add(key)
        output.writerow([row[c] if c < len(row) else None
                         for c in column_ids])
def main(self):
    """Write selected columns as LDIF entries keyed by a DN attribute."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based,
                                          self.args.not_columns)
    dn_att_id = parse_column_identifiers(self.args.uid, column_names,
                                         self.args.zero_based,
                                         self.args.not_columns)

    output = LDIFWriter(self.output_file)

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]
        if self.args.delete_empty:
            # Short rows yield None placeholders; the old ''.join(out_row)
            # test raised TypeError on them, so use any() instead.
            if not any(out_row):
                continue
        zipped_row = zip(column_names, map(lambda x: [x], out_row))
        # Drop attributes with missing/empty values (`is not None` rather
        # than the non-idiomatic `!= None`).
        zipped_row = list(v for v in zipped_row
                          if v[1] is not None and v[1][0] is not None
                          and v[1][0] != '')
        dn = self.args.uid + "=" + out_row[dn_att_id[0]] + "," + self.args.basedn
        output.unparse(dn, zipped_row)
def main(self):
    """Sort rows by the selected columns, grouping None values first."""
    if self.args.names_only:
        self.print_column_names()
        return

    if self.args.no_inference:
        column_types = agate.TypeTester(limit=0)
    else:
        column_types = None

    table = agate.Table.from_csv(self.input_file,
                                 sniff_limit=self.args.sniff_limit,
                                 header=not self.args.no_header_row,
                                 column_types=column_types,
                                 **self.reader_kwargs)

    column_ids = parse_column_identifiers(self.args.columns,
                                          table.column_names,
                                          self.args.zero_based)

    def sort_key(row):
        # (False, None) sorts ahead of (True, value), so rows with
        # missing values group before rows with real ones.
        return [(row[cid] is not None, row[cid]) for cid in column_ids]

    table = table.order_by(sort_key, reverse=self.args.reverse)
    table.to_csv(self.output_file, **self.writer_kwargs)
def main(self):
    """Sort the CSV; stdin is buffered first because sorting needs it all."""
    if self.args.names_only:
        self.print_column_names()
        return

    # Otherwise, fails with "io.UnsupportedOperation: underlying stream
    # is not seekable".  We can't sort without reading the entire input.
    if self.input_file == sys.stdin:
        self.input_file = six.StringIO(sys.stdin.read())

    table = agate.Table.from_csv(self.input_file,
                                 sniff_limit=self.args.sniff_limit,
                                 header=not self.args.no_header_row,
                                 column_types=self.get_column_types(),
                                 **self.reader_kwargs)

    sort_ids = parse_column_identifiers(self.args.columns,
                                        table.column_names,
                                        column_offset=self.get_column_offset())
    ordered = table.order_by(sort_ids, reverse=self.args.reverse)
    ordered.to_csv(self.output_file, **self.writer_kwargs)
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, type_inference=True, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() if snifflimit: sample = contents[:snifflimit] else: sample = contents dialect = sniffer.sniff_dialect(sample) normal_type = kwargs.pop("normal_type", InvalidType) f = StringIO(contents) reader = CSVKitReader(f, dialect=dialect, **kwargs) headers = reader.next() if column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] for row in reader: for i, d in enumerate(row): try: data_columns[i].append(row[column_ids[i]].strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, type_inference=type_inference, normal_type=normal_type)) return Table(columns, name=name)
def main(self):
    """Write the selected columns, optionally dropping all-empty rows."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    # Built-in next() instead of the Python-2-only .next() method.
    column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based,
                                          self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]
        if self.args.delete_empty:
            # Short rows yield None placeholders, which made the old
            # ''.join(out_row) test raise TypeError; any() treats None
            # and '' alike.
            if not any(out_row):
                continue
        output.writerow(out_row)
def main(self):
    """Sort the input table by the requested columns."""
    if self.args.names_only:
        self.print_column_names()
        return

    if self.additional_input_expected():
        self.argparser.error(
            'You must provide an input file or piped data.')

    source = agate.Table.from_csv(self.input_file,
                                  skip_lines=self.args.skip_lines,
                                  sniff_limit=self.args.sniff_limit,
                                  column_types=self.get_column_types(),
                                  **self.reader_kwargs)
    order_ids = parse_column_identifiers(self.args.columns,
                                         source.column_names,
                                         self.get_column_offset())
    source.order_by(order_ids, reverse=self.args.reverse).to_csv(
        self.output_file, **self.writer_kwargs)
def main(self):
    """Emit one SQL INSERT statement per non-empty CSV row."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based,
                                          self.args.not_columns)
    columns_type = self.parse_column_types(self.args.columns_types,
                                           column_ids)
    # Materialize as a list: under Python 3 a map() iterator would be
    # exhausted after the first row's join(), leaving later INSERTs with
    # an empty column list.
    column_names = [column_names[c] for c in column_ids]

    output = self.output_file

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]
        # Skip rows with no data; None placeholders for short rows made
        # the old ''.join(out_row) test raise TypeError.
        if not any(out_row):
            continue
        # NOTE: values are interpolated directly into the SQL text --
        # acceptable for generating scripts from trusted CSVs, but this
        # is not injection-safe for untrusted input.
        insert_stat = "INSERT INTO " + self.args.tablename + "("
        insert_stat += ",".join(column_names)
        insert_stat += ") VALUES ("
        insert_stat += ",".join(columns_type[colid](row[colid])
                                for colid in column_ids)
        insert_stat += ");"
        output.write('%s\n' % insert_stat)
def main(self):
    """Filter rows by evaluating a user expression over named cells."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        peeked = next(rows)
        column_names = make_default_headers(len(peeked))
        # Put the row back on top
        rows = itertools.chain([peeked], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names,
                                          self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    # write header
    output.writerow([column_names[c] for c in column_ids])

    def float_or_else(x):
        # Coerce to float when possible so numeric comparisons work.
        try:
            return float(x)
        except ValueError:
            return x

    # Pick the active expression and whether its result is negated.
    if self.args.filter_expr:
        expr, negate = self.args.filter_expr, False
    elif self.args.not_filter_expr:
        expr, negate = self.args.not_filter_expr, True
    else:
        return

    # NOTE: eval() of a user-supplied expression; safe only for local use.
    for row in rows:
        env = {name: float_or_else(cell)
               for name, cell in zip(column_names, row)}
        keep = bool(eval(expr, env))
        if negate:
            keep = not keep
        if keep:
            output.writerow([row[c] if c < len(row) else None
                             for c in column_ids])
def main(self):
    """Project the chosen columns, optionally dropping all-empty rows."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.args.file, **self.reader_kwargs)
    # next() replaces the Python-2-only .next() method.
    column_names = next(rows)

    keep_ids = parse_column_identifiers(self.args.columns, column_names,
                                        self.args.zero_based,
                                        self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in keep_ids])

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in keep_ids]
        if self.args.delete_empty and not any(out_row):
            # any() treats the None placeholders for short rows as empty,
            # where the old ''.join() test raised TypeError on them.
            continue
        output.writerow(out_row)
def main(self):
    """Trim surrounding whitespace from every selected cell."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names,
                                          self.args.zero_based,
                                          self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    def drop_white(value):
        # Raw strings: '\s' in a plain literal is an invalid escape
        # sequence (DeprecationWarning, an error in future Pythons).
        return re.sub(r'\s+$', '', re.sub(r'^\s+', '', value))

    for row in rows:
        out_row = [drop_white(row[c]) if c < len(row) else None
                   for c in column_ids]
        output.writerow(out_row)
def main(self):
    """Sort rows of the input CSV by the selected columns."""
    if self.args.names_only:
        self.print_column_names()
        return

    # Use the filename as the table name unless input is stdin.
    table_name = 'csvsql_table'
    if self.args.file.name != '<stdin>':
        table_name = os.path.splitext(
            os.path.split(self.args.file.name)[1])[0]

    tab = table.Table.from_csv(self.args.file, name=table_name,
                               snifflimit=self.args.snifflimit,
                               **self.reader_kwargs)
    sort_ids = parse_column_identifiers(self.args.columns, tab.headers(),
                                        self.args.zero_based)

    rows = tab.to_rows(serialize_dates=True)
    rows.sort(key=lambda record: [record[c] for c in sort_ids],
              reverse=self.args.reverse)
    rows.insert(0, tab.headers())

    writer = CSVKitWriter(self.output_file, **self.writer_kwargs)
    for record in rows:
        writer.writerow(record)
def main(self):
    """Rename selected source columns and copy rows through."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    # Make Headers
    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the row back on top
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    # Project Column Names.  (A stray unused `import sys` sat here in the
    # original and has been removed.)
    target_names = self.args.renames.split(',')
    source_column_ids = parse_column_identifiers(
        self.args.sources, column_names, zero_based=self.args.zero_based)

    # The original `assert cond and "msg"` never displayed its message
    # (the string was just and-ed into the condition) and asserts are
    # stripped under -O; report a proper argument error instead.
    if len(target_names) != len(source_column_ids):
        self.argparser.error(
            'Input sources and rename columns must be the same length!')

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    # print header from target
    output.writerow(target_names)

    # Rewrite Rows
    for row in rows:
        out_row = [row[c] if c < len(row) else None
                   for c in source_column_ids]
        output.writerow(out_row)
def main(self):
    """Order the table rows by the chosen columns and write them out."""
    if self.args.names_only:
        self.print_column_names()
        return

    if self.additional_input_expected():
        self.argparser.error('You must provide an input file or piped data.')

    tbl = agate.Table.from_csv(self.input_file,
                               skip_lines=self.args.skip_lines,
                               sniff_limit=self.args.sniff_limit,
                               column_types=self.get_column_types(),
                               **self.reader_kwargs)

    ids = parse_column_identifiers(self.args.columns, tbl.column_names,
                                   self.get_column_offset())

    ordered = tbl.order_by(ids, reverse=self.args.reverse)
    ordered.to_csv(self.output_file, **self.writer_kwargs)
def test_parse_column_identifiers(self):
    """Whitespace around identifiers is ignored; zero_based shifts numbers."""
    padded = ' i_work_here, 1,name '
    self.assertEqual([2, 0, 1],
                     parse_column_identifiers(padded, self.headers))
    self.assertEqual([2, 1, 1],
                     parse_column_identifiers(padded, self.headers,
                                              zero_based=True))
def test_parse_column_identifiers(self):
    """Names and one-based numbers resolve; column_offset=0 keeps numbers as-is."""
    for expected, kwargs in (([2, 0, 1], {}),
                             ([2, 1, 1], {'column_offset': 0})):
        self.assertEqual(expected,
                         parse_column_identifiers('i_work_here,1,name',
                                                  self.headers, **kwargs))
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = six.StringIO(contents) rows = agate.reader(f, **kwargs) try: if no_header_row: # Peek at a row to infer column names from, and put it back on top row = next(rows) rows = itertools.chain([row], rows) headers = make_default_headers(len(row)) else: headers = next(rows) except StopIteration: # The file is `/dev/null`. headers = [] pass if no_header_row or column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] width = len(data_columns) for i, row in enumerate(rows): j = 0 for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break j += 1 # Populate remaining columns with None while j < width: data_columns[j].append(None) j += 1 columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name)
def main(self):
    """Compute and print statistics for the selected columns.

    Depending on flags, emits: just the column names (-n), a bare row
    count (--count), a single statistic per column (--mean, --median,
    ...), a CSV of all statistics (--csv), or the full human-readable
    stats report.
    """
    if self.args.names_only:
        self.print_column_names()
        return

    # Operations requested via their individual --<op>-only flags.
    operations = [op for op in OPERATIONS.keys()
                  if getattr(self.args, op + '_only')]

    if len(operations) > 1:
        self.argparser.error('Only one operation argument may be specified (--mean, --median, etc).')

    if operations and self.args.csv_output:
        self.argparser.error('You may not specify --csv and an operation (--mean, --median, etc) at the same time.')

    if operations and self.args.count_only:
        self.argparser.error('You may not specify --count and an operation (--mean, --median, etc) at the same time.')

    if six.PY2:
        # Python 2 file objects need an explicit UTF-8 writer wrapper.
        self.output_file = codecs.getwriter('utf-8')(self.output_file)

    if self.args.count_only:
        # Count raw rows without parsing the whole table; subtract the
        # header row unless the input has none.
        count = len(list(agate.csv.reader(self.input_file)))

        if not self.args.no_header_row:
            count -= 1

        self.output_file.write('Row count: %i\n' % count)

        return

    table = agate.Table.from_csv(
        self.input_file,
        skip_lines=self.args.skip_lines,
        sniff_limit=self.args.sniff_limit,
        **self.reader_kwargs
    )

    column_ids = parse_column_identifiers(
        self.args.columns,
        table.column_names,
        self.get_column_offset()
    )

    kwargs = {}

    if self.args.freq_count:
        kwargs['freq_count'] = self.args.freq_count

    # Output a single stat
    if operations:
        if len(column_ids) == 1:
            # A single column omits the label prefix.
            self.print_one(table, column_ids[0], operations[0],
                           label=False, **kwargs)
        else:
            for column_id in column_ids:
                self.print_one(table, column_id, operations[0], **kwargs)
    else:
        # Compute every statistic for every selected column.
        stats = {}

        for column_id in column_ids:
            stats[column_id] = self.calculate_stats(table, column_id,
                                                    **kwargs)

        # Output as CSV
        if self.args.csv_output:
            self.print_csv(table, column_ids, stats)
        # Output all stats
        else:
            self.print_stats(table, column_ids, stats)