def main(self):
    """Select columns from the input CSV and write them to the output CSV."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            # Bug fix: out_row may contain None for short rows, which made
            # ''.join(out_row) raise TypeError. any() treats None and ''
            # alike, preserving the "all cells empty" test.
            if not any(out_row):
                continue

        output.writerow(out_row)
def main(self):
    """Stack two or more CSV files vertically, optionally adding a grouping column."""
    self.input_files = []

    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    if len(self.input_files) < 2:
        self.argparser.error('You must specify at least two files to stack.')

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.input_files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.input_files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
    else:
        groups = None

    group_name = self.args.group_name if self.args.group_name else 'group'
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for i, f in enumerate(self.input_files):
        rows = CSVKitReader(f, **self.reader_kwargs)

        # Removed dead commented-out code and deduplicated the header logic
        # that was repeated in both branches of the original.
        if not self.args.no_header_row:
            # Real header row: consume it; data starts with the next row.
            headers = next(rows, [])
            leading_data = []
        else:
            # No header: peek at the first row to size the generated
            # headers, then treat that row as ordinary data.
            row = next(rows, [])
            headers = make_default_headers(len(row))
            leading_data = [row]

        # Write the (possibly group-prefixed) header once, for the first file.
        if i == 0:
            if groups:
                headers.insert(0, group_name)
            output.writerow(headers)

        for row in itertools.chain(leading_data, rows):
            if groups:
                row.insert(0, groups[i])
            output.writerow(row)

        f.close()
def main(self):
    """Copy selected columns to output, trimming leading and trailing whitespace from every cell."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    # Bug fix: the patterns were not raw strings ('\s' is an invalid escape
    # sequence in modern Python). Also hoist compilation out of the row loop.
    leading_ws = re.compile(r'^\s+')
    trailing_ws = re.compile(r'\s+$')

    def drop_white(value):
        # Strip leading then trailing whitespace, as the original chained subs did.
        return trailing_ws.sub('', leading_ws.sub('', value))

    for row in rows:
        out_row = [drop_white(row[c]) if c < len(row) else None for c in column_ids]
        output.writerow(out_row)
def main(self):
    """Split the input CSV into '.part.N' files of at most --lines rows, repeating the header in every part."""
    rows = CSVKitReader(self.args.file, **self.reader_kwargs)

    if self.args.no_header_row:
        # Bug fix: rows.next() is Python 2 only; use the next() builtin.
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    # NOTE(review): the header read above is discarded in favor of the
    # user-supplied column list — preserved from the original behavior.
    column_names = self.args.columns.split(',')

    part_count = 0
    # Bug fix: keep a handle on the underlying file so each part can be
    # closed explicitly, instead of `del output` relying on GC for cleanup.
    part_file = open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w')
    output = CSVKitWriter(part_file, **self.writer_kwargs)
    output.writerow(column_names)

    count = 0

    for row in rows:
        if (self.args.lines > 0) and (count == self.args.lines):
            part_count += 1
            count = 0
            part_file.close()
            part_file = open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w')
            output = CSVKitWriter(part_file, **self.writer_kwargs)
            output.writerow(column_names)

        output.writerow(row)
        count += 1

    part_file.close()
def main(self):
    """Emit only the first row seen for each distinct value of the uniqueness column(s)."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)
    uniq_column_id = parse_column_identifiers(self.args.uniq_column, column_names, self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    seen = set()  # cache of uniqueness keys already emitted

    for row in rows:
        # Fix: the key tuple was computed twice per row (once for the
        # membership test, once for the insert); compute it once.
        key = tuple(row[i] for i in uniq_column_id)

        if key in seen:
            continue

        seen.add(key)
        out_row = [row[c] if c < len(row) else None for c in column_ids]
        output.writerow(out_row)
def main(self):
    """Keep (or drop, with --not-filter) rows for which a user-supplied Python expression is true."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    # write header
    output.writerow([column_names[c] for c in column_ids])

    def float_or_else(x):
        """Coerce a cell to float when possible so numeric comparisons work."""
        try:
            return float(x)
        except ValueError:
            return x

    # Deduplicate the two near-identical row loops: pick the expression and
    # whether its result should be negated.
    if self.args.filter_expr:
        expr, negate = self.args.filter_expr, False
    elif self.args.not_filter_expr:
        expr, negate = self.args.not_filter_expr, True
    else:
        return

    for row in rows:
        namespace = {name: float_or_else(value) for name, value in zip(column_names, row)}

        # SECURITY: eval() executes arbitrary code from the command line.
        # Acceptable for a local CLI tool; never expose to untrusted input.
        keep = bool(eval(expr, namespace))

        if keep != negate:
            out_row = [row[c] if c < len(row) else None for c in column_ids]
            output.writerow(out_row)
def main(self):
    """Select columns from the input CSV and write them to the output CSV."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            # Bug fix: out_row may contain None for short rows, which made
            # ''.join(out_row) raise TypeError. any() treats None and ''
            # alike, preserving the "all cells empty" test.
            if not any(out_row):
                continue

        output.writerow(out_row)
def main(self):
    """Apply a user-supplied Python expression to the selected columns of every row, passing the rest through."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    all_column_ids = parse_column_identifiers(None, column_names, self.args.zero_based, self.args.not_columns)
    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in all_column_ids])

    # Namespace dict that receives the compiled mapping function.
    d = {}

    # Bug fix: `exec "..." in d` is Python 2 statement syntax and a
    # SyntaxError on Python 3; use the exec() function form.
    # SECURITY: this executes arbitrary code from the command line —
    # acceptable for a local CLI tool, never for untrusted input.
    exec("def f(x): return %s" % (self.args.map_expr,), d)
    map_func = d['f']

    mapped = set(column_ids)  # O(1) membership test inside the row loop

    for row in rows:
        out_row = []

        for c in all_column_ids:
            if c >= len(row):
                out_row.append(None)
            elif c in mapped:
                out_row.append(map_func(row[c]))
            else:
                out_row.append(row[c])

        output.writerow(out_row)
def main(self):
    """Render the input CSV as a fixed-width, pipe-delimited text table."""
    rows = CSVKitReader(self.args.file, **self.reader_kwargs)

    # Make a default header row if none exists
    if self.args.no_header_row:
        # Bug fix: rows.next() is Python 2 only; use the next() builtin.
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_names = list(column_names)

    # prepend 'line_number' column with line numbers if --linenumbers option
    if self.args.line_numbers:
        column_names.insert(0, 'line_number')
        rows = [list(itertools.chain([str(i + 1)], row)) for i, row in enumerate(rows)]

    # Materialize all rows and put the header on top so the width
    # computation below covers it too.
    rows = list(rows)
    rows.insert(0, column_names)

    widths = []

    for row in rows:
        for i, v in enumerate(row):
            try:
                if len(v) > widths[i]:
                    widths[i] = len(v)
            except IndexError:
                widths.append(len(v))

    # Dashes span each width with '+' character at intersection of
    # horizontal and vertical dividers.
    divider = '|--' + '-+-'.join('-' * w for w in widths) + '--|'

    self.output_file.write('%s\n' % divider)

    for i, row in enumerate(rows):
        output = []

        for j, d in enumerate(row):
            if d is None:
                d = ''
            # Bug fix: unicode() does not exist on Python 3; use the
            # portable six.text_type alias instead.
            output.append(' %s ' % six.text_type(d).ljust(widths[j]))

        self.output_file.write(('| %s |\n' % ('|'.join(output))).encode('utf-8'))

        if (i == 0 or i == len(rows) - 1):
            self.output_file.write('%s\n' % divider)
def main(self):
    """Stack two or more CSV files vertically, optionally prefixing a grouping column."""
    self.input_files = [self._open_input_file(path) for path in self.args.input_paths]

    if len(self.input_files) < 2:
        self.argparser.error('You must specify at least two files to stack.')

    groups = None

    if self.args.group_by_filenames:
        groups = [os.path.split(f.name)[1] for f in self.input_files]
    elif self.args.groups:
        groups = self.args.groups.split(',')

        if len(groups) != len(self.input_files):
            self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')

    group_name = self.args.group_name if self.args.group_name else 'group'
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    for index, stream in enumerate(self.input_files):
        rows = CSVKitReader(stream, **self.reader_kwargs)
        pending = []

        if self.args.no_header_row:
            # No header: generate column names from the first row, which
            # itself is data and must still be written out.
            first = next(rows, [])
            headers = make_default_headers(len(first))
            pending.append(first)
        else:
            # Use the file's own header row.
            headers = next(rows, [])

        # Only the first file contributes the output header.
        if index == 0:
            if groups:
                headers.insert(0, group_name)
            output.writerow(headers)

        for row in itertools.chain(pending, rows):
            if groups:
                row.insert(0, groups[index])
            output.writerow(row)

        stream.close()
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = six.StringIO(contents) rows = CSVKitReader(f, **kwargs) if no_header_row: # Peek at a row to infer column names from row = next(rows) headers = make_default_headers(len(row)) column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] data_columns = [[] for c in headers] # Put row back on top rows = itertools.chain([row], rows) else: headers = next(rows) if column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] for i, row in enumerate(rows): for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name)
def main(self):
    """Render the input CSV as a fixed-width, pipe-delimited text table."""
    reader = CSVKitReader(self.input_file, **self.reader_kwargs)

    # Make a default header row if none exists.
    if self.args.no_header_row:
        first = next(reader)
        column_names = list(make_default_headers(len(first)))
        # The peeked row is data; put it back on top.
        reader = itertools.chain([first], reader)
    else:
        column_names = list(next(reader))

    # Prepend a 'line_number' column when --linenumbers is given.
    if self.args.line_numbers:
        column_names.insert(0, 'line_number')
        reader = [list(itertools.chain([str(n + 1)], r)) for n, r in enumerate(reader)]

    # Materialize everything with the header on top so the width
    # computation covers it too.
    table = [column_names] + list(reader)

    # Each column's width is the longest cell seen at that position.
    widths = []

    for r in table:
        for idx, cell in enumerate(r):
            if idx < len(widths):
                widths[idx] = max(widths[idx], len(cell))
            else:
                widths.append(len(cell))

    # Dashes span each width with '+' character at intersection of
    # horizontal and vertical dividers.
    divider = '|--' + '-+-'.join('-' * w for w in widths) + '--|'

    def emit(text):
        self.output_file.write(text.encode('utf-8'))

    emit('%s\n' % divider)
    last = len(table) - 1

    for n, r in enumerate(table):
        cells = []

        for idx, cell in enumerate(r):
            if cell is None:
                cell = ''
            cells.append(' %s ' % six.text_type(cell).ljust(widths[idx]))

        emit('| %s |\n' % ('|'.join(cells)))

        if (n == 0 or n == last):
            emit('%s\n' % divider)
def main(self):
    """Write only the rows whose selected columns match the requested pattern."""
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    rows = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        first = next(rows)
        column_names = make_default_headers(len(first))
        # The peeked row is data; put it back on top.
        rows = itertools.chain([first], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    # Decide what "matches" means: a compiled regex, membership in the
    # match file's line set, or a plain substring pattern.
    if self.args.regex:
        pattern = re.compile(self.args.regex)
    elif self.args.matchfile:
        lines = set(line.rstrip() for line in self.args.matchfile)

        def pattern(x):
            return x in lines
    else:
        pattern = self.args.pattern

    patterns = {c: pattern for c in column_ids}

    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for row in filter_reader:
        output.writerow(row)
def main(self):
    """Filter rows: keep those whose selected columns match the pattern (or the inverse)."""
    if self.args.names_only:
        self.print_column_names()
        return

    if not self.args.columns:
        self.argparser.error('You must specify at least one column to search using the -c option.')

    no_criteria = (
        self.args.regex is None
        and self.args.pattern is None
        and self.args.matchfile is None
    )

    if no_criteria:
        self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

    rows = agate.reader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        peeked = next(rows)
        column_names = make_default_headers(len(peeked))
        # The peeked row is data, not a header; restore it.
        rows = itertools.chain([peeked], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

    # Build the matcher: compiled regex, match-file membership, or substring.
    if self.args.regex:
        matcher = re.compile(self.args.regex)
    elif self.args.matchfile:
        wanted = set(line.rstrip() for line in self.args.matchfile)
        matcher = lambda value: value in wanted
    else:
        matcher = self.args.pattern

    patterns = dict((cid, matcher) for cid in column_ids)

    output = agate.writer(self.output_file, **self.writer_kwargs)
    output.writerow(column_names)

    filtered = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

    for row in filtered:
        output.writerow(row)
def main(self):
    """Emit only the first row seen for each distinct value of the uniqueness column(s)."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)
    uniq_column_id = parse_column_identifiers(self.args.uniq_column, column_names, self.args.zero_based)

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    seen = set()  # cache of uniqueness keys already emitted

    for row in rows:
        # Fix: the key tuple was computed twice per row (membership test
        # and insert); compute it once and use set.add().
        key = tuple(row[i] for i in uniq_column_id)

        if key in seen:
            continue

        seen.add(key)
        out_row = [row[c] if c < len(row) else None for c in column_ids]
        output.writerow(out_row)
def main(self):
    """Emit one SQL INSERT statement per non-empty input row."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
    columns_type = self.parse_column_types(self.args.columns_types, column_ids)

    # Bug fix: map() returns a one-shot iterator on Python 3; it was being
    # re-joined inside the row loop and came up empty after the first row.
    # Materialize the selected names once, and hoist the join out of the loop.
    column_names = [column_names[c] for c in column_ids]
    column_list = ",".join(column_names)

    output = self.output_file

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        # Bug fix: out_row may contain None for short rows, which made
        # ''.join(out_row) raise TypeError. any() preserves the
        # "all cells empty" skip.
        if not any(out_row):
            continue

        # SECURITY: statements are built by string concatenation; the
        # per-type formatters in columns_type are responsible for escaping.
        # Do not run the output against a live database with untrusted CSV
        # input without proper parameterization.
        values = ",".join(columns_type[colid](row[colid]) for colid in column_ids)
        insert_stat = "INSERT INTO " + self.args.tablename + "(" + column_list + ") VALUES (" + values + ");"
        output.write('%s\n' % insert_stat)
def main(self):
    """Keep (or drop, with --not-filter) rows for which a user-supplied Python expression is true."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    # write header
    output.writerow([column_names[c] for c in column_ids])

    def float_or_else(x):
        """Coerce a cell to float when possible so numeric comparisons work."""
        try:
            return float(x)
        except ValueError:
            return x

    # Deduplicate the two near-identical row loops: select the expression
    # and whether its result should be negated.
    if self.args.filter_expr:
        expr, negate = self.args.filter_expr, False
    elif self.args.not_filter_expr:
        expr, negate = self.args.not_filter_expr, True
    else:
        return

    for row in rows:
        namespace = {name: float_or_else(value) for name, value in zip(column_names, row)}

        # SECURITY: eval() executes arbitrary code from the command line.
        # Acceptable for a local CLI tool; never expose to untrusted input.
        keep = bool(eval(expr, namespace))

        if keep != negate:
            out_row = [row[c] if c < len(row) else None for c in column_ids]
            output.writerow(out_row)
def main(self):
    """Write the selected columns of each row as an LDIF entry keyed by the --uid column."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
    dn_att_id = parse_column_identifiers(self.args.uid, column_names, self.args.zero_based, self.args.not_columns)

    # Bug fix: values must be paired with the names of the SELECTED
    # columns; zipping against the full column_names list misaligned
    # attribute names whenever a subset of columns was chosen.
    selected_names = [column_names[c] for c in column_ids]

    output = LDIFWriter(self.output_file)

    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in column_ids]

        if self.args.delete_empty:
            # Bug fix: out_row may contain None for short rows, which made
            # ''.join(out_row) raise TypeError. any() preserves the
            # "all cells empty" skip.
            if not any(out_row):
                continue

        # Pair each attribute name with a single-item value list, dropping
        # missing/empty values. Uses `is not None` rather than `!= None`.
        attributes = [
            (name, [value])
            for name, value in zip(selected_names, out_row)
            if value is not None and value != ''
        ]

        dn = self.args.uid + "=" + out_row[dn_att_id[0]] + "," + self.args.basedn
        output.unparse(dn, attributes)
def main(self):
    """Copy selected columns to output, trimming leading and trailing whitespace from every cell."""
    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    output.writerow([column_names[c] for c in column_ids])

    # Bug fix: the patterns were not raw strings ('\s' is an invalid escape
    # sequence in modern Python). Also compile once, outside the row loop.
    leading_ws = re.compile(r'^\s+')
    trailing_ws = re.compile(r'\s+$')

    def drop_white(value):
        # Strip leading then trailing whitespace, as the original chained subs did.
        return trailing_ws.sub('', leading_ws.sub('', value))

    for row in rows:
        out_row = [drop_white(row[c]) if c < len(row) else None for c in column_ids]
        output.writerow(out_row)
def main(self):
    """Rename columns: write the data from --sources under the names given by --renames."""
    if self.args.names_only:
        self.print_column_names()
        return

    rows = CSVKitReader(self.input_file, **self.reader_kwargs)

    # Make headers.
    if self.args.no_header_row:
        row = next(rows)
        column_names = make_default_headers(len(row))
        # Put the consumed row back on top so it is processed as data.
        rows = itertools.chain([row], rows)
    else:
        column_names = next(rows)

    # (Removed an unused mid-function `import sys` from the original.)

    # Project column names.
    target_names = self.args.renames.split(',')
    source_column_ids = parse_column_identifiers(self.args.sources, column_names, zero_based=self.args.zero_based)

    # Bug fix: the original used `assert X and "message"`, which never
    # displays the message and vanishes entirely under `python -O`.
    # Report a proper argument error instead.
    if len(target_names) != len(source_column_ids):
        self.argparser.error('Input sources and rename columns must be the same length!')

    output = CSVKitWriter(self.output_file, **self.writer_kwargs)

    # Print header from target names.
    output.writerow(target_names)

    # Rewrite rows.
    for row in rows:
        out_row = [row[c] if c < len(row) else None for c in source_column_ids]
        output.writerow(out_row)
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = six.StringIO(contents) rows = agate.reader(f, **kwargs) try: if no_header_row: # Peek at a row to infer column names from, and put it back on top row = next(rows) rows = itertools.chain([row], rows) headers = make_default_headers(len(row)) else: headers = next(rows) except StopIteration: # The file is `/dev/null`. headers = [] pass if no_header_row or column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] width = len(data_columns) for i, row in enumerate(rows): j = 0 for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break j += 1 # Populate remaining columns with None while j < width: data_columns[j].append(None) j += 1 columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name)