コード例 #1
0
ファイル: csvgroup.py プロジェクト: dcreado/csvkit
    def main(self):
        """Group rows by the requested columns and write aggregated output.

        Reads the header, resolves the grouping columns and the requested
        (function, columns) aggregations, then streams grouped rows out.
        """
        if self.args.names_only:
            self.print_column_names()
            return

        # Read in header and rows
        reader = CSVKitReader(self.input_file, **self.reader_kwargs)
        column_names = next(reader)

        # Determine columns to group by; default to no grouping columns.
        if self.args.columns is None:
            grouped_columns_ids = []
        else:
            grouped_columns_ids = parse_column_identifiers(self.args.columns,
                                                           column_names,
                                                           self.args.zero_based)

        # Build one aggregation per (function, column) pair.  The previous
        # implementation used Python 2-only tuple unpacking inside a lambda
        # (``lambda (f, cols): ...``), which is a SyntaxError on Python 3;
        # an explicit loop is equivalent and portable.
        aggregations = []
        try:
            for fun, cols in self.args.aggregations:
                col_ids = parse_column_identifiers(cols, column_names,
                                                   self.args.zero_based)
                for col in col_ids:
                    aggregations.append(aggregate_functions[fun](col))
        except KeyError:
            # Unknown aggregator name was looked up in aggregate_functions.
            self.argparser.error("Wrong aggregator function. Available: " + ', '.join(aggregate_functions.keys()))

        # Write the output
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        for row in group_rows(column_names, reader, grouped_columns_ids,
                              aggregations):
            output.writerow(row)
コード例 #2
0
    def main(self):
        """Apply ``--map-expr`` to the selected columns and write all columns.

        Columns named by ``self.args.columns`` are passed through the compiled
        mapping function; all other columns are copied unchanged.
        """
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        all_column_ids = parse_column_identifiers(None, column_names, self.args.zero_based, self.args.not_columns)
        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in all_column_ids])

        # Namespace dict for the compiled mapping function.  The exec()
        # *function* form works on both Python 2 and 3; the original
        # ``exec "..." in d`` statement is Python 2-only syntax.
        # SECURITY NOTE: map_expr is executed verbatim -- never pass an
        # untrusted expression to this tool.
        d = {}
        exec("def f(x): return %s" % (self.args.map_expr), d)

        mapped = set(column_ids)  # O(1) membership tests inside the row loop

        for row in rows:
            out_row = []
            for c in all_column_ids:
                if c >= len(row):
                    # Short row: pad with None (and never run f on padding).
                    out_row.append(None)
                elif c in mapped:
                    out_row.append(d['f'](row[c]))
                else:
                    out_row.append(row[c])
            output.writerow(out_row)
コード例 #3
0
    def main(self):
        """Emit only the first row seen for each distinct key tuple (csvuniq)."""
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        if self.args.no_header_row:
            row = next(rows)
            column_names = make_default_headers(len(row))
            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(None, column_names,
                                              self.args.zero_based)
        uniq_column_id = parse_column_identifiers(self.args.uniq_column,
                                                  column_names,
                                                  self.args.zero_based)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow([column_names[c] for c in column_ids])

        seen = set()  # cache of key tuples already written
        for row in rows:
            # Tuples are hashable, so they can serve as set members.  Build
            # the key once per row -- the old code computed it twice.
            key = tuple(row[i] for i in uniq_column_id)
            if key in seen:
                continue
            seen.add(key)  # add() is clearer than update([...]) for one item
            out_row = [row[c] if c < len(row) else None for c in column_ids]
            output.writerow(out_row)
コード例 #4
0
ファイル: test_cli.py プロジェクト: GMADIGITAL/csvkit
    def test_range_notation_open_ended(self):
        """Open-ended ranges (':3', '4:') expand against the header length."""
        self.assertEqual([0, 1, 2], parse_column_identifiers(':3', self.headers))

        # list() so insert() and assertEqual work on Python 3, where range()
        # is a lazy sequence rather than a list.
        target = list(range(3, len(self.headers)))  # protect against devs adding to self.headers
        target.insert(0, 0)
        self.assertEqual(target, parse_column_identifiers('1,4:', self.headers))

        self.assertEqual(list(range(0, len(self.headers))), parse_column_identifiers('1:', self.headers))
コード例 #5
0
 def test_parse_column_identifiers(self):
     """Mixed name/number identifiers resolve to the expected indices."""
     spec = 'i_work_here,1,name'
     # Default offset: numeric ids are 1-based.
     self.assertEqual([2, 0, 1],
                      parse_column_identifiers(spec, self.headers))
     # column_offset=0: numeric ids are taken as 0-based.
     self.assertEqual([2, 1, 1],
                      parse_column_identifiers(spec, self.headers,
                                               column_offset=0))
コード例 #6
0
ファイル: test_cli.py プロジェクト: ruqaiya/csvkit
 def test_parse_column_identifiers(self):
     """Identifier specs tolerate surrounding whitespace around each item."""
     spec = ' i_work_here, 1,name  '
     # Default: numeric ids are 1-based.
     self.assertEqual([2, 0, 1],
                      parse_column_identifiers(spec, self.headers))
     # zero_based=True: numeric ids are taken as 0-based.
     self.assertEqual([2, 1, 1],
                      parse_column_identifiers(spec, self.headers,
                                               zero_based=True))
コード例 #7
0
ファイル: table.py プロジェクト: NickolasLapp/csvkit
    def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table.

        :param f: file-like object of CSV text; may be non-seekable (stdin).
        :param name: name given to the resulting Table.
        :param snifflimit: bytes handed to the dialect sniffer; None sniffs
            the whole buffer, 0 disables sniffing entirely.
        :param column_ids: optional identifier spec selecting a column subset.
        :param blanks_as_nulls: forwarded to each Column constructor.
        :param zero_based: whether numeric identifiers are 0-based.
        :param infer_types: forwarded to each Column constructor.
        :param no_header_row: synthesize default headers instead of consuming
            the first row as the header.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # snifflimit == 0 means do not sniff
        if snifflimit is None:
            kwargs['dialect'] = sniffer.sniff_dialect(contents)
        elif snifflimit > 0:
            kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit])

        # Re-wrap the buffered contents so the reader gets a seekable stream.
        f = six.StringIO(contents)
        rows = CSVKitReader(f, **kwargs)

        if no_header_row:
            # Peek at a row to infer column names from
            row = next(rows) 

            headers = make_default_headers(len(row))
            column_ids = parse_column_identifiers(column_ids, headers, zero_based)
            headers = [headers[c] for c in column_ids]
            data_columns = [[] for c in headers]

            # Put row back on top
            rows = itertools.chain([row], rows)
        else:
            headers = next(rows)
            
            if column_ids:
                column_ids = parse_column_identifiers(column_ids, headers, zero_based)
                headers = [headers[c] for c in column_ids]
            else:
                column_ids = range(len(headers))
        
            data_columns = [[] for c in headers]

        for i, row in enumerate(rows):
            for j, d in enumerate(row):
                try:
                    # Column j's values come from the *original* row position
                    # recorded in column_ids[j].
                    data_columns[j].append(row[column_ids[j]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types))

        return Table(columns, name=name)
コード例 #8
0
ファイル: test_cli.py プロジェクト: xuanhan863/csvkit
 def test_range_notation_open_ended(self):
     """Open-ended range specs expand to the remaining header positions."""
     self.assertEqual([0, 1, 2],
                      parse_column_identifiers(':3', self.headers))
     # list() so insert() and assertEqual work on Python 3, where range()
     # is a lazy sequence rather than a list.
     target = list(range(3,
                         len(self.headers) -
                         1))  # protect against devs adding to self.headers
     target.insert(0, 0)
     self.assertEqual(target,
                      parse_column_identifiers('1,4:', self.headers))
コード例 #9
0
ファイル: test_cli.py プロジェクト: xuanhan863/csvkit
 def test_range_notation(self):
     """Closed ranges and mixed specs map onto 0-based column indices."""
     cases = [
         ('1:3', [0, 1, 2]),
         ('2-4', [1, 2, 3]),
         ('1,2:4', [0, 1, 2, 3]),
         ('more-header-values,3,stuff', [4, 2, 5]),
     ]
     for spec, expected in cases:
         self.assertEqual(expected,
                          parse_column_identifiers(spec, self.headers))
コード例 #10
0
    def main(self):
        """csvcut: write only the selected columns, in the requested order."""
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for row in rows:
            # Pad short rows with None so every output row is rectangular.
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            if self.args.delete_empty:
                # ''.join(out_row) raised TypeError whenever a short row was
                # padded with None; any() treats None and '' alike as empty.
                if not any(out_row):
                    continue

            output.writerow(out_row)
コード例 #11
0
ファイル: csvsort.py プロジェクト: xuanhan863/csvkit
    def main(self):
        """csvsort: read the whole table, sort its rows, write them back out."""
        if self.args.names_only:
            print_column_names(self.args.file, self.output_file,
                               **self.reader_kwargs)
            return

        # Derive a table name from the input filename unless reading stdin.
        if self.args.file.name == '<stdin>':
            table_name = 'csvsql_table'
        else:
            base = os.path.split(self.args.file.name)[1]
            table_name = os.path.splitext(base)[0]

        tab = table.Table.from_csv(self.args.file,
                                   name=table_name,
                                   snifflimit=self.args.snifflimit,
                                   **self.reader_kwargs)
        column_ids = parse_column_identifiers(self.args.columns, tab.headers())

        rows = tab.to_rows(serialize_dates=True)
        rows.sort(key=lambda record: [record[c] for c in column_ids],
                  reverse=self.args.reverse)
        rows.insert(0, tab.headers())

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        for row in rows:
            output.writerow(row)
コード例 #12
0
ファイル: csvsort.py プロジェクト: backgroundcheck/csvkit
    def main(self):
        """csvsort: load the table, sort rows None-safely, write the result."""
        if self.args.names_only:
            self.print_column_names()
            return

        # Name the table after the input file unless reading from stdin.
        if self.input_file.name == '<stdin>':
            table_name = 'csvsql_table'
        else:
            base = os.path.split(self.input_file.name)[1]
            table_name = os.path.splitext(base)[0]

        tab = table.Table.from_csv(
            self.input_file,
            name=table_name,
            snifflimit=self.args.snifflimit,
            no_header_row=self.args.no_header_row,
            infer_types=(not self.args.no_inference),
            **self.reader_kwargs
        )

        column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based)

        rows = tab.to_rows(serialize_dates=True)

        def sort_key(record):
            # (is-not-None, value) pairs keep None values grouped first and
            # comparable against real values.
            return [(record[c] is not None, record[c]) for c in column_ids]

        rows.sort(key=sort_key, reverse=self.args.reverse)
        rows.insert(0, tab.headers())

        output = agate.writer(self.output_file, **self.writer_kwargs)
        for row in rows:
            output.writerow(row)
コード例 #13
0
ファイル: csvgrep.py プロジェクト: mattdudys/csvkit
    def main(self):
        """csvgrep: write rows whose selected columns match the pattern."""
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.regex and not self.args.pattern and not self.args.matchfile:
            self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        # next() builtin instead of .next(): works on Python 2 and 3.
        column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

        # Matcher precedence: compiled regex, then match-file membership,
        # then literal pattern.
        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern

        patterns = dict((c, pattern) for c in column_ids)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        # The old loop enumerate()d but never used the index.
        for row in filter_reader:
            output.writerow(row)
コード例 #14
0
    def main(self):
        """csvgrep: emit rows whose chosen columns match the search pattern."""
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.columns:
            self.argparser.error('You must specify at least one column to search using the -c option.')

        if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
            self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

        # Matcher precedence: compiled regex > match-file membership > literal.
        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            wanted = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda value: value in wanted
        else:
            pattern = self.args.pattern

        # Same matcher applied to every selected column.
        patterns = {}
        for cid in column_ids:
            patterns[cid] = pattern

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)
        for row in filter_reader:
            output.writerow(row)
コード例 #15
0
ファイル: csvsort.py プロジェクト: bucweat/csvkit
    def main(self):
        """csvsort: sort rows, substituting '' for None during comparison."""
        if self.args.names_only:
            self.print_column_names()
            return

        # Use the input filename as the table name unless reading stdin.
        if self.input_file.name == '<stdin>':
            table_name = 'csvsql_table'
        else:
            base = os.path.split(self.input_file.name)[1]
            table_name = os.path.splitext(base)[0]

        tab = table.Table.from_csv(self.input_file,
                                   name=table_name,
                                   snifflimit=self.args.snifflimit,
                                   no_header_row=self.args.no_header_row,
                                   infer_types=(not self.args.no_inference),
                                   **self.reader_kwargs)

        column_ids = parse_column_identifiers(self.args.columns, tab.headers(),
                                              self.args.zero_based)

        rows = tab.to_rows(serialize_dates=True)

        def sort_key(record):
            # Substitute '' for None so rows with missing values still compare.
            return [record[c] if record[c] is not None else ''
                    for c in column_ids]

        rows.sort(key=sort_key, reverse=self.args.reverse)
        rows.insert(0, tab.headers())

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        for row in rows:
            output.writerow(row)
コード例 #16
0
ファイル: csvgrep.py プロジェクト: backgroundcheck/csvkit
    def main(self):
        """csvgrep (agate I/O): write matching rows to the output."""
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.columns:
            self.argparser.error('You must specify at least one column to search using the -c option.')

        if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
            self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

        rows = agate.reader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

        # Matcher precedence: regex, then match file, then literal pattern.
        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            wanted = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda value: value in wanted
        else:
            pattern = self.args.pattern

        # Same matcher applied to every selected column.
        patterns = {}
        for cid in column_ids:
            patterns[cid] = pattern

        output = agate.writer(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)
        for row in filter_reader:
            output.writerow(row)
コード例 #17
0
ファイル: csvgrep.py プロジェクト: higs4281/csvkit
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.regex and not self.args.pattern and not self.args.matchfile:
            self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)
        
        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = [line.rstrip() for line in self.args.matchfile]
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern
            
        patterns = dict((c, pattern) for c in column_ids)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for i, row in enumerate(filter_reader):
            output.writerow(row)
コード例 #18
0
ファイル: csvcut.py プロジェクト: JonCHodgson/csvkit
    def main(self):
        """csvcut (agate I/O): write only the selected columns."""
        if self.args.names_only:
            self.print_column_names()
            return

        rows = agate.reader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
        output = agate.writer(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for row in rows:
            # Pad short rows with None so every output row is rectangular.
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            if self.args.delete_empty:
                # ''.join(out_row) raised TypeError whenever a short row was
                # padded with None; any() treats None and '' alike as empty.
                if not any(out_row):
                    continue

            output.writerow(out_row)
コード例 #19
0
ファイル: csvfilter.py プロジェクト: unpingco/csvkit
    def main(self):
        """csvfilter: keep rows where the given Python expression holds.

        ``--filter-expr`` keeps matching rows; ``--not-filter-expr`` keeps
        non-matching rows.  The two original branches shared their entire
        loop body except the negation, so they are folded into one loop.
        """
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        if self.args.no_header_row:
            row = next(rows)
            column_names = make_default_headers(len(row))
            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        # write header
        output.writerow([column_names[c] for c in column_ids])

        def float_or_else(x):
            # Coerce to float for numeric comparisons; fall back to the string.
            try:
                return float(x)
            except ValueError:
                return x

        # --filter-expr takes precedence over --not-filter-expr, matching the
        # original if/elif ordering.  With neither flag, nothing is emitted.
        if self.args.filter_expr:
            expr, negate = self.args.filter_expr, False
        elif self.args.not_filter_expr:
            expr, negate = self.args.not_filter_expr, True
        else:
            return

        # SECURITY NOTE: the expression is eval()'d verbatim with column
        # names bound as variables -- never pass untrusted input here.
        for row in rows:
            namespace = {name: float_or_else(value)
                         for name, value in zip(column_names, row)}
            keep = bool(eval(expr, namespace))
            if keep != negate:
                out_row = [row[c] if c < len(row) else None for c in column_ids]
                output.writerow(out_row)
コード例 #20
0
    def main(self):
        """Strip leading/trailing whitespace from every selected cell."""
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        # Raw strings: '\s' in a plain literal is an invalid string escape
        # (DeprecationWarning on modern Pythons).  Compile once instead of
        # re-parsing the pattern for every cell.
        leading = re.compile(r'^\s+')
        trailing = re.compile(r'\s+$')

        def drop_white(value):
            # Remove leading then trailing whitespace, as the original did.
            return trailing.sub('', leading.sub('', value))

        for row in rows:
            out_row = [
                drop_white(row[c]) if c < len(row) else None
                for c in column_ids
            ]
            output.writerow(out_row)
コード例 #21
0
ファイル: test_cli.py プロジェクト: Mistobaan/csvkit
 def test_range_notation(self):
     """Range specs honor the zero_based flag."""
     cases = [
         ('1:3', [0, 1, 2], [1, 2, 3]),
         ('2-4', [1, 2, 3], [2, 3, 4]),
         ('1,2:4', [0, 1, 2, 3], [1, 2, 3, 4]),
         ('more-header-values,3,stuff', [4, 2, 5], [4, 3, 5]),
     ]
     for spec, one_based, zero_based in cases:
         self.assertEqual(one_based,
                          parse_column_identifiers(spec, self.headers))
         self.assertEqual(zero_based,
                          parse_column_identifiers(spec, self.headers,
                                                   zero_based=True))
コード例 #22
0
ファイル: table.py プロジェクト: higs4281/csvkit
    def from_csv(cls,
                 f,
                 name='from_csv_table',
                 snifflimit=None,
                 column_ids=None,
                 blanks_as_nulls=True,
                 zero_based=False,
                 **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table.

        :param f: file-like object of CSV text; may be non-seekable (stdin).
        :param name: name given to the resulting Table.
        :param snifflimit: bytes handed to the dialect sniffer; falsy values
            sniff the whole buffer.
        :param column_ids: optional identifier spec selecting a column subset.
        :param blanks_as_nulls: forwarded to each Column constructor.
        :param zero_based: whether numeric identifiers are 0-based.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # Sniff only a prefix when a limit was given.
        if snifflimit:
            sample = contents[:snifflimit]
        else:
            sample = contents

        dialect = sniffer.sniff_dialect(sample)

        # Re-wrap the buffered contents so the reader gets a seekable stream.
        f = StringIO(contents)
        reader = CSVKitReader(f, dialect=dialect, **kwargs)

        headers = reader.next()

        if column_ids:
            column_ids = parse_column_identifiers(column_ids, headers,
                                                  zero_based)
            headers = [headers[c] for c in column_ids]
        else:
            # No selection: keep every column, in order.
            column_ids = range(len(headers))

        data_columns = [[] for c in headers]

        for row in reader:
            for i, d in enumerate(row):
                try:
                    # Column i's values come from the *original* row position
                    # recorded in column_ids[i].
                    data_columns[i].append(row[column_ids[i]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(
                Column(column_ids[i],
                       headers[i],
                       c,
                       blanks_as_nulls=blanks_as_nulls))

        return Table(columns, name=name)
コード例 #23
0
    def main(self):
        """Sort the table with agate and stream the result to the output."""
        if self.args.names_only:
            self.print_column_names()
            return

        table = agate.Table.from_csv(
            self.input_file,
            sniff_limit=self.args.sniff_limit,
            header=not self.args.no_header_row,
            column_types=self.get_column_types(),
            **self.reader_kwargs
        )
        sort_ids = parse_column_identifiers(
            self.args.columns,
            table.column_names,
            column_offset=self.get_column_offset()
        )
        table = table.order_by(sort_ids, reverse=self.args.reverse)
        table.to_csv(self.output_file, **self.writer_kwargs)
コード例 #24
0
ファイル: cli.py プロジェクト: pombredanne/csvsed
 def main(self):
   """csvsed: rewrite the selected columns through the sed expression."""
   reader = CSVKitReader(self.args.file, **self.reader_kwargs)
   cnames = reader.next()
   cids = parse_column_identifiers(self.args.columns, cnames, self.args.zero_based)
   # Apply the same expression to every selected column index.
   mods = dict((idx, self.args.expr) for idx in cids)
   output = CSVKitWriter(self.output_file, **self.writer_kwargs)
   filtered = sed.CsvFilter(reader, mods, header=False)
   output.writerow(cnames)
   for record in filtered:
     output.writerow(record)
コード例 #25
0
 def main(self):
     """csvsed: run each selected column through the substitution expression."""
     reader = CSVKitReader(self.args.file, **self.reader_kwargs)
     cnames = reader.next()
     cids = parse_column_identifiers(self.args.columns, cnames,
                                     self.args.zero_based)
     # One modification entry per selected column, all sharing the expression.
     mods = dict((idx, self.args.expr) for idx in cids)
     output = CSVKitWriter(self.output_file, **self.writer_kwargs)
     filtered = sed.CsvFilter(reader, mods, header=False)
     output.writerow(cnames)
     for record in filtered:
         output.writerow(record)
コード例 #26
0
ファイル: csvuniq.py プロジェクト: unpingco/csvkit
    def main(self):
        """csvuniq: keep only the first occurrence of each key tuple."""
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        if self.args.no_header_row:
            row = next(rows)
            column_names = make_default_headers(len(row))
            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)
        uniq_column_id = parse_column_identifiers(self.args.uniq_column, column_names, self.args.zero_based)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow([column_names[c] for c in column_ids])

        seen = set()  # cache of key tuples already emitted
        for row in rows:
            # Tuples are hashable, so they work as set members.  Build the
            # key once per row -- the old code computed it twice.
            key = tuple(row[i] for i in uniq_column_id)
            if key in seen:
                continue
            seen.add(key)  # add() is clearer than update([...]) for one item
            out_row = [row[c] if c < len(row) else None for c in column_ids]
            output.writerow(out_row)
コード例 #27
0
ファイル: csv2ldif.py プロジェクト: dcreado/csvkit
    def main(self):
        """Convert the selected CSV columns into LDIF entries."""
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)

        # Column whose value is used to build each entry's DN.
        dn_att_id = parse_column_identifiers(self.args.uid, column_names,
                                             self.args.zero_based,
                                             self.args.not_columns)

        output = LDIFWriter(self.output_file)

        for row in rows:
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            if self.args.delete_empty:
                # ''.join() raised TypeError when a short row was padded with
                # None; any() treats None and '' alike as empty.
                if not any(out_row):
                    continue

            # Pair each column name with a one-element value list, dropping
            # attributes whose value is missing or empty.  ``is not None``
            # replaces the unidiomatic ``!= None`` comparisons.
            zipped_row = zip(column_names, [[v] for v in out_row])
            zipped_row = [pair for pair in zipped_row
                          if pair[1] is not None
                          and pair[1][0] is not None
                          and pair[1][0] != '']

            dn = self.args.uid + "=" + out_row[dn_att_id[0]] + "," + self.args.basedn
            output.unparse(dn, zipped_row)
コード例 #28
0
ファイル: csvsort.py プロジェクト: sukic/csvkit
    def main(self):
        """csvsort (agate): None-safe ordering over the requested columns."""
        if self.args.names_only:
            self.print_column_names()
            return

        # limit=0 disables agate's type inference entirely.
        column_types = agate.TypeTester(limit=0) if self.args.no_inference else None

        table = agate.Table.from_csv(self.input_file, sniff_limit=self.args.sniff_limit, header=not self.args.no_header_row, column_types=column_types, **self.reader_kwargs)
        column_ids = parse_column_identifiers(self.args.columns, table.column_names, self.args.zero_based)

        def sort_key(row):
            # (is-not-None, value) keeps None cells grouped and comparable.
            return [(row[cid] is not None, row[cid]) for cid in column_ids]

        table = table.order_by(sort_key, reverse=self.args.reverse)
        table.to_csv(self.output_file, **self.writer_kwargs)
コード例 #29
0
ファイル: csvsort.py プロジェクト: dannguyen/csvkit
    def main(self):
        """csvsort: buffer stdin (sorting needs the whole input), then sort."""
        if self.args.names_only:
            self.print_column_names()
            return

        # Otherwise, fails with "io.UnsupportedOperation: underlying stream is not seekable".
        if self.input_file == sys.stdin:
            # We can't sort without reading the entire input.
            self.input_file = six.StringIO(sys.stdin.read())

        table = agate.Table.from_csv(
            self.input_file,
            sniff_limit=self.args.sniff_limit,
            header=not self.args.no_header_row,
            column_types=self.get_column_types(),
            **self.reader_kwargs
        )
        sort_ids = parse_column_identifiers(
            self.args.columns,
            table.column_names,
            column_offset=self.get_column_offset()
        )
        table = table.order_by(sort_ids, reverse=self.args.reverse)
        table.to_csv(self.output_file, **self.writer_kwargs)
コード例 #30
0
ファイル: table.py プロジェクト: SpazioDati/csvkit
    def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, type_inference=True, **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table.

        :param f: file-like object of CSV text; may be non-seekable (stdin).
        :param name: name given to the resulting Table.
        :param snifflimit: bytes handed to the dialect sniffer; falsy values
            sniff the whole buffer.
        :param column_ids: optional identifier spec selecting a column subset.
        :param blanks_as_nulls: forwarded to each Column constructor.
        :param zero_based: whether numeric identifiers are 0-based.
        :param type_inference: forwarded to each Column constructor.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # Sniff only a prefix when a limit was given.
        if snifflimit:
            sample = contents[:snifflimit]
        else:
            sample = contents

        dialect = sniffer.sniff_dialect(sample)

        # Column fallback type; defaults to InvalidType when not supplied.
        normal_type = kwargs.pop("normal_type", InvalidType)

        # Re-wrap the buffered contents so the reader gets a seekable stream.
        f = StringIO(contents)
        reader = CSVKitReader(f, dialect=dialect, **kwargs)

        headers = reader.next()
        
        if column_ids:
            column_ids = parse_column_identifiers(column_ids, headers, zero_based)
            headers = [headers[c] for c in column_ids]
        else:
            # No selection: keep every column, in order.
            column_ids = range(len(headers))
        
        data_columns = [[] for c in headers]

        for row in reader:
            for i, d in enumerate(row):
                try:
                    # Column i's values come from the *original* row position
                    # recorded in column_ids[i].
                    data_columns[i].append(row[column_ids[i]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, type_inference=type_inference, normal_type=normal_type))

        return Table(columns, name=name)
コード例 #31
0
ファイル: csvcut.py プロジェクト: Rethought/csvkit
    def main(self):
        """
        Copy the selected columns of the input CSV to the output.

        With --names-only, print the column names and exit. With
        --delete-empty, drop rows whose selected cells are all empty.
        """
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for row in rows:
            # Pad short (non-rectangular) rows with None.
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            # BUG FIX: the original ''.join(out_row) raised TypeError when
            # out_row contained the None padding above; any() treats None
            # and '' alike (non-empty strings are truthy), so behavior for
            # well-formed rows is unchanged.
            if self.args.delete_empty and not any(out_row):
                continue

            output.writerow(out_row)
コード例 #32
0
ファイル: csvsort.py プロジェクト: admariner/csvkit
    def main(self):
        """Sort the input CSV by the requested columns and write the result."""
        if self.args.names_only:
            self.print_column_names()
            return

        if self.additional_input_expected():
            self.argparser.error(
                'You must provide an input file or piped data.')

        # Load everything into an agate table so values sort by inferred type.
        source = agate.Table.from_csv(self.input_file,
                                      skip_lines=self.args.skip_lines,
                                      sniff_limit=self.args.sniff_limit,
                                      column_types=self.get_column_types(),
                                      **self.reader_kwargs)

        key_columns = parse_column_identifiers(self.args.columns,
                                               source.column_names,
                                               self.get_column_offset())

        ordered = source.order_by(key_columns, reverse=self.args.reverse)
        ordered.to_csv(self.output_file, **self.writer_kwargs)
コード例 #33
0
ファイル: csv2sql.py プロジェクト: dcreado/csvkit
    def main(self):
        """
        Emit one SQL INSERT statement per non-empty input row.

        Cell values are rendered by the per-column formatters returned from
        parse_column_types(); rows whose selected cells are all empty are
        skipped.
        """
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based,
                                              self.args.not_columns)
        columns_type = self.parse_column_types(self.args.columns_types, column_ids)
        # BUG FIX: under Python 3 map() returns a one-shot iterator, so the
        # original ",".join(column_names) was empty from the second row on.
        # A list can be joined once per row.
        column_names = [column_names[c] for c in column_ids]

        output = self.output_file

        # The statement prefix is loop-invariant; build it once.
        insert_prefix = "INSERT INTO " + self.args.tablename + "(" + ",".join(column_names) + ") VALUES ("

        for row in rows:
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            # BUG FIX: ''.join(out_row) raised TypeError on the None padding
            # above; any() treats None and '' alike.
            if not any(out_row):
                continue

            values = ",".join(columns_type[colid](row[colid]) for colid in column_ids)
            output.write('%s\n' % (insert_prefix + values + ");"))
コード例 #34
0
    def main(self):
        """
        Filter rows by evaluating a Python expression against each row.

        --filter-expr keeps rows where the expression is truthy;
        --not-filter-expr keeps rows where it is falsy. Each column name is
        bound to float(cell) when the cell parses as a number, otherwise the
        raw string.
        """
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        if self.args.no_header_row:
            row = next(rows)
            column_names = make_default_headers(len(row))
            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(None, column_names,
                                              self.args.zero_based)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        # write header
        output.writerow([column_names[c] for c in column_ids])

        def float_or_else(x):
            # Numeric cells should compare numerically in the expression.
            try:
                return float(x)
            except ValueError:
                return x

        # The original duplicated the whole loop for the two options; they
        # differ only in whether a truthy or falsy expression keeps the row.
        # --filter-expr wins when both are given (matching the old if/elif).
        if self.args.filter_expr:
            expr, keep_truthy = self.args.filter_expr, True
        elif self.args.not_filter_expr:
            expr, keep_truthy = self.args.not_filter_expr, False
        else:
            return

        for row in rows:
            d = {i: float_or_else(j) for i, j in zip(column_names, row)}
            # SECURITY: eval() runs arbitrary user-supplied code. That is the
            # point of this CLI flag, but never feed it untrusted expressions.
            if bool(eval(expr, d)) == keep_truthy:
                out_row = [
                    row[c] if c < len(row) else None for c in column_ids
                ]
                output.writerow(out_row)
コード例 #35
0
ファイル: csvcut.py プロジェクト: ole-tange/csvkit
    def main(self):
        """
        Copy the selected columns of the input CSV to the output.

        With --names-only, print the column names and exit. With
        --delete-empty, drop rows whose selected cells are all empty.
        """
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for row in rows:
            # Pad short (non-rectangular) rows with None.
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            # BUG FIX: ''.join(out_row) raised TypeError when out_row held
            # the None padding above; any() treats None and '' alike, so
            # well-formed rows behave exactly as before.
            if self.args.delete_empty and not any(out_row):
                continue

            output.writerow(out_row)
コード例 #36
0
ファイル: csvtrim.py プロジェクト: unpingco/csvkit
    def main(self):
        """
        Strip leading and trailing whitespace from every selected cell and
        write the trimmed rows to the output CSV.
        """
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        # str.strip() is equivalent to the original pair of re.sub() calls
        # ('^\\s+' / '\\s+$') and avoids the non-raw regex escape warnings.
        for row in rows:
            out_row = [row[c].strip() if c < len(row) else None for c in column_ids]
            output.writerow(out_row)
コード例 #37
0
ファイル: csvsort.py プロジェクト: Rethought/csvkit
    def main(self):
        """Sort the rows of the input CSV by the selected columns."""
        if self.args.names_only:
            self.print_column_names()
            return

        # Derive the table name from the input filename unless reading stdin.
        if self.args.file.name != '<stdin>':
            table_name = os.path.splitext(os.path.split(self.args.file.name)[1])[0]
        else:
            table_name = 'csvsql_table'

        tab = table.Table.from_csv(self.args.file, name=table_name, snifflimit=self.args.snifflimit, **self.reader_kwargs)
        column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based)

        data_rows = tab.to_rows(serialize_dates=True)
        data_rows.sort(key=lambda data_row: [data_row[c] for c in column_ids],
                       reverse=self.args.reverse)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        # Header first, then the sorted data rows.
        output.writerow(tab.headers())
        for data_row in data_rows:
            output.writerow(data_row)
コード例 #38
0
ファイル: csvrename.py プロジェクト: elcritch/csvkit
    def main(self):
        """
        Rename the selected source columns and copy the rows back out.

        --sources selects the columns to keep (in order) and --renames gives
        the comma-separated replacement header names, one per source column.

        Raises AssertionError when the two lists differ in length.
        """
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        # Make Headers
        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        # Project Column Names
        target_names = self.args.renames.split(',')
        source_column_ids = parse_column_identifiers(self.args.sources, column_names, zero_based=self.args.zero_based)

        # BUG FIX: the original `assert cond and "msg"` folded the message
        # into the condition, so a failure never displayed it; the message
        # belongs in assert's second argument. (Also dropped the stray,
        # unused `import sys`.)
        assert len(target_names) == len(source_column_ids), \
            "Input sources and rename columns must be the same length!"

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        # print header from target
        output.writerow(target_names)

        # Rewrite Rows, padding short (non-rectangular) rows with None.
        for row in rows:
            out_row = [row[c] if c < len(row) else None for c in source_column_ids]

            output.writerow(out_row)
コード例 #39
0
ファイル: csvsort.py プロジェクト: datamade/csvkit
    def main(self):
        """Read the input CSV, order it by the chosen columns, and write it out."""
        if self.args.names_only:
            self.print_column_names()
            return

        if self.additional_input_expected():
            self.argparser.error('You must provide an input file or piped data.')

        data = agate.Table.from_csv(
            self.input_file,
            skip_lines=self.args.skip_lines,
            sniff_limit=self.args.sniff_limit,
            column_types=self.get_column_types(),
            **self.reader_kwargs
        )

        sort_columns = parse_column_identifiers(
            self.args.columns,
            data.column_names,
            self.get_column_offset()
        )

        data.order_by(sort_columns, reverse=self.args.reverse) \
            .to_csv(self.output_file, **self.writer_kwargs)
コード例 #40
0
ファイル: test_cli.py プロジェクト: Mistobaan/csvkit
 def test_parse_column_identifiers(self):
     """Padded name/number identifiers resolve to column indices."""
     spec = ' i_work_here, 1,name  '
     self.assertEqual([2, 0, 1], parse_column_identifiers(spec, self.headers))
     self.assertEqual([2, 1, 1], parse_column_identifiers(spec, self.headers, zero_based=True))
コード例 #41
0
ファイル: test_cli.py プロジェクト: datamade/csvkit
 def test_parse_column_identifiers(self):
     """A mix of name and number identifiers resolves to column indices."""
     columns = 'i_work_here,1,name'
     self.assertEqual([2, 0, 1], parse_column_identifiers(columns, self.headers))
     self.assertEqual([2, 1, 1], parse_column_identifiers(columns, self.headers, column_offset=0))
コード例 #42
0
ファイル: table.py プロジェクト: gregorysimoes/csvkit
    def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs):
        """
        Create a new Table from a file-like object containing CSV data.

        :param f: file-like object (may be non-seekable, e.g. stdin).
        :param name: name assigned to the resulting Table.
        :param snifflimit: ``None`` sniffs the dialect from the whole
            contents, ``0`` disables sniffing, any other positive value
            sniffs a prefix of that many characters.
        :param column_ids: optional column identifiers; only matching columns
            are parsed and type-inferred, but each keeps its original
            order/index (e.g. column 8 is still "order" 7 even if it ends up
            the third column of the resulting Table).
        :param blanks_as_nulls: forwarded to each Column.
        :param zero_based: whether ``column_ids`` are zero-based.
        :param infer_types: forwarded to each Column.
        :param no_header_row: when True, synthesize default header names.
        :param kwargs: forwarded to the CSV reader.
        """
        # "Files" from stdin are not seekable, so buffer the whole contents
        # in memory and re-wrap them below.
        contents = f.read()

        # snifflimit == 0 means do not sniff
        if snifflimit is None:
            kwargs['dialect'] = sniffer.sniff_dialect(contents)
        elif snifflimit > 0:
            kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit])

        f = six.StringIO(contents)
        rows = agate.reader(f, **kwargs)

        try:
            if no_header_row:
                # Peek at a row to infer column names from, and put it back on top
                row = next(rows)
                rows = itertools.chain([row], rows)
                headers = make_default_headers(len(row))
            else:
                headers = next(rows)
        except StopIteration:
            # The file is `/dev/null`.
            headers = []

        if no_header_row or column_ids:
            column_ids = parse_column_identifiers(column_ids, headers, zero_based)
            headers = [headers[c] for c in column_ids]
        else:
            column_ids = range(len(headers))

        data_columns = [[] for _ in headers]
        width = len(data_columns)

        for row in rows:
            # BUG FIX: the original loop broke out on the first missing cell
            # and then padded starting at the *next* column, so a short
            # (ragged) row left the failing column one element shorter than
            # its siblings, and a fully empty row never padded column 0 at
            # all. Appending None for every missing cell keeps all columns
            # the same length.
            for j in range(width):
                try:
                    data_columns[j].append(row[column_ids[j]].strip())
                except IndexError:
                    # Non-rectangular data: pad the missing cell with None.
                    data_columns[j].append(None)

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types))

        return Table(columns, name=name)
コード例 #43
0
ファイル: csvstat.py プロジェクト: thevarunfactor/csvkit
    def main(self):
        """
        Compute and print statistics for the selected columns.

        Exactly one of three output modes runs: a single operation
        (--mean, --median, ...), a bare row count (--count), or the full
        per-column stats report (optionally as CSV with --csv).
        """
        if self.args.names_only:
            self.print_column_names()
            return

        # Operations requested via their individual --<op>-only flags.
        operations = [op for op in OPERATIONS.keys() if getattr(self.args, op + '_only')]

        if len(operations) > 1:
            self.argparser.error('Only one operation argument may be specified (--mean, --median, etc).')

        if operations and self.args.csv_output:
            self.argparser.error('You may not specify --csv and an operation (--mean, --median, etc) at the same time.')

        if operations and self.args.count_only:
            self.argparser.error('You may not specify --count and an operation (--mean, --median, etc) at the same time.')

        if six.PY2:
            # Python 2 file objects need an explicit UTF-8 encoding wrapper.
            self.output_file = codecs.getwriter('utf-8')(self.output_file)

        if self.args.count_only:
            # Fast path: count raw CSV rows without building an agate table.
            count = len(list(agate.csv.reader(self.input_file)))

            if not self.args.no_header_row:
                count -= 1

            self.output_file.write('Row count: %i\n' % count)

            return

        table = agate.Table.from_csv(
            self.input_file,
            skip_lines=self.args.skip_lines,
            sniff_limit=self.args.sniff_limit,
            **self.reader_kwargs
        )

        column_ids = parse_column_identifiers(
            self.args.columns,
            table.column_names,
            self.get_column_offset()
        )

        kwargs = {}

        if self.args.freq_count:
            kwargs['freq_count'] = self.args.freq_count

        # Output a single stat
        if operations:
            if len(column_ids) == 1:
                # A single selected column prints the bare value, unlabeled.
                self.print_one(table, column_ids[0], operations[0], label=False, **kwargs)
            else:
                for column_id in column_ids:
                    self.print_one(table, column_id, operations[0], **kwargs)
        else:
            stats = {}

            for column_id in column_ids:
                stats[column_id] = self.calculate_stats(table, column_id, **kwargs)

            # Output as CSV
            if self.args.csv_output:
                self.print_csv(table, column_ids, stats)
            # Output all stats
            else:
                self.print_stats(table, column_ids, stats)