Example #1
    def main(self):
        if len(self.args.files) < 2:
            self.argparser.error('You must specify at least two files to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.args.files] 
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.args.files):
                self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
        else:
            groups = None
                
        group_name = self.args.group_name if self.args.group_name else 'group'

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.args.files):
            rows = CSVKitReader(f, **self.reader_kwargs)
            headers = rows.next()

            if i == 0:
                if groups:
                    headers.insert(0, group_name)
                
                output.writerow(headers)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)
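
This stacking pattern (emit the first file's header once, prefixed with the group column, then append every file's rows) is easy to reproduce without csvkit. A minimal stdlib sketch, assuming plain comma-delimited files and grouping by filename; the function name and signature are illustrative, not csvkit's API:

import csv
import os
import sys

def stack_csvs(paths, out=sys.stdout, group_name='group'):
    writer = csv.writer(out)

    for i, path in enumerate(paths):
        with open(path) as f:
            reader = csv.reader(f)
            headers = next(reader)

            # Emit the header once, prefixed with the grouping column
            if i == 0:
                writer.writerow([group_name] + headers)

            for row in reader:
                writer.writerow([os.path.split(path)[1]] + row)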
Example #2
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = rows.next()

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for i, row in enumerate(rows):
            out_row = [row[c] if c < len(row) else None for c in column_ids] 

            if self.args.delete_empty:
                if ''.join(out_row) == '':
                    continue
            
            output.writerow(out_row)
Example #3
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        #Read in header and rows
        reader = CSVKitReader(self.input_file, **self.reader_kwargs)
        column_names = reader.next()
        # Determine columns to group by; default to all columns
        if self.args.columns is None:
            grouped_columns_ids = []
        else:
            grouped_columns_ids = parse_column_identifiers(self.args.columns,
                                                           column_names,
                                                           self.args.zero_based)
        aggregations = []
        try:
            for (fun, cols) in map(lambda (f, cols): (
            f, parse_column_identifiers(cols, column_names, self.args.zero_based)),
                                   self.args.aggregations):
                for col in cols:
                    aggregations.append(aggregate_functions[fun](col))
        except KeyError:
            self.argparser.error("Wrong aggregator function. Available: " + ', '.join(aggregate_functions.keys()))

        # Write the output
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        for row in group_rows(column_names, reader, grouped_columns_ids,
                              aggregations):
            output.writerow(row)
Example #4
    def from_csv(cls, f, name='from_csv_table', **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        sample = contents
        dialect = sniffer.sniff_dialect(sample)

        f = StringIO(contents)
        reader = CSVKitReader(f, dialect=dialect, **kwargs)

        headers = reader.next()

        data_columns = [[] for c in headers]

        for row in reader:
            for i, d in enumerate(row):
                try:
                    data_columns[i].append(d.strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(i, headers[i], c))

        return Table(columns, name=name)
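
The buffering comment is the key detail: reading everything up front lets a non-seekable stream (stdin) be consumed twice, once by the sniffer and once by the parser. A Python 3 stdlib sketch of that buffer-then-rewind trick in isolation:

import csv
import io

def reader_from_stream(stream):
    # Buffer the whole stream so a non-seekable source can be read twice:
    # once by the sniffer, once by the actual parser.
    contents = stream.read()
    dialect = csv.Sniffer().sniff(contents)
    return csv.reader(io.StringIO(contents), dialect=dialect)

reader = reader_from_stream(io.StringIO(u'a;b;c\n1;2;3\n'))
print(next(reader))  # ['a', 'b', 'c'] -- the sniffer detected ';'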
Example #5
    def main(self):
        if len(self.args.files) < 2:
            sys.exit('You must specify at least two files to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.args.files]
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.args.files):
                sys.exit(
                    'The number of grouping values must be equal to the number of CSV files being stacked.'
                )
        else:
            groups = None

        group_name = self.args.group_name if self.args.group_name else 'group'

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.args.files):
            rows = CSVKitReader(f, **self.reader_kwargs)
            headers = rows.next()

            if i == 0:
                if groups:
                    headers.insert(0, group_name)

                output.writerow(headers)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)
Example #6
    def from_csv(cls, f, name='from_csv_table', **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        sample = contents
        dialect = sniffer.sniff_dialect(sample, **kwargs)

        f = StringIO(contents) 
        reader = CSVKitReader(f, dialect=dialect, **kwargs)

        headers = reader.next()

        data_columns = [[] for c in headers] 

        for row in reader:
            for i, d in enumerate(row):
                try:
                    data_columns[i].append(d.strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns): 
            columns.append(Column(i, headers[i], c))

        return Table(columns, name=name)
Example #7
    def main(self):
        rows = CSVKitReader(self.args.file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = rows.next()

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = rows.next()

        column_names = self.args.columns.split(',')

        part_count = 0
        output = CSVKitWriter(open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w'), **self.writer_kwargs)
        output.writerow(column_names)

        count = 0
        for row in rows:
            if (self.args.lines > 0) and (count == self.args.lines):
                part_count += 1
                count = 0
                # couldn't find a better way to close the file
                del output
                output = CSVKitWriter(open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w'), **self.writer_kwargs)
                output.writerow(column_names)

            output.writerow(row)
            count += 1
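
The rotation relies on `del output` to release the previous part file. A sketch of the same logic with explicit close() calls, assuming a plain file path in place of csvkit's lazy file arguments (`_lazy_args` above):

import csv

def split_csv(path, lines_per_part):
    with open(path) as f:
        reader = csv.reader(f)
        headers = next(reader)

        part = 0
        count = 0
        out = open('%s.part.%d' % (path, part), 'w')
        writer = csv.writer(out)
        writer.writerow(headers)

        for row in reader:
            # Rotate to a fresh part file every lines_per_part rows
            if lines_per_part > 0 and count == lines_per_part:
                out.close()
                part += 1
                count = 0
                out = open('%s.part.%d' % (path, part), 'w')
                writer = csv.writer(out)
                writer.writerow(headers)
            writer.writerow(row)
            count += 1

        out.close()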
Example #8
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.regex and not self.args.pattern and not self.args.matchfile:
            self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)
        
        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern
            
        patterns = dict((c, pattern) for c in column_ids)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for i, row in enumerate(filter_reader):
            output.writerow(row)
Example #9
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.regex and not self.args.pattern and not self.args.matchfile:
            self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)
        
        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = [line.rstrip() for line in self.args.matchfile]
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern
            
        patterns = dict((c, pattern) for c in column_ids)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for i, row in enumerate(filter_reader):
            output.writerow(row)
Example #10
    def test_string_match(self):
        args = [
            '-c', '1', '-m', 'ILLINOIS',
            'examples/realdata/FY09_EDU_Recipients_by_State.csv'
        ]
        output_file = StringIO.StringIO()
        utility = CSVGrep(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), [
            'State Name', 'State Abbreviate', 'Code',
            'Montgomery GI Bill-Active Duty',
            'Montgomery GI Bill- Selective Reserve',
            'Dependents\' Educational Assistance',
            'Reserve Educational Assistance Program',
            'Post-Vietnam Era Veteran\'s Educational Assistance Program',
            'TOTAL', ''
        ])
        self.assertEqual(reader.next(), [
            'ILLINOIS', 'IL', '17', '15,659', '2,491', '2,025', '1,770', '19',
            '21,964', ''
        ])
Example #11
    def main(self):
        """
        Convert CSV to JSON. 
        """
        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        stream = codecs.getwriter('utf-8')(self.output_file)

        if self.args.key:
            output = {}

            for row in rows:
                row_dict = dict(zip(column_names, row))
                k = row_dict[self.args.key]

                if k in output:
                    raise NonUniqueKeyColumnException(
                        'Value %s is not unique in the key column.' %
                        unicode(k))

                output[k] = row_dict
        else:
            output = [dict(zip(column_names, row)) for row in rows]

        json.dump(output,
                  stream,
                  ensure_ascii=False,
                  indent=self.args.indent,
                  encoding='utf-8')
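
The keyed/unkeyed branching reduces to: build one dict per row, then either collect them in a list or index them by the key column, failing on duplicates. A stdlib sketch (note json.dump's `encoding` keyword is Python 2 only; `key_column` below is a hypothetical stand-in for self.args.key):

import csv
import json

def csv_to_json(in_path, out_path, key_column=None, indent=None):
    with open(in_path) as f:
        reader = csv.reader(f)
        column_names = next(reader)
        records = [dict(zip(column_names, row)) for row in reader]

    if key_column is not None:
        keyed = {}
        for record in records:
            k = record[key_column]
            if k in keyed:
                raise ValueError('Value %s is not unique in the key column.' % k)
            keyed[k] = record
        records = keyed

    with open(out_path, 'w') as f:
        json.dump(records, f, ensure_ascii=False, indent=indent)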
Example #12
    def from_csv(cls,
                 f,
                 name='from_csv_table',
                 snifflimit=None,
                 column_ids=None,
                 blanks_as_nulls=True,
                 zero_based=False,
                 **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table.)
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        if snifflimit:
            sample = contents[:snifflimit]
        else:
            sample = contents

        dialect = sniffer.sniff_dialect(sample)

        f = StringIO(contents)
        reader = CSVKitReader(f, dialect=dialect, **kwargs)

        headers = reader.next()

        if column_ids:
            column_ids = parse_column_identifiers(column_ids, headers,
                                                  zero_based)
            headers = [headers[c] for c in column_ids]
        else:
            column_ids = range(len(headers))

        data_columns = [[] for c in headers]

        for row in reader:
            for i, d in enumerate(row):
                try:
                    data_columns[i].append(row[column_ids[i]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(
                Column(column_ids[i],
                       headers[i],
                       c,
                       blanks_as_nulls=blanks_as_nulls))

        return Table(columns, name=name)
Example #13
    def load(self):
        '''
            Loads the cleaned up csv files into the database
            Checks record count against csv line count
        '''
        ## get a list of tables in the database
        c = connection.cursor()
        c.execute('SHOW TABLES')
        table_list = [t[0] for t in c.fetchall()]

        ### build a dictionary of tables and the paths to the csvs for loading
        table_dict = {}
        for name in os.listdir(self.csv_dir):

            csv_path = os.path.join(
                self.csv_dir,
                name
            )

            for table in table_list:
                if table == name.replace('.csv', '').upper():
                    table_dict[name] = {'table_name': table, 'csv_path': csv_path}

        ## load up the data
        for csv_name, query_dict in table_dict.items():
            #print 'working on %s' % csv_name
            table_name = query_dict['table_name']
            csv_path = query_dict['csv_path']

            c.execute('DELETE FROM %s' % table_name)
            #print 'deleted records from %s' % table_name

            bulk_sql_load_part_1 = '''
                LOAD DATA LOCAL INFILE '%s'
                INTO TABLE %s
                FIELDS TERMINATED BY ','
                OPTIONALLY ENCLOSED BY '"'
                IGNORE 1 LINES
                (
            ''' % (csv_path, table_name)
            infile = open(csv_path)
            csv_reader = CSVKitReader(infile)
            headers = csv_reader.next()

            infile.close()
            infile = open(csv_path)
            csv_record_cnt = len(infile.readlines()) - 1
            infile.close()

            sql_fields = ['`%s`' % h for h in headers]
            bulk_sql_load = bulk_sql_load_part_1 + ','.join(sql_fields) + ')'
            cnt = c.execute(bulk_sql_load)
            transaction.commit_unless_managed()

            # check load, make sure record count matches
            if cnt == csv_record_cnt:
                print "record counts match\t\t\t\t%s" % csv_name
            else:
                print 'table_cnt: %s\tcsv_lines: %s\t\t%s' % (cnt, csv_record_cnt, csv_name)
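
One caveat in the record-count check: `len(infile.readlines()) - 1` over-counts whenever a quoted field contains an embedded newline, because it counts physical lines rather than CSV records. Counting parsed rows is a safer cross-check; a minimal sketch:

import csv

def csv_record_count(path):
    # Count parsed CSV records, not raw lines, so embedded newlines
    # inside quoted fields do not inflate the total.
    with open(path) as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        return sum(1 for _ in reader)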
Example #14
    def __init__(self, schema):
        self.fields = [] # A list of FixedWidthFields

        schema_reader = CSVKitReader(schema)
        schema_decoder = SchemaDecoder(schema_reader.next())

        for row in schema_reader:
            self.fields.append(schema_decoder(row))
Example #15
def infer_types(f, sample_size=100):
    reader = CSVKitReader(f)
    headers = reader.next()

    sample = islice(reader, sample_size)
    normal_types, normal_values = normalize_table(sample)

    return zip(headers, [t.__name__ for t in normal_types])
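
A toy version of the same inference, assuming a simple int-then-float-then-text fallback in place of csvkit's normalize_table:

import csv
from itertools import islice

def simple_type(values):
    # Try progressively looser types; fall back to text.
    for target in (int, float):
        try:
            for v in values:
                target(v)
            return target.__name__
        except ValueError:
            continue
    return 'text'

def infer_types_simple(f, sample_size=100):
    reader = csv.reader(f)
    headers = next(reader)

    sample = list(islice(reader, sample_size))
    columns = zip(*sample)  # transpose rows into columns

    return [(h, simple_type(col)) for h, col in zip(headers, columns)]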
Example #16
    def main(self):
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        # Make a default header row if none exists
        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_names = list(column_names)

        # Prepend a 'line_number' column if the --linenumbers option is set
        if self.args.line_numbers:
            column_names.insert(0, 'line_number')
            rows = [list(itertools.chain([str(i + 1)], row)) for i, row in enumerate(rows)]


        # Convert to normal list of rows
        rows = list(rows)

        # Insert the column names at the top
        rows.insert(0, column_names)

        widths = []

        for row in rows:
            for i, v in enumerate(row):
                try:
                    if len(v) > widths[i]:
                        widths[i] = len(v)
                except IndexError:
                    widths.append(len(v))

        # Dashes span each width with '+' character at intersection of
        # horizontal and vertical dividers.
        divider = '|--' + '-+-'.join('-' * w for w in widths) + '--|'

        write = lambda t: self.output_file.write(t.encode('utf-8'))

        write('%s\n' % divider)

        for i, row in enumerate(rows):
            output = []

            for j, d in enumerate(row):
                if d is None:
                    d = ''
                output.append(' %s ' % six.text_type(d).ljust(widths[j]))

            write('| %s |\n' % ('|'.join(output)))

            if (i == 0 or i == len(rows) - 1):
                write('%s\n' % divider)
Example #17
    def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table.)
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # snifflimit == 0 means do not sniff
        if snifflimit is None:
            kwargs['dialect'] = sniffer.sniff_dialect(contents)
        elif snifflimit > 0:
            kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit])

        f = StringIO(contents)
        rows = CSVKitReader(f, **kwargs)

        if no_header_row:
            # Peek at a row to infer column names from
            row = next(rows) 

            headers = make_default_headers(len(row))
            column_ids = parse_column_identifiers(column_ids, headers, zero_based)
            headers = [headers[c] for c in column_ids]
            data_columns = [[] for c in headers]

            # Put row back on top
            rows = itertools.chain([row], rows)
        else:
            headers = rows.next()
            
            if column_ids:
                column_ids = parse_column_identifiers(column_ids, headers, zero_based)
                headers = [headers[c] for c in column_ids]
            else:
                column_ids = range(len(headers))
        
            data_columns = [[] for c in headers]

        for i, row in enumerate(rows):
            for j, d in enumerate(row):
                try:
                    data_columns[j].append(row[column_ids[j]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types))

        return Table(columns, name=name)
Example #18
    def main(self):
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        # Make a default header row if none exists
        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_names = list(column_names)

        # Prepend a 'line_number' column if the --linenumbers option is set
        if self.args.line_numbers:
            column_names.insert(0, 'line_number')
            rows = [
                list(itertools.chain([str(i + 1)], row))
                for i, row in enumerate(rows)
            ]

        # Convert to normal list of rows
        rows = list(rows)

        # Insert the column names at the top
        rows.insert(0, column_names)

        widths = []

        for row in rows:
            for i, v in enumerate(row):
                try:
                    if len(v) > widths[i]:
                        widths[i] = len(v)
                except IndexError:
                    widths.append(len(v))

        # Dashes span each width with '+' character at intersection of
        # horizontal and vertical dividers.
        divider = '|--' + '-+-'.join('-' * w for w in widths) + '--|'

        self.output_file.write('%s\n' % divider)

        for i, row in enumerate(rows):
            output = []

            for j, d in enumerate(row):
                if d is None:
                    d = ''
                output.append(' %s ' % six.text_type(d).ljust(widths[j]))

            self.output_file.write('| %s |\n' % ('|'.join(output)))

            if (i == 0 or i == len(rows) - 1):
                self.output_file.write('%s\n' % divider)
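
The widths loop grows the list by catching IndexError the first time each column index appears. The same computation can be written declaratively; a small sketch, assuming rows is already a list of lists of strings:

def column_widths(rows):
    # Width of each column is the longest value seen in it.
    num_columns = max(len(row) for row in rows)
    return [
        max(len(row[i]) for row in rows if i < len(row))
        for i in range(num_columns)
    ]

rows = [['a', 'bb', 'c'], ['111', '2', '33']]
print(column_widths(rows))  # [3, 2, 2]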
Example #19
def print_column_names(f, output, **reader_kwargs):
    """
    Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
    """
    rows = CSVKitReader(f, **reader_kwargs)
    column_names = rows.next()

    for i, c in enumerate(column_names):
        output.write('%3i: %s\n' % (i + 1, c))
Example #20
def extract_column_names(path, dialect_parameters, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            headers = reader.next()
        except UnicodeDecodeError:
            raise DataSamplingError(_('This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.') % encoding)

        return headers
Example #21
    def main(self):
        reader = CSVKitReader(self.args.file, **self.reader_kwargs)
        cnames = reader.next()
        cids = parse_column_identifiers(self.args.columns, cnames, self.args.zero_based)
        mods = {idx: self.args.expr for idx in cids}
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        reader = sed.CsvFilter(reader, mods, header=False)
        output.writerow(cnames)
        for row in reader:
            output.writerow(row)
Example #22
    def main(self):
        reader = CSVKitReader(self.args.file, **self.reader_kwargs)
        cnames = reader.next()
        cids = parse_column_identifiers(self.args.columns, cnames,
                                        self.args.zero_based)
        mods = {idx: self.args.expr for idx in cids}
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        reader = sed.CsvFilter(reader, mods, header=False)
        output.writerow(cnames)
        for row in reader:
            output.writerow(row)
Example #23
    def __init__(self, schema):
        self.fields = [] # A list of FixedWidthFields

        schema_reader = CSVKitReader(schema)
        schema_decoder = SchemaDecoder(schema_reader.next())

        for i, row in enumerate(schema_reader):
            try:
                self.fields.append(schema_decoder(row))
            except Exception, e:
                raise ValueError("Error reading schema at line %i: %s" % (i + 2, e))
Example #24
    def test_no_match(self):
        args = ['-c', '1', '-m', 'NO MATCH', 'examples/dummy.csv']
        output_file = StringIO.StringIO()
        utility = CSVGrep(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a', 'b', 'c'])
Example #26
    def test_simple(self):
        args = ['-c', '1,3', 'examples/dummy.csv']
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a', 'c'])
        self.assertEqual(reader.next(), ['1', '3'])
Example #27
    def test_no_header_row(self):
        args = ["-c", "2", "--no-header-row", "examples/no_header_row.csv"]
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ["column2"])
        self.assertEqual(reader.next(), ["2"])
Example #28
    def test_with_bzip2(self):
        args = ["-c", "1,3", "examples/dummy.csv.bz2"]
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ["a", "c"])
        self.assertEqual(reader.next(), ["1", "3"])
Example #29
    def test_include_and_exclude(self):
        args = ["-c", "1,3", "-C", "3", "examples/dummy.csv"]
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ["a"])
        self.assertEqual(reader.next(), ["1"])
Example #30
def extract_column_names(path, dialect_parameters, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            headers = reader.next()
        except UnicodeDecodeError:
            raise DataSamplingError(
                'This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.'
                % encoding)

        return headers
Example #31
    def test_include_and_exclude(self):
        args = ['-c', '1,3', '-C', '3', 'examples/dummy.csv']
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a'])
        self.assertEqual(reader.next(), ['1'])
Example #32
    def test_string_match(self):
        args = ['-c', '1', '-m', 'ILLINOIS', 'examples/realdata/FY09_EDU_Recipients_by_State.csv']
        output_file = StringIO.StringIO()
        utility = CSVGrep(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['State Name', 'State Abbreviate', 'Code', 'Montgomery GI Bill-Active Duty', 'Montgomery GI Bill- Selective Reserve', 'Dependents\' Educational Assistance', 'Reserve Educational Assistance Program', 'Post-Vietnam Era Veteran\'s Educational Assistance Program', 'TOTAL', ''])
        self.assertEqual(reader.next(), ['ILLINOIS', 'IL', '17', '15,659', '2,491', '2,025', '1,770', '19', '21,964', ''])
Example #33
    def test_no_header_row(self):
        args = ['-c', '2', '--no-header-row', 'examples/no_header_row.csv']
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['column2'])
        self.assertEqual(reader.next(), ['2'])
Example #34
    def test_with_bzip2(self):
        args = ['-c', '1,3', 'examples/dummy.csv.bz2']
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a', 'c'])
        self.assertEqual(reader.next(), ['1', '3'])
Example #35
def sample_data(f, sample_size=5):
    reader = CSVKitReader(f)
    headers = reader.next()
        
    samples = []

    for i, row in enumerate(islice(reader, sample_size), start=1):
        samples.append({
            'row': i, 
            'data': row,
        })

    return samples 
Example #36
def infer_schema(f, sample_size=100):
    reader = CSVKitReader(f)
    headers = reader.next()

    sample = islice(reader, sample_size)
    normal_types, normal_values = normalize_table(sample)
    type_names = [t.__name__ for t in normal_types]

    return [{
        'column': h,
        'simple_type': t,
        'meta_type': None,
        'indexed': False
    } for h, t in zip(headers, type_names)]
Example #37
def sample_data(path, dialect_parameters, sample_size, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            reader.next() # skip headers
            samples = []

            for row in islice(reader, sample_size):
                samples.append(row)
        except UnicodeDecodeError:
            raise DataSamplingError(_('This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.') % (encoding))

        return samples
Example #38
    def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, type_inference=True, **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table.)
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        if snifflimit:
            sample = contents[:snifflimit]
        else:
            sample = contents

        dialect = sniffer.sniff_dialect(sample)

        normal_type = kwargs.pop("normal_type", InvalidType)

        f = StringIO(contents)
        reader = CSVKitReader(f, dialect=dialect, **kwargs)

        headers = reader.next()
        
        if column_ids:
            column_ids = parse_column_identifiers(column_ids, headers, zero_based)
            headers = [headers[c] for c in column_ids]
        else:
            column_ids = range(len(headers))
        
        data_columns = [[] for c in headers]

        for row in reader:
            for i, d in enumerate(row):
                try:
                    data_columns[i].append(row[column_ids[i]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, type_inference=type_inference, normal_type=normal_type))

        return Table(columns, name=name)
Example #39
    def test_no_header_row(self):
        # stack two CSV files
        args = ['--no-header-row', 'examples/no_header_row.csv', 'examples/no_header_row2.csv']
        output_file = StringIO.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next()[0], 'column1')
        self.assertEqual(reader.next()[0], '1')
        self.assertEqual(reader.next()[0], '4')
Example #40
    def test_no_grouping(self):
        # stack two CSV files
        args = ['examples/dummy.csv', 'examples/dummy2.csv']
        output_file = StringIO.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a', 'b', 'c'])
        self.assertEqual(reader.next()[0], '1')
        self.assertEqual(reader.next()[0], '1')
Example #41
    def test_explicit_grouping(self):
        # stack two CSV files
        args = ['--groups', 'asd,sdf', '-n', 'foo', 'examples/dummy.csv', 'examples/dummy2.csv']
        output_file = StringIO.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['foo', 'a', 'b', 'c'])
        self.assertEqual(reader.next()[0], 'asd')
        self.assertEqual(reader.next()[0], 'sdf')
Example #42
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.columns:
            self.argparser.error('You must specify at least one column to search using the -c option.')

        if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
            self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern

        patterns = dict((c, pattern) for c in column_ids)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for row in filter_reader:
            output.writerow(row)
Example #43
    def main(self):
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        if self.args.no_header_row:
            row = next(rows)
            column_names = make_default_headers(len(row))
            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(None, column_names,
                                              self.args.zero_based)
        uniq_column_id = parse_column_identifiers(self.args.uniq_column,
                                                  column_names,
                                                  self.args.zero_based)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow([column_names[c] for c in column_ids])
        d = set()  # cache for used-rows
        # use tuple as keys for cache
        cache_key = lambda row: tuple([row[i] for i in uniq_column_id])
        for row in rows:
            if cache_key(row) in d: continue
            d.update([cache_key(row)])
            out_row = [row[c] if c < len(row) else None for c in column_ids]
            output.writerow(out_row)
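
The de-duplication logic reduces to: build a tuple key from the chosen columns and keep only the first row seen per key. A stdlib sketch, with `key_indices` standing in for uniq_column_id above:

import csv

def dedupe(in_path, out_path, key_indices):
    seen = set()
    with open(in_path) as src, open(out_path, 'w') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        writer.writerow(next(reader))  # copy the header row through
        for row in reader:
            key = tuple(row[i] for i in key_indices)
            if key in seen:
                continue
            seen.add(key)
            writer.writerow(row)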
Example #44
    def main(self):
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        all_column_ids = parse_column_identifiers(None, column_names, self.args.zero_based, self.args.not_columns)
        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in all_column_ids])
        d = {}  # namespace dict for map_expr
        exec "def f(x): return %s" % self.args.map_expr in d

        for row in rows:
            out_row = []
            for c in all_column_ids:
                if c in column_ids:
                    out_row.append(d['f'](row[c]) if c < len(row) else None)
                else:
                    out_row.append(row[c] if c < len(row) else None)
            output.writerow(out_row)
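
The exec statement compiles the user-supplied expression into a function f(x) inside the namespace dict d. The same construction can be sketched with compile() and eval(); like the original, it executes arbitrary input, so it is only safe on trusted expressions:

def make_mapper(expr):
    # Compile the expression once; evaluate it with x bound per call.
    code = compile(expr, '<map_expr>', 'eval')
    return lambda x: eval(code, {'x': x})

double = make_mapper('x * 2')
print(double('ab'))  # abab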
Example #45
def sample_data(path, dialect_parameters, sample_size, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            reader.next()  # skip headers
            samples = []

            for row in islice(reader, sample_size):
                samples.append(row)
        except UnicodeDecodeError:
            raise DataSamplingError(
                'This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.'
                % (encoding))

        return samples
Example #46
    def read_csv(source, csv_stream):
        """
        Reads metadata from a CSV for a specified source name.
        """
        if not isinstance(source, Source):
            source = Source.objects.get(name=source)

        from csvkit import CSVKitReader
        rows = list(CSVKitReader(csv_stream, delimiter='\t'))
        fields = dict(enumerate(rows[0]))

        errors = []
        for row in rows[1:]:
            try:
                data = {fields[idx]: value for idx, value in enumerate(row)}
                tags = data.pop('tags', None)
                dataset = Dataset(**data)
                dataset.source = source
                dataset.save()

                if tags:
                    dataset.tags.add(*parse_tags(tags))
            except Exception, e:
                logger.exception('Cannot import a dataset from CSV')
                errors.append(repr(e))
Example #47
    def main(self):
        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        rows = list(rows)

        widths = []

        for row in rows:
            for i, v in enumerate(row):
                try:
                    if len(v) > widths[i]:
                        widths[i] = len(v)
                except IndexError:
                    widths.append(len(v))

        # Dashes span each width with '+' character at intersection of
        # horizontal and vertical dividers.
        divider = '|--' + '-+-'.join('-' * w for w in widths) + '--|'

        self.output_file.write('%s\n' % divider)

        for i, row in enumerate(rows):
            output = []

            for j, d in enumerate(row):
                if d is None:
                    d = ''
                output.append(' %s ' % unicode(d).ljust(widths[j]))

            self.output_file.write(('| %s |\n' % ('|'.join(output))).encode('utf-8'))

            if i == 0 or i == len(rows) - 1:
                self.output_file.write('%s\n' % divider)
Example #48
    def main(self):
        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        rows = list(rows)

        widths = []

        for row in rows:
            for i, v in enumerate(row):
                try:
                    if len(v) > widths[i]:
                        widths[i] = len(v)
                except IndexError:
                    widths.append(len(v))

        # Width of the fields, plus space between, plus fore and aft dividers
        divider = '-' * (sum(widths) + (3 * len(widths)) + 3)

        self.output_file.write('%s\n' % divider)

        for i, row in enumerate(rows):
            self.input_line_number = i + 1
            output = []

            for j, d in enumerate(row):
                if d is None:
                    d = ''
                output.append(' %s ' % unicode(d).ljust(widths[j]))

            self.output_file.write(
                ('| %s |\n' % ('|'.join(output))).encode('utf-8'))

            if i == 0 or i == len(rows) - 1:
                self.output_file.write('%s\n' % divider)
Example #49
    def main(self):
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        drop_white = lambda i: re.sub('\s+$', '', re.sub('^\s+', '', i))
        for row in rows:
            out_row = [
                drop_white(row[c]) if c < len(row) else None
                for c in column_ids
            ]
            output.writerow(out_row)
Example #50
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if len(self.input_files) < 2:
            self.argparser.error('You must specify at least two files to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.input_files] 
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.input_files):
                self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
        else:
            groups = None
                
        group_name = self.args.group_name if self.args.group_name else 'group'

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.input_files):
            rows = CSVKitReader(f, **self.reader_kwargs)

            # If we have header rows, use them
            if not self.args.no_header_row:
                headers = next(rows, [])

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)
            # If not, generate simple column names based on the first row
            else:
                row = next(rows, [])

                headers = make_default_headers(len(row))

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)

                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            f.close()
Example #51
    def get_headers(self, csv_path):
        """
        Returns the column headers from the CSV as a list.
        """
        with open(csv_path, 'r') as infile:
            csv_reader = CSVKitReader(infile)
            headers = next(csv_reader)
        return headers
Example #52
    def main(self):
        reader = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.dryrun:
            checker = RowChecker(reader)

            for row in checker.checked_rows():
                pass

            if checker.errors:
                for e in checker.errors:
                    self.output_file.write('Line %i: %s\n' %
                                           (e.line_number, e.msg))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write(
                    '%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n'
                    % (checker.rows_joined, checker.joins))
        else:
            base, ext = splitext(self.input_file.name)

            with open('%s_out.csv' % base, 'w') as f:
                clean_writer = CSVKitWriter(f, **self.writer_kwargs)

                checker = RowChecker(reader)
                clean_writer.writerow(checker.column_names)

                for row in checker.checked_rows():
                    clean_writer.writerow(row)

            if checker.errors:
                error_filename = '%s_err.csv' % base

                with open(error_filename, 'w') as f:
                    error_writer = CSVKitWriter(f, **self.writer_kwargs)

                    error_header = ['line_number', 'msg']
                    error_header.extend(checker.column_names)
                    error_writer.writerow(error_header)

                    error_count = len(checker.errors)

                    for e in checker.errors:
                        error_writer.writerow(self._format_error_row(e))

                self.output_file.write(
                    '%i error%s logged to %s\n' %
                    (error_count, '' if error_count == 1 else 's',
                     error_filename))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write(
                    '%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n'
                    % (checker.rows_joined, checker.joins))
Example #53
File: cli.py Project: pallih/csvkit
    def print_column_names(self):
        """
        Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
        """
        f = self.args.file
        output = self.output_file
        try:
            zero_based = self.args.zero_based
        except:
            zero_based = False

        rows = CSVKitReader(f, **self.reader_kwargs)
        column_names = rows.next()

        for i, c in enumerate(column_names):
            if not zero_based:
                i += 1
            output.write("%3i: %s\n" % (i, c))
Example #54
    def print_column_names(self):
        """
        Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
        """
        f = self.args.file
        output = self.output_file
        try:
            zero_based = self.args.zero_based
        except:
            zero_based = False

        rows = CSVKitReader(f, **self.reader_kwargs)
        column_names = rows.next()

        for i, c in enumerate(column_names):
            if not zero_based:
                i += 1
            output.write('%3i: %s\n' % (i, c))
Example #55
def load_data(input_file):
    for file in glob.glob(input_file):
        print file
        open_file = open(file)  # open the matched file, not the glob pattern
        grasp = CSVKitReader(open_file, encoding='utf-8', delimiter='\t')
        #grasp = csv.reader(open_file, delimiter="\t")
        grasp.next()  # skip header
#        bad_rows = []
        for row in grasp:
            #assert len(row) == VALID_COLUMN_NO
            try:
                one_snp_json = _map_line_to_json(row)
            #if one_snp_json:
                yield one_snp_json
            except:
                diff_rows = enumerate(row)
                wrong = [(i, row) for (i, row) in diff_rows]
                print wrong[-1]
                 
        open_file.close()
Example #56
    def print_column_names(self):
        """
        Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
        """
        if self.args.no_header_row:
            raise RequiredHeaderError, 'You cannot use --no-header-row with the -n or --names options.'

        f = self.args.file
        output = self.output_file
        try:
            zero_based = self.args.zero_based
        except:
            zero_based = False

        rows = CSVKitReader(f, **self.reader_kwargs)
        column_names = rows.next()

        for i, c in enumerate(column_names):
            if not zero_based:
                i += 1
            output.write('%3i: %s\n' % (i, c))