Example #1
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if not self.input_files:
            self.argparser.error('You must specify at least one file to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.input_files]
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.input_files):
                self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
        else:
            groups = None

        group_name = self.args.group_name if self.args.group_name else 'group'

        output = agate.csv.writer(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.input_files):
            rows = agate.csv.reader(f, **self.reader_kwargs)

            # If we have header rows, use them
            if not self.args.no_header_row:
                headers = next(rows, [])

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)
            # If we don't, generate simple column names based on the first row
            else:
                row = next(rows, [])

                # make_default_headers may return an immutable sequence;
                # copy it to a list so the group column can be inserted below.
                headers = list(make_default_headers(len(row)))

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)

                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            f.close()
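A note on a helper these examples depend on: make_default_headers is called throughout but never shown. A minimal sketch consistent with how it is used here (one placeholder name per column) might look like the following; the "column1"-style names are an assumption, not necessarily the library's actual naming scheme.

    def make_default_headers(n):
        # Hypothetical stand-in: build n placeholder column names
        # ("column1", "column2", ...). The real helper may use a different
        # scheme and may return an immutable sequence such as a tuple.
        return ['column%d' % (i + 1) for i in range(n)]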
Example #2
    def main(self):
        if self.additional_input_expected():
            sys.stderr.write(
                'No input file or piped data provided. Waiting for standard input:\n'
            )

        reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
        writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
        if self.args.no_header_row:
            # Peek at a row to get the number of columns.
            _row = next(reader)
            reader = itertools.chain([_row], reader)
            headers = make_default_headers(len(_row))
            writer.writerow(headers)

        writer.writerows(reader)
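The peek-and-reattach idiom above is the key move: next(reader) consumes one row to measure the column count, and itertools.chain([_row], reader) rebuilds an iterator that still yields every row. A self-contained illustration of the same trick:

    import itertools

    rows = iter([['a', 'b', 'c'], ['1', '2', '3']])

    # Peek at the first row without losing it.
    first = next(rows)
    rows = itertools.chain([first], rows)

    print(len(first))   # 3 -- column count taken from the peeked row
    print(list(rows))   # both rows are still available downstream

Note that the bare next(reader) in the example will raise StopIteration on empty input; the next(rows, []) form used in the other examples avoids that crash.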
Example #3
    def main(self):
        if sys.stdin.isatty() and not self.args.input_paths:
            sys.stderr.write(
                'No input file or piped data provided. Waiting for standard input:\n'
            )

        has_groups = self.args.group_by_filenames or self.args.groups

        if self.args.groups and not self.args.group_by_filenames:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.args.input_paths):
                self.argparser.error(
                    'The number of grouping values must be equal to the number of CSV files being stacked.'
                )
        else:
            groups = None

        group_name = self.args.group_name if self.args.group_name else 'group'

        output = agate.csv.writer(self.output_file, **self.writer_kwargs)

        for i, path in enumerate(self.args.input_paths):
            f = self._open_input_file(path)

            if isinstance(self.args.skip_lines, int):
                skip_lines = self.args.skip_lines
                while skip_lines > 0:
                    f.readline()
                    skip_lines -= 1
            else:
                raise ValueError('skip_lines argument must be an int')

            rows = agate.csv.reader(f, **self.reader_kwargs)

            if has_groups:
                if groups:
                    group = groups[i]
                else:
                    group = os.path.basename(f.name)

            # If we have header rows, use them
            if not self.args.no_header_row:
                headers = next(rows, [])

                if i == 0:
                    if has_groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)
            # If we don't, generate simple column names based on the first row
            else:
                row = next(rows, [])

                # make_default_headers may return an immutable sequence;
                # copy it to a list so the group column can be inserted below.
                headers = list(make_default_headers(len(row)))

                if i == 0:
                    if has_groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)

                if has_groups:
                    row.insert(0, group)

                output.writerow(row)

            for row in rows:
                if has_groups:
                    row.insert(0, group)

                output.writerow(row)

            f.close()
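As a design note, the manual readline() loop above can also be expressed with itertools.islice, which consumes a bounded number of lines without an explicit counter. A behavior-equivalent sketch, assuming a Python 3 file object whose line iterator stays in sync with later reads:

    import itertools

    def skip_leading_lines(f, n):
        # Advance f past its first n lines by exhausting a bounded
        # slice of its line iterator.
        for _ in itertools.islice(f, n):
            pass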
Example #4
    def from_csv(cls, f, name='from_csv_table', sniff_limit=None, column_ids=None, blanks_as_nulls=True, column_offset=1, infer_types=True, no_header_row=False, **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type-inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table).
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # sniff_limit == 0 means do not sniff
        if sniff_limit is None:
            kwargs['dialect'] = sniff_dialect(contents)
        elif sniff_limit > 0:
            kwargs['dialect'] = sniff_dialect(contents[:sniff_limit])

        f = six.StringIO(contents)
        rows = agate.csv.reader(f, **kwargs)

        try:
            if no_header_row:
                # Peek at a row to infer column names from, and put it back on top
                row = next(rows)
                rows = itertools.chain([row], rows)
                headers = make_default_headers(len(row))
            else:
                headers = next(rows)
        except StopIteration:
            # The input is empty (e.g. `/dev/null`); fall back to no headers.
            headers = []

        if no_header_row or column_ids:
            column_ids = parse_column_identifiers(column_ids, headers, column_offset)
            headers = [headers[c] for c in column_ids]
        else:
            column_ids = range(len(headers))

        data_columns = [[] for c in headers]
        width = len(data_columns)

        for i, row in enumerate(rows):
            j = 0

            # Iterate by position; the raw cell value is unused because
            # values are read through the column_ids mapping instead.
            for j, _ in enumerate(row):
                try:
                    data_columns[j].append(row[column_ids[j]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

            j += 1

            # Populate remaining columns with None
            while j < width:
                data_columns[j].append(None)

                j += 1

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types))

        return Table(columns, name=name)
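sniff_dialect is another helper that is not shown. A plausible minimal version can be built on the standard library's csv.Sniffer; this is a sketch of that idea, not necessarily what the original helper does:

    import csv

    def sniff_dialect(sample):
        # Guess the delimiter and quoting style from a sample of raw CSV
        # text, falling back to the default 'excel' dialect on failure.
        try:
            return csv.Sniffer().sniff(sample)
        except csv.Error:
            return csv.excel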
Example #5
    def main(self):
        if sys.stdin.isatty() and not self.args.input_paths:
            sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')

        has_groups = self.args.group_by_filenames or self.args.groups

        if self.args.groups and not self.args.group_by_filenames:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.args.input_paths):
                self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
        else:
            groups = None

        group_name = self.args.group_name if self.args.group_name else 'group'

        output = agate.csv.writer(self.output_file, **self.writer_kwargs)

        for i, path in enumerate(self.args.input_paths):
            f = self._open_input_file(path)

            if isinstance(self.args.skip_lines, int):
                skip_lines = self.args.skip_lines
                while skip_lines > 0:
                    f.readline()
                    skip_lines -= 1
            else:
                raise ValueError('skip_lines argument must be an int')

            rows = agate.csv.reader(f, **self.reader_kwargs)

            if has_groups:
                if groups:
                    group = groups[i]
                else:
                    group = os.path.basename(f.name)

            # If we have header rows, use them
            if not self.args.no_header_row:
                headers = next(rows, [])

                if i == 0:
                    if has_groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)
            # If we don't, generate simple column names based on the first row
            else:
                row = next(rows, [])

                # make_default_headers may return an immutable sequence;
                # copy it to a list so the group column can be inserted below.
                headers = list(make_default_headers(len(row)))

                if i == 0:
                    if has_groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)

                if has_groups:
                    row.insert(0, group)

                output.writerow(row)

            for row in rows:
                if has_groups:
                    row.insert(0, group)

                output.writerow(row)

            f.close()
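To make the group-column behavior concrete, here is a small standalone simulation of the same stacking logic using only the standard library; the file names, data, and group values are illustrative only:

    import csv
    import io

    inputs = {'x.csv': 'id,val\n1,10\n', 'y.csv': 'id,val\n2,20\n'}
    groups = ['a', 'b']

    out = io.StringIO()
    writer = csv.writer(out)

    for i, (name, text) in enumerate(inputs.items()):
        rows = csv.reader(io.StringIO(text))
        headers = next(rows)

        # Write the (prefixed) header row once, from the first file only.
        if i == 0:
            writer.writerow(['group'] + headers)

        for row in rows:
            writer.writerow([groups[i]] + row)

    print(out.getvalue())
    # group,id,val
    # a,1,10
    # b,2,20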
Example #6
    def from_csv(cls,
                 f,
                 name='from_csv_table',
                 sniff_limit=None,
                 column_ids=None,
                 blanks_as_nulls=True,
                 column_offset=1,
                 infer_types=True,
                 no_header_row=False,
                 **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type-inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table).
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # sniff_limit == 0 means do not sniff
        if sniff_limit is None:
            kwargs['dialect'] = sniff_dialect(contents)
        elif sniff_limit > 0:
            kwargs['dialect'] = sniff_dialect(contents[:sniff_limit])

        f = six.StringIO(contents)
        rows = agate.csv.reader(f, **kwargs)

        try:
            if no_header_row:
                # Peek at a row to infer column names from, and put it back on top
                row = next(rows)
                rows = itertools.chain([row], rows)
                headers = make_default_headers(len(row))
            else:
                headers = next(rows)
        except StopIteration:
            # The input is empty (e.g. `/dev/null`); fall back to no headers.
            headers = []

        if no_header_row or column_ids:
            column_ids = parse_column_identifiers(column_ids, headers,
                                                  column_offset)
            headers = [headers[c] for c in column_ids]
        else:
            column_ids = range(len(headers))

        data_columns = [[] for c in headers]
        width = len(data_columns)

        for i, row in enumerate(rows):
            j = 0

            # Iterate by position; the raw cell value is unused because
            # values are read through the column_ids mapping instead.
            for j, _ in enumerate(row):
                try:
                    data_columns[j].append(row[column_ids[j]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

            j += 1

            # Populate remaining columns with None
            while j < width:
                data_columns[j].append(None)

                j += 1

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(
                Column(column_ids[i],
                       headers[i],
                       c,
                       blanks_as_nulls=blanks_as_nulls,
                       infer_types=infer_types))

        return Table(columns, name=name)
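A hypothetical call, assuming a Table class that exposes this classmethod; the input data and table name are illustrative only:

    import io

    csv_text = 'id,name\n1,alpha\n2,beta\n'

    # sniff_limit=0 disables dialect sniffing; headers come from the
    # first row since no_header_row defaults to False.
    table = Table.from_csv(io.StringIO(csv_text), name='demo', sniff_limit=0)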