Example #1
0
    def _ReadCSV(self, file_name, cols, required, deprecated):
        """Reads lines from file_name, yielding a list of unicode values
        corresponding to the column names in cols.

        Args:
          file_name: name of the CSV file within the feed.
          cols: list of column names to extract, in output order.
          required: columns that must appear in the header; a MissingColumn
            problem is reported for each requested column that is both absent
            and required.
          deprecated: list of (deprecated_name, new_name) pairs; a
            DeprecatedColumn problem is reported for each one present.

        Yields:
          (result, row_num, cols) tuples, where result is a list of unicode
          values parallel to cols (None for columns absent from the header,
          u'' for cells missing from a short row) and row_num is the 1-based
          line number of the row in the file.
        """
        contents = self._GetUtf8Contents(file_name)
        if not contents:
            return

        eol_checker = util.EndOfLineChecker(StringIO.StringIO(contents),
                                            file_name, self._problems)
        reader = csv.reader(eol_checker)  # Use excel dialect

        header = reader.next()
        header = map(lambda x: x.strip(), header)  # trim any whitespace
        header_occurrences = util.defaultdict(lambda: 0)
        for column_header in header:
            header_occurrences[column_header] += 1

        for name, count in header_occurrences.items():
            if count > 1:
                self._problems.DuplicateColumn(header=name,
                                               file_name=file_name,
                                               count=count)

        # check for unrecognized columns, which are often misspellings
        header_context = (file_name, 1, [''] * len(header), header)
        valid_cols = cols + [
            deprecated_name for (deprecated_name, _) in deprecated
        ]
        unknown_cols = set(header).difference(set(valid_cols))
        for col in unknown_cols:
            # this is provided in order to create a nice colored list of
            # columns in the validator output
            self._problems.UnrecognizedColumn(file_name, col, header_context)

        # check for missing required columns; col_index[i] maps the i-th
        # requested column to its position in the header (-1 if absent)
        col_index = [-1] * len(cols)
        for i in range(len(cols)):
            if cols[i] in header:
                col_index[i] = header.index(cols[i])
            elif cols[i] in required:
                self._problems.MissingColumn(file_name, cols[i],
                                             header_context)

        # check for deprecated columns
        for (deprecated_name, new_name) in deprecated:
            if deprecated_name in header:
                self._problems.DeprecatedColumn(file_name, deprecated_name,
                                                new_name, header_context)

        row_num = 1
        for row in reader:
            row_num += 1
            if len(row) == 0:  # skip extra empty lines in file
                continue

            if len(row) > len(header):
                self._problems.OtherProblem(
                    'Found too many cells (commas) in line '
                    '%d of file "%s".  Every row in the file '
                    'should have the same number of cells as '
                    'the header (first line) does.' % (row_num, file_name),
                    (file_name, row_num),
                    type=problems.TYPE_WARNING)

            if len(row) < len(header):
                self._problems.OtherProblem(
                    'Found missing cells (commas) in line '
                    '%d of file "%s".  Every row in the file '
                    'should have the same number of cells as '
                    'the header (first line) does.' % (row_num, file_name),
                    (file_name, row_num),
                    type=problems.TYPE_WARNING)

            result = [None] * len(cols)
            unicode_error_columns = [
            ]  # A list of column numbers with an error
            for i in range(len(cols)):
                ci = col_index[i]
                if ci >= 0:
                    if len(row) <= ci:  # handle short CSV rows
                        result[i] = u''
                    else:
                        try:
                            result[i] = row[ci].decode('utf-8').strip()
                        except UnicodeDecodeError:
                            # Replace all invalid characters with
                            # REPLACEMENT CHARACTER (U+FFFD). The errors
                            # argument must be passed positionally: the C
                            # codec functions returned by
                            # codecs.getdecoder() reject keyword arguments
                            # in Python 2, so errors="replace" would raise
                            # TypeError here instead of decoding.
                            result[i] = row[ci].decode('utf-8',
                                                       'replace').strip()
                            unicode_error_columns.append(i)

            # Problems are reported only after the whole row is converted
            # because the problem context includes the full result list.
            for i in unicode_error_columns:
                self._problems.InvalidValue(cols[i], result[i],
                                            'Unicode error',
                                            (file_name, row_num, result, cols))
            yield (result, row_num, cols)
Example #2
0
    def _ReadCsvDict(self, file_name, cols, required, deprecated):
        """Reads lines from file_name, yielding a dict of unicode values.

        Args:
          file_name: name of the CSV file within the feed; must end in ".txt"
            (the table name is the file name without that suffix).
          cols: list of known column names for this table.
          required: column names that must appear in the header; a
            MissingColumn problem is reported for each absent one.
          deprecated: list of (deprecated_name, new_name) pairs; a
            DeprecatedColumn problem is reported for each one present.

        Yields:
          (d, line_num, header, valid_values) tuples, where d maps each
          non-blank header name to its whitespace-stripped unicode value,
          line_num is the 1-based line number, header is the list of
          non-blank column names and valid_values the parallel list of
          values for the row.
        """
        assert file_name.endswith(".txt")
        table_name = file_name[0:-4]
        contents = self._GetUtf8Contents(file_name)
        if not contents:
            return

        eol_checker = util.EndOfLineChecker(StringIO.StringIO(contents),
                                            file_name, self._problems)
        # The csv module doesn't provide a way to skip trailing space, but when I
        # checked 15/675 feeds had trailing space in a header row and 120 had spaces
        # after fields. Space after header fields can cause a serious parsing
        # problem, so warn. Space after body fields can cause a problem time,
        # integer and id fields; they will be validated at higher levels.
        reader = csv.reader(eol_checker, skipinitialspace=True)

        raw_header = reader.next()
        header_occurrences = util.defaultdict(lambda: 0)
        header = []
        valid_columns = []  # Index into raw_header and raw_row
        for i, h in enumerate(raw_header):
            h_stripped = h.strip()
            if not h_stripped:
                # A blank header name: drop the whole column for every row.
                self._problems.CsvSyntax(
                    description=
                    "The header row should not contain any blank values. "
                    "The corresponding column will be skipped for the "
                    "entire file.",
                    context=(file_name, 1, [''] * len(raw_header), raw_header),
                    type=problems.TYPE_ERROR)
                continue
            elif h != h_stripped:
                self._problems.CsvSyntax(
                    description="The header row should not contain any "
                    "space characters.",
                    context=(file_name, 1, [''] * len(raw_header), raw_header),
                    type=problems.TYPE_WARNING)
            header.append(h_stripped)
            valid_columns.append(i)
            header_occurrences[h_stripped] += 1

        for name, count in header_occurrences.items():
            if count > 1:
                self._problems.DuplicateColumn(header=name,
                                               file_name=file_name,
                                               count=count)

        self._schedule._table_columns[table_name] = header

        # check for unrecognized columns, which are often misspellings
        header_context = (file_name, 1, [''] * len(header), header)
        valid_cols = cols + [
            deprecated_name for (deprecated_name, _) in deprecated
        ]
        unknown_cols = set(header) - set(valid_cols)
        if len(unknown_cols) == len(header):
            # No known column at all: the file is probably missing its header
            # row or isn't CSV, so skip the per-column reports.
            self._problems.CsvSyntax(
                description="The header row did not contain any known column "
                "names. The file is most likely missing the header row "
                "or not in the expected CSV format.",
                context=(file_name, 1, [''] * len(raw_header), raw_header),
                type=problems.TYPE_ERROR)
        else:
            for col in unknown_cols:
                # this is provided in order to create a nice colored list of
                # columns in the validator output
                self._problems.UnrecognizedColumn(file_name, col,
                                                  header_context)

        # check for missing required columns
        missing_cols = set(required) - set(header)
        for col in missing_cols:
            # this is provided in order to create a nice colored list of
            # columns in the validator output
            self._problems.MissingColumn(file_name, col, header_context)

        # check for deprecated columns
        for (deprecated_name, new_name) in deprecated:
            if deprecated_name in header:
                self._problems.DeprecatedColumn(file_name, deprecated_name,
                                                new_name, header_context)

        line_num = 1  # First line read by reader.next() above
        for raw_row in reader:
            line_num += 1
            if len(raw_row) == 0:  # skip extra empty lines in file
                continue

            if len(raw_row) > len(raw_header):
                self._problems.OtherProblem(
                    'Found too many cells (commas) in line '
                    '%d of file "%s".  Every row in the file '
                    'should have the same number of cells as '
                    'the header (first line) does.' % (line_num, file_name),
                    (file_name, line_num),
                    type=problems.TYPE_WARNING)

            if len(raw_row) < len(raw_header):
                self._problems.OtherProblem(
                    'Found missing cells (commas) in line '
                    '%d of file "%s".  Every row in the file '
                    'should have the same number of cells as '
                    'the header (first line) does.' % (line_num, file_name),
                    (file_name, line_num),
                    type=problems.TYPE_WARNING)

            # raw_row is a list of raw bytes which should be valid utf-8. Convert each
            # valid_columns of raw_row into Unicode.
            valid_values = []
            unicode_error_columns = [
            ]  # index of valid_values elements with an error
            for i in valid_columns:
                try:
                    valid_values.append(raw_row[i].decode('utf-8'))
                except UnicodeDecodeError:
                    # Replace all invalid characters with REPLACEMENT
                    # CHARACTER (U+FFFD). The errors argument must be passed
                    # positionally: the C codec functions returned by
                    # codecs.getdecoder() reject keyword arguments in
                    # Python 2, so errors="replace" would raise TypeError
                    # here instead of decoding.
                    valid_values.append(raw_row[i].decode('utf-8', 'replace'))
                    unicode_error_columns.append(len(valid_values) - 1)
                except IndexError:
                    break  # short row: remaining valid columns have no cell

            # The error report may contain a dump of all values in valid_values so
            # problems can not be reported until after converting all of raw_row to
            # Unicode.
            for i in unicode_error_columns:
                self._problems.InvalidValue(
                    header[i], valid_values[i], 'Unicode error',
                    (file_name, line_num, valid_values, header))

            # We strip ALL whitespace from around values.  This matches the behavior
            # of both the Google and OneBusAway GTFS parser.
            valid_values = [value.strip() for value in valid_values]

            d = dict(zip(header, valid_values))
            yield (d, line_num, header, valid_values)