def _ReadCSV(self, file_name, cols, required, deprecated):
  """Reads lines from file_name, yielding a list of unicode values
  corresponding to the column names in cols.

  Reports header problems (duplicate, unrecognized, missing required and
  deprecated columns) and per-row problems (wrong cell count, invalid
  UTF-8) through self._problems as a side effect.
  """
  contents = self._GetUtf8Contents(file_name)
  if not contents:
    return

  eol_checker = util.EndOfLineChecker(StringIO.StringIO(contents),
                                      file_name, self._problems)
  # Default dialect is excel.
  reader = csv.reader(eol_checker)

  header = [field.strip() for field in reader.next()]

  # Count occurrences so duplicated header names can be reported.
  header_occurrences = util.defaultdict(lambda: 0)
  for column_header in header:
    header_occurrences[column_header] += 1
  for name, count in header_occurrences.items():
    if count > 1:
      self._problems.DuplicateColumn(header=name, file_name=file_name,
                                     count=count)

  header_context = (file_name, 1, [''] * len(header), header)

  # Unrecognized columns are often misspellings; deprecated names still
  # count as recognized.
  valid_cols = cols + [old_name for (old_name, _) in deprecated]
  for col in set(header).difference(set(valid_cols)):
    # this is provided in order to create a nice colored list of
    # columns in the validator output
    self._problems.UnrecognizedColumn(file_name, col, header_context)

  # Map each requested column to its position in the header (-1 when
  # absent); an absent required column is reported as a problem.
  col_index = [-1] * len(cols)
  for i, name in enumerate(cols):
    if name in header:
      col_index[i] = header.index(name)
    elif name in required:
      self._problems.MissingColumn(file_name, name, header_context)

  for (old_name, new_name) in deprecated:
    if old_name in header:
      self._problems.DeprecatedColumn(file_name, old_name, new_name,
                                      header_context)

  row_num = 1  # the header was line 1
  for row in reader:
    row_num += 1
    if not row:
      continue  # skip extra empty lines in file

    if len(row) > len(header):
      self._problems.OtherProblem('Found too many cells (commas) in line '
                                  '%d of file "%s". Every row in the file '
                                  'should have the same number of cells as '
                                  'the header (first line) does.' %
                                  (row_num, file_name), (file_name, row_num),
                                  type=problems.TYPE_WARNING)
    if len(row) < len(header):
      self._problems.OtherProblem('Found missing cells (commas) in line '
                                  '%d of file "%s". Every row in the file '
                                  'should have the same number of cells as '
                                  'the header (first line) does.' %
                                  (row_num, file_name), (file_name, row_num),
                                  type=problems.TYPE_WARNING)

    result = [None] * len(cols)
    unicode_error_columns = []  # positions in result that failed to decode
    for i, ci in enumerate(col_index):
      if ci < 0:
        continue  # column not present in the header; leave None
      if len(row) <= ci:
        result[i] = u''  # handle short CSV rows
      else:
        try:
          result[i] = row[ci].decode('utf-8').strip()
        except UnicodeDecodeError:
          # Replace all invalid characters with
          # REPLACEMENT CHARACTER (U+FFFD)
          result[i] = codecs.getdecoder("utf8")(
              row[ci], errors="replace")[0].strip()
          unicode_error_columns.append(i)

    # Report decode failures only after the whole row is converted, so the
    # problem context can carry the complete result list.
    for i in unicode_error_columns:
      self._problems.InvalidValue(cols[i], result[i], 'Unicode error',
                                  (file_name, row_num, result, cols))

    yield (result, row_num, cols)
def _ReadCsvDict(self, file_name, cols, required, deprecated):
  """Reads lines from file_name, yielding a dict of unicode values.

  Args:
    file_name: name of the CSV file inside the feed; must end in ".txt"
      (the extension is stripped to form the table name).
    cols: list of all known column names for this file.
    deprecated: list of (deprecated_name, new_name) tuples; a
      DeprecatedColumn problem is reported for each deprecated_name
      present in the header.
    required: column names that must appear; a MissingColumn problem is
      reported for each one absent from the header.

  Yields:
    (d, line_num, header, valid_values) tuples, where d maps each
    non-blank header name to its whitespace-stripped unicode value,
    line_num is the 1-based line number (the header row is line 1),
    header is the list of non-blank header names and valid_values the
    parallel list of values for the current row.

  Header and per-row problems (blank/padded/duplicate/unknown/missing/
  deprecated columns, wrong cell counts, invalid UTF-8) are reported
  through self._problems as a side effect.
  """
  assert file_name.endswith(".txt")
  table_name = file_name[0:-4]
  contents = self._GetUtf8Contents(file_name)
  if not contents:
    return

  eol_checker = util.EndOfLineChecker(StringIO.StringIO(contents),
                                      file_name, self._problems)
  # The csv module doesn't provide a way to skip trailing space, but when I
  # checked 15/675 feeds had trailing space in a header row and 120 had spaces
  # after fields. Space after header fields can cause a serious parsing
  # problem, so warn. Space after body fields can cause a problem time,
  # integer and id fields; they will be validated at higher levels.
  reader = csv.reader(eol_checker, skipinitialspace=True)

  raw_header = reader.next()
  header_occurrences = util.defaultdict(lambda: 0)
  header = []          # stripped, non-blank header names, in file order
  valid_columns = []   # Index into raw_header and raw_row
  for i, h in enumerate(raw_header):
    h_stripped = h.strip()
    if not h_stripped:
      # Blank header cell: the whole column is dropped from the output.
      self._problems.CsvSyntax(
          description="The header row should not contain any blank values. "
                      "The corresponding column will be skipped for the "
                      "entire file.",
          context=(file_name, 1, [''] * len(raw_header), raw_header),
          type=problems.TYPE_ERROR)
      continue
    elif h != h_stripped:
      # Padded header cell: warn, but keep the column under its
      # stripped name.
      self._problems.CsvSyntax(
          description="The header row should not contain any "
                      "space characters.",
          context=(file_name, 1, [''] * len(raw_header), raw_header),
          type=problems.TYPE_WARNING)
    header.append(h_stripped)
    valid_columns.append(i)
    header_occurrences[h_stripped] += 1

  for name, count in header_occurrences.items():
    if count > 1:
      self._problems.DuplicateColumn(header=name,
                                     file_name=file_name,
                                     count=count)

  # Record the columns actually present in this table.
  self._schedule._table_columns[table_name] = header

  # check for unrecognized columns, which are often misspellings
  header_context = (file_name, 1, [''] * len(header), header)
  valid_cols = cols + [deprecated_name for (deprecated_name, _) in deprecated]
  unknown_cols = set(header) - set(valid_cols)
  if len(unknown_cols) == len(header):
    # No known name at all: almost certainly not a header row.
    self._problems.CsvSyntax(
        description="The header row did not contain any known column "
                    "names. The file is most likely missing the header row "
                    "or not in the expected CSV format.",
        context=(file_name, 1, [''] * len(raw_header), raw_header),
        type=problems.TYPE_ERROR)
  else:
    for col in unknown_cols:
      # this is provided in order to create a nice colored list of
      # columns in the validator output
      self._problems.UnrecognizedColumn(file_name, col, header_context)

  # check for missing required columns
  missing_cols = set(required) - set(header)
  for col in missing_cols:
    # this is provided in order to create a nice colored list of
    # columns in the validator output
    self._problems.MissingColumn(file_name, col, header_context)

  # check for deprecated columns
  for (deprecated_name, new_name) in deprecated:
    if deprecated_name in header:
      self._problems.DeprecatedColumn(file_name, deprecated_name, new_name,
                                      header_context)

  line_num = 1  # First line read by reader.next() above
  for raw_row in reader:
    line_num += 1
    if len(raw_row) == 0:  # skip extra empty lines in file
      continue

    if len(raw_row) > len(raw_header):
      self._problems.OtherProblem('Found too many cells (commas) in line '
                                  '%d of file "%s". Every row in the file '
                                  'should have the same number of cells as '
                                  'the header (first line) does.' %
                                  (line_num, file_name),
                                  (file_name, line_num),
                                  type=problems.TYPE_WARNING)

    if len(raw_row) < len(raw_header):
      self._problems.OtherProblem('Found missing cells (commas) in line '
                                  '%d of file "%s". Every row in the file '
                                  'should have the same number of cells as '
                                  'the header (first line) does.' %
                                  (line_num, file_name),
                                  (file_name, line_num),
                                  type=problems.TYPE_WARNING)

    # raw_row is a list of raw bytes which should be valid utf-8. Convert each
    # valid_columns of raw_row into Unicode.
    valid_values = []
    unicode_error_columns = []  # index of valid_values elements with an error
    for i in valid_columns:
      try:
        valid_values.append(raw_row[i].decode('utf-8'))
      except UnicodeDecodeError:
        # Replace all invalid characters with REPLACEMENT CHARACTER (U+FFFD)
        valid_values.append(codecs.getdecoder("utf8")(
            raw_row[i], errors="replace")[0])
        unicode_error_columns.append(len(valid_values) - 1)
      except IndexError:
        # raw_row is shorter than the header; the remaining columns are
        # simply left out of this row's dict (zip below truncates).
        break

    # The error report may contain a dump of all values in valid_values so
    # problems can not be reported until after converting all of raw_row to
    # Unicode.
    for i in unicode_error_columns:
      self._problems.InvalidValue(header[i], valid_values[i],
                                  'Unicode error',
                                  (file_name, line_num,
                                   valid_values, header))

    # We strip ALL whitespace from around values. This matches the behavior
    # of both the Google and OneBusAway GTFS parser.
    valid_values = [value.strip() for value in valid_values]

    d = dict(zip(header, valid_values))
    yield (d, line_num, header, valid_values)