def audit(osmfile):
    """Scan an OSM file and collect street names that need attention.

    Returns a tuple ``(street_types, unnormalized_street_names)``.
    ``street_types`` maps each street type that is not in ``expected`` to
    the set of street names seen with that type.
    ``unnormalized_street_names`` is the set of street names that differ
    from what ``normalize_name`` returns for them.
    """
    odd_types = defaultdict(set)
    not_normalized = set()

    for _, element in logging_itr(ET.iterparse(osmfile)):
        if element.tag in ("node", "way"):
            name, kind = get_street_name_and_type(element)

            # Street types outside the expected list.
            type_is_unexpected = kind is not None and kind not in expected
            if type_is_unexpected:
                odd_types[kind].add(name)

            # Street names that normalization would change.
            if name is not None and name != normalize_name(name):
                not_normalized.add(name)

        # Every element except 'tag' is cleared once it has been handled.
        # NOTE(review): presumably this bounds memory during iterparse while
        # keeping 'tag' children readable from their parent -- confirm.
        if element.tag != 'tag':
            element.clear()

    return odd_types, not_normalized
def __init__(self, *args, **kwargs):
    """Initialize the region and build its input dispatch tables.

    All positional and keyword arguments are forwarded unchanged to
    ``ScreenRegion.__init__``.  Afterwards ``self.textbox_controls`` holds
    one dispatch table per modifier combination; each table maps MOUSE1
    and MOUSE3 to their activation handlers and falls back to
    ``self.type_key`` for any other key.
    """
    ScreenRegion.__init__(self, *args, **kwargs)

    # NOTE(review): the three booleans presumably encode modifier-key state;
    # which modifier each position stands for is not visible here -- confirm.
    self.textbox_controls = {}
    for mod_combo in [(False, False, False), (False, False, True)]:
        # defaultdict calls its factory with NO arguments.  The previous
        # one-parameter lambda raised TypeError on the first lookup of an
        # unmapped key, so the factory must be a zero-argument callable.
        controls = defaultdict(lambda: self.type_key)
        controls[MOUSE1] = LeftActivate
        controls[MOUSE3] = RightActivate
        self.textbox_controls[mod_combo] = controls

    self.text = "dongs"
def parse_nds(nds):
    """Collect the 'ref' attributes of the given nd elements.

    Returns ``{'node_refs': [...]}`` with the refs in input order, or an
    empty dictionary when none of the elements has a 'ref' attribute.
    """
    refs = [nd.attrib['ref'] for nd in nds if 'ref' in nd.attrib]
    return {'node_refs': refs} if refs else {}
def parse_tags(tags):
    """Build a node dictionary from the given tag elements.

    Each tag contributes its 'k' / 'v' attributes.  A key that
    ``lower_colon`` does not match is stored directly as ``node[k] = v``.
    A matching key whose first group is 'addr', and whose second group does
    not itself match ``lower_colon``, is stored in the nested 'address'
    dictionary under that second group.  Other matching keys are skipped.
    """
    # NOTE(review): the original text had lost its indentation; the `else`
    # is read here as belonging to the outer `if m` -- confirm.
    parsed = defaultdict(dict)
    for tag in tags:
        key, value = tag.attrib['k'], tag.attrib['v']
        match = lower_colon.search(key)
        if not match:
            parsed[key] = value
            continue
        if match.group(1) != 'addr':
            continue
        field = match.group(2)
        if not lower_colon.match(field):
            parsed['address'][field] = value
    return dict(parsed)
def _ReadCSV(self, file_name, cols, required, deprecated):
    """Parse file_name as CSV, generating one list of unicode values per row.

    Every generated item is a tuple (values, row_num, cols).  values[i] is
    the stripped cell for column cols[i]; it is None when the header has no
    such column and u'' when the row is too short to reach it.  Header and
    row problems are reported through self._problems as they are found.
    Nothing is generated when the file has no contents.
    """
    contents = self._GetUtf8Contents(file_name)
    if not contents:
        return

    checked_lines = util.EndOfLineChecker(StringIO.StringIO(contents),
                                          file_name, self._problems)
    reader = csv.reader(checked_lines)  # default (excel) dialect

    # Header names with surrounding whitespace trimmed.
    header = [raw_name.strip() for raw_name in reader.next()]

    name_counts = util.defaultdict(int)
    for column_header in header:
        name_counts[column_header] += 1
    for name, count in name_counts.items():
        if count > 1:
            self._problems.DuplicateColumn(header=name,
                                           file_name=file_name,
                                           count=count)

    # Unrecognized columns are often misspellings.
    header_context = (file_name, 1, [''] * len(header), header)
    recognized = cols + [old_name for (old_name, _) in deprecated]
    for col in set(header) - set(recognized):
        # Reported individually so the validator output can render a
        # highlighted list of columns.
        self._problems.UnrecognizedColumn(file_name, col, header_context)

    # Position of each wanted column in the header (-1 when absent);
    # absent required columns are reported.
    col_index = []
    for wanted in cols:
        if wanted in header:
            col_index.append(header.index(wanted))
            continue
        col_index.append(-1)
        if wanted in required:
            self._problems.MissingColumn(file_name, wanted, header_context)

    # Deprecated columns that are still present.
    for (deprecated_name, new_name) in deprecated:
        if deprecated_name in header:
            self._problems.DeprecatedColumn(file_name, deprecated_name,
                                            new_name, header_context)

    row_num = 1  # the header was line 1
    for row in reader:
        row_num += 1
        if not len(row):  # skip extra empty lines in file
            continue

        if len(row) > len(header):
            message = ('Found too many cells (commas) in line '
                       '%d of file "%s". Every row in the file '
                       'should have the same number of cells as '
                       'the header (first line) does.'
                       % (row_num, file_name))
            self._problems.OtherProblem(message, (file_name, row_num),
                                        type=problems.TYPE_WARNING)
        elif len(row) < len(header):
            message = ('Found missing cells (commas) in line '
                       '%d of file "%s". Every row in the file '
                       'should have the same number of cells as '
                       'the header (first line) does.'
                       % (row_num, file_name))
            self._problems.OtherProblem(message, (file_name, row_num),
                                        type=problems.TYPE_WARNING)

        result = [None] * len(cols)
        unicode_error_columns = []  # indexes into cols with a decode error
        for i, ci in enumerate(col_index):
            if ci < 0:
                continue
            if len(row) <= ci:  # short CSV row
                result[i] = u''
                continue
            try:
                result[i] = row[ci].decode('utf-8').strip()
            except UnicodeDecodeError:
                # Invalid bytes become REPLACEMENT CHARACTER (U+FFFD).
                decode = codecs.getdecoder("utf8")
                result[i] = decode(row[ci], errors="replace")[0].strip()
                unicode_error_columns.append(i)

        # Reported only after the whole row is converted, because the
        # report context carries the complete result list.
        for i in unicode_error_columns:
            self._problems.InvalidValue(cols[i], result[i], 'Unicode error',
                                        (file_name, row_num, result, cols))
        yield (result, row_num, cols)
def _ReadCsvDict(self, file_name, cols, required, deprecated):
    """Parse file_name as CSV, generating one dict of unicode values per row.

    Every generated item is a tuple (d, line_num, header, valid_values).
    header lists the non-blank, stripped header names, valid_values holds
    the decoded cells for those columns, and d maps one onto the other.
    Cell values are decoded but not stripped.  Header and row problems are
    reported through self._problems.  The header list is also recorded in
    self._schedule._table_columns under the file name minus ".txt".
    """
    assert file_name.endswith(".txt")
    table_name = file_name[:-4]
    contents = self._GetUtf8Contents(file_name)
    if not contents:
        return

    checked_lines = util.EndOfLineChecker(StringIO.StringIO(contents),
                                          file_name, self._problems)
    # The csv module cannot skip trailing space.  Space after a header field
    # can cause a serious parsing problem, so it is warned about below.
    # Space after body fields can cause a problem for time, integer and id
    # fields; those are validated at higher levels.
    reader = csv.reader(checked_lines, skipinitialspace=True)

    raw_header = reader.next()
    name_counts = util.defaultdict(int)
    header = []
    valid_columns = []  # positions in raw_header / raw_row that are kept
    for position, raw_name in enumerate(raw_header):
        name = raw_name.strip()
        if not name:
            self._problems.CsvSyntax(
                description="The header row should not contain any blank values. "
                            "The corresponding column will be skipped for the "
                            "entire file.",
                context=(file_name, 1, [''] * len(raw_header), raw_header),
                type=problems.TYPE_ERROR)
            continue
        if raw_name != name:
            self._problems.CsvSyntax(
                description="The header row should not contain any "
                            "space characters.",
                context=(file_name, 1, [''] * len(raw_header), raw_header),
                type=problems.TYPE_WARNING)
        header.append(name)
        valid_columns.append(position)
        name_counts[name] += 1

    for name, count in name_counts.items():
        if count > 1:
            self._problems.DuplicateColumn(header=name,
                                           file_name=file_name,
                                           count=count)

    self._schedule._table_columns[table_name] = header

    # Unrecognized columns are often misspellings.
    header_context = (file_name, 1, [''] * len(header), header)
    recognized = cols + [old_name for (old_name, _) in deprecated]
    unknown_cols = set(header) - set(recognized)
    if len(unknown_cols) == len(header):
        self._problems.CsvSyntax(
            description="The header row did not contain any known column "
                        "names. The file is most likely missing the header row "
                        "or not in the expected CSV format.",
            context=(file_name, 1, [''] * len(raw_header), raw_header),
            type=problems.TYPE_ERROR)
    else:
        for col in unknown_cols:
            # Reported individually so the validator output can render a
            # highlighted list of columns.
            self._problems.UnrecognizedColumn(file_name, col, header_context)

    # Required columns that are absent from the header.
    for col in set(required) - set(header):
        self._problems.MissingColumn(file_name, col, header_context)

    # Deprecated columns that are still present.
    for (deprecated_name, new_name) in deprecated:
        if deprecated_name in header:
            self._problems.DeprecatedColumn(file_name, deprecated_name,
                                            new_name, header_context)

    line_num = 1  # the header was line 1
    for raw_row in reader:
        line_num += 1
        if not len(raw_row):  # skip extra empty lines in file
            continue

        if len(raw_row) > len(raw_header):
            message = ('Found too many cells (commas) in line '
                       '%d of file "%s". Every row in the file '
                       'should have the same number of cells as '
                       'the header (first line) does.'
                       % (line_num, file_name))
            self._problems.OtherProblem(message, (file_name, line_num),
                                        type=problems.TYPE_WARNING)
        elif len(raw_row) < len(raw_header):
            message = ('Found missing cells (commas) in line '
                       '%d of file "%s". Every row in the file '
                       'should have the same number of cells as '
                       'the header (first line) does.'
                       % (line_num, file_name))
            self._problems.OtherProblem(message, (file_name, line_num),
                                        type=problems.TYPE_WARNING)

        # raw_row holds raw bytes that should be valid utf-8; convert the
        # kept columns to unicode, stopping at the end of a short row.
        valid_values = []
        unicode_error_columns = []  # indexes into valid_values
        for position in valid_columns:
            try:
                raw_cell = raw_row[position]
            except IndexError:
                break
            try:
                text = raw_cell.decode('utf-8')
            except UnicodeDecodeError:
                # Invalid bytes become REPLACEMENT CHARACTER (U+FFFD).
                text = codecs.getdecoder("utf8")(raw_cell,
                                                 errors="replace")[0]
                unicode_error_columns.append(len(valid_values))
            valid_values.append(text)

        # Reported only after the whole row is converted, because the
        # report context may dump every value in valid_values.
        for i in unicode_error_columns:
            self._problems.InvalidValue(header[i], valid_values[i],
                                        'Unicode error',
                                        (file_name, line_num,
                                         valid_values, header))

        d = dict(zip(header, valid_values))
        yield (d, line_num, header, valid_values)
def _ReadCSV(self, file_name, cols, required, deprecated):
    """Read file_name as CSV and generate a list of unicode values per row.

    Items are tuples (values, row_num, cols) where values is aligned with
    cols: the stripped cell text, None for a column missing from the
    header, or u'' when the row ends before that column.  Problems with the
    header or with individual rows are sent to self._problems.  An empty
    file generates nothing.
    """
    contents = self._GetUtf8Contents(file_name)
    if not contents:
        return

    line_source = util.EndOfLineChecker(StringIO.StringIO(contents),
                                        file_name, self._problems)
    reader = csv.reader(line_source)  # default (excel) dialect

    # First line is the header; trim whitespace around each name.
    header = [cell.strip() for cell in reader.next()]

    occurrences = util.defaultdict(int)
    for column_header in header:
        occurrences[column_header] += 1
    for name, count in occurrences.items():
        if count > 1:
            self._problems.DuplicateColumn(header=name,
                                           file_name=file_name,
                                           count=count)

    # Columns that are neither current nor deprecated names are most
    # likely misspellings.
    header_context = (file_name, 1, [''] * len(header), header)
    known_names = cols + [old_name for (old_name, _) in deprecated]
    for col in set(header) - set(known_names):
        # One report per column, so the validator output can show a
        # highlighted list of columns.
        self._problems.UnrecognizedColumn(file_name, col, header_context)

    # Header position for each entry of cols, -1 when it is absent; a
    # missing required column is reported.
    col_index = []
    for wanted in cols:
        if wanted in header:
            col_index.append(header.index(wanted))
        else:
            col_index.append(-1)
            if wanted in required:
                self._problems.MissingColumn(file_name, wanted,
                                             header_context)

    # Deprecated names that the file still uses.
    for (deprecated_name, new_name) in deprecated:
        if deprecated_name in header:
            self._problems.DeprecatedColumn(file_name, deprecated_name,
                                            new_name, header_context)

    row_num = 1  # line 1 was the header
    for row in reader:
        row_num += 1
        cell_count = len(row)
        if cell_count == 0:  # skip extra empty lines in file
            continue

        if cell_count > len(header):
            self._problems.OtherProblem(
                'Found too many cells (commas) in line '
                '%d of file "%s". Every row in the file '
                'should have the same number of cells as '
                'the header (first line) does.' % (row_num, file_name),
                (file_name, row_num),
                type=problems.TYPE_WARNING)
        elif cell_count < len(header):
            self._problems.OtherProblem(
                'Found missing cells (commas) in line '
                '%d of file "%s". Every row in the file '
                'should have the same number of cells as '
                'the header (first line) does.' % (row_num, file_name),
                (file_name, row_num),
                type=problems.TYPE_WARNING)

        result = [None] * len(cols)
        unicode_error_columns = []  # indexes into cols with a decode error
        for i, ci in enumerate(col_index):
            if ci < 0:
                continue
            if cell_count <= ci:  # short CSV row
                result[i] = u''
                continue
            try:
                result[i] = row[ci].decode('utf-8').strip()
            except UnicodeDecodeError:
                # Invalid bytes become REPLACEMENT CHARACTER (U+FFFD).
                lenient = codecs.getdecoder("utf8")
                result[i] = lenient(row[ci], errors="replace")[0].strip()
                unicode_error_columns.append(i)

        # Deferred until the row is complete: the report context includes
        # the full result list.
        for i in unicode_error_columns:
            self._problems.InvalidValue(cols[i], result[i], 'Unicode error',
                                        (file_name, row_num, result, cols))
        yield (result, row_num, cols)
def _ReadCsvDict(self, file_name, cols, required, deprecated):
    """Read file_name as CSV and generate a dict of unicode values per row.

    Items are tuples (d, line_num, header, valid_values).  header is the
    list of non-blank, stripped header names; valid_values holds the
    decoded and whitespace-stripped cells for those columns; d maps header
    names to those values.  Header and row problems are sent to
    self._problems.  The header list is also stored in
    self._schedule._table_columns under the file name without ".txt".
    """
    assert file_name.endswith(".txt")
    table_name = file_name[:-4]
    contents = self._GetUtf8Contents(file_name)
    if not contents:
        return

    line_source = util.EndOfLineChecker(StringIO.StringIO(contents),
                                        file_name, self._problems)
    # The csv module cannot skip trailing space.  Space after a header field
    # can cause a serious parsing problem, so it is warned about below.
    # Space after body fields can cause a problem for time, integer and id
    # fields; those are validated at higher levels.
    reader = csv.reader(line_source, skipinitialspace=True)

    raw_header = reader.next()
    occurrences = util.defaultdict(int)
    header = []
    valid_columns = []  # positions in raw_header / raw_row that are kept
    for position, raw_name in enumerate(raw_header):
        name = raw_name.strip()
        if not name:
            self._problems.CsvSyntax(
                description="The header row should not contain any blank values. "
                            "The corresponding column will be skipped for the "
                            "entire file.",
                context=(file_name, 1, [''] * len(raw_header), raw_header),
                type=problems.TYPE_ERROR)
            continue
        if raw_name != name:
            self._problems.CsvSyntax(
                description="The header row should not contain any "
                            "space characters.",
                context=(file_name, 1, [''] * len(raw_header), raw_header),
                type=problems.TYPE_WARNING)
        header.append(name)
        valid_columns.append(position)
        occurrences[name] += 1

    for name, count in occurrences.items():
        if count > 1:
            self._problems.DuplicateColumn(header=name,
                                           file_name=file_name,
                                           count=count)

    self._schedule._table_columns[table_name] = header

    # Columns that are neither current nor deprecated names are most
    # likely misspellings.
    header_context = (file_name, 1, [''] * len(header), header)
    known_names = cols + [old_name for (old_name, _) in deprecated]
    unknown_cols = set(header) - set(known_names)
    if len(unknown_cols) == len(header):
        self._problems.CsvSyntax(
            description="The header row did not contain any known column "
                        "names. The file is most likely missing the header row "
                        "or not in the expected CSV format.",
            context=(file_name, 1, [''] * len(raw_header), raw_header),
            type=problems.TYPE_ERROR)
    else:
        for col in unknown_cols:
            # One report per column, so the validator output can show a
            # highlighted list of columns.
            self._problems.UnrecognizedColumn(file_name, col, header_context)

    # Required columns that the header lacks.
    for col in set(required) - set(header):
        self._problems.MissingColumn(file_name, col, header_context)

    # Deprecated names that the file still uses.
    for (deprecated_name, new_name) in deprecated:
        if deprecated_name in header:
            self._problems.DeprecatedColumn(file_name, deprecated_name,
                                            new_name, header_context)

    line_num = 1  # line 1 was the header
    for raw_row in reader:
        line_num += 1
        cell_count = len(raw_row)
        if cell_count == 0:  # skip extra empty lines in file
            continue

        if cell_count > len(raw_header):
            self._problems.OtherProblem(
                'Found too many cells (commas) in line '
                '%d of file "%s". Every row in the file '
                'should have the same number of cells as '
                'the header (first line) does.' % (line_num, file_name),
                (file_name, line_num),
                type=problems.TYPE_WARNING)
        elif cell_count < len(raw_header):
            self._problems.OtherProblem(
                'Found missing cells (commas) in line '
                '%d of file "%s". Every row in the file '
                'should have the same number of cells as '
                'the header (first line) does.' % (line_num, file_name),
                (file_name, line_num),
                type=problems.TYPE_WARNING)

        # raw_row holds raw bytes that should be valid utf-8; convert the
        # kept columns to unicode, stopping at the end of a short row.
        valid_values = []
        unicode_error_columns = []  # indexes into valid_values
        for position in valid_columns:
            try:
                raw_cell = raw_row[position]
            except IndexError:
                break
            try:
                text = raw_cell.decode('utf-8')
            except UnicodeDecodeError:
                # Invalid bytes become REPLACEMENT CHARACTER (U+FFFD).
                text = codecs.getdecoder("utf8")(raw_cell,
                                                 errors="replace")[0]
                unicode_error_columns.append(len(valid_values))
            valid_values.append(text)

        # Deferred until the row is complete: the report context may dump
        # every value in valid_values (still unstripped at this point).
        for i in unicode_error_columns:
            self._problems.InvalidValue(header[i], valid_values[i],
                                        'Unicode error',
                                        (file_name, line_num,
                                         valid_values, header))

        # ALL whitespace around values is stripped.  This matches the
        # behavior of both the Google and OneBusAway GTFS parser.
        valid_values = [value.strip() for value in valid_values]

        d = dict(zip(header, valid_values))
        yield (d, line_num, header, valid_values)