def audit(osmfile):
    '''
    Performs the auditing operations on the given file. Returns a tuple
    (street_types, unnormalized_street_names), where street_types is a
    dictionary mapping unexpected street types to example street names with that
    type, and unnormalized_street_names is a set of street names that are not in
    normalized form.
    '''
    street_types = defaultdict(set)
    unnormalized_street_names = set()

    for _, element in logging_itr(ET.iterparse(osmfile)):
        if element.tag in ("node", "way"):
            name, kind = get_street_name_and_type(element)

            # Collect street types that are outside the expected set.
            if kind is not None and kind not in expected:
                street_types[kind].add(name)

            # Collect names whose capitalization differs from normalized form.
            if name is not None and normalize_name(name) != name:
                unnormalized_street_names.add(name)

        # Release parsed elements to keep memory bounded; 'tag' children are
        # kept because their parent node/way may not have been visited yet.
        if element.tag != 'tag':
            element.clear()

    return street_types, unnormalized_street_names
# --- Esempio n. 2 ---
def audit(osmfile):
    '''
    Performs the auditing operations on the given file. Returns a tuple
    (street_types, unnormalized_street_names), where street_types is a
    dictionary mapping unexpected street types to example street names with that
    type, and unnormalized_street_names is a set of street names that are not in
    normalized form.
    '''
    street_types = defaultdict(set)
    unnormalized_street_names = set()

    for _, elem in logging_itr(ET.iterparse(osmfile)):
        is_addressable = elem.tag == "node" or elem.tag == "way"
        if is_addressable:
            street_name, street_type = get_street_name_and_type(elem)

            unexpected_type = (street_type is not None
                               and street_type not in expected)
            if unexpected_type:
                # Remember an example street name for this unexpected type.
                street_types[street_type].add(street_name)

            if street_name is not None:
                # A name that round-trips through normalization unchanged is
                # already normalized; anything else is flagged.
                if street_name != normalize_name(street_name):
                    unnormalized_street_names.add(street_name)

        # Clear non-'tag' elements so the iterative parse stays memory-light.
        if elem.tag != 'tag':
            elem.clear()

    return street_types, unnormalized_street_names
# --- Esempio n. 3 ---
 def __init__(self, *args, **kwargs):
     """Initialize the region and its per-modifier-combo control maps.

     Each modifier combination maps input keys to handlers; unknown keys
     fall back to self.type_key.
     """
     ScreenRegion.__init__(self, *args, **kwargs)
     self.textbox_controls = {}
     # Modifier combos: (ctrl, alt, shift) — plain and shift-only.
     for mod_combo in [(False, False, False), (False, False, True)]:
         # BUG FIX: defaultdict's default_factory is called with NO
         # arguments, so the original `lambda key: self.type_key` raised
         # TypeError on every missing-key lookup. A zero-arg lambda
         # returns the fallback handler as intended.
         self.textbox_controls[mod_combo] = defaultdict(lambda: self.type_key)
         self.textbox_controls[mod_combo][MOUSE1] = LeftActivate
         self.textbox_controls[mod_combo][MOUSE3] = RightActivate
     # Default textbox contents.
     self.text = "dongs"
def parse_nds(nds):
    '''
    Parses the given nd elements and returns a node dictionary containing a
    node_refs array if there are any refs.
    '''
    # Gather every 'ref' attribute; only emit the key when at least one exists.
    refs = [nd.attrib['ref'] for nd in nds if 'ref' in nd.attrib]
    return {'node_refs': refs} if refs else {}
def parse_tags(tags):
    '''
    Parses the given tag elements and returns a node dictionary. Includes a nested
    address dictionary if appropriate.
    '''
    node = defaultdict(dict)
    for tag in tags:
        k, v = tag.attrib['k'], tag.attrib['v']
        m = lower_colon.search(k)
        if m:
            if m.group(1) == 'addr' and not lower_colon.match(m.group(2)):
                node['address'][m.group(2)] = v
        else:
            node[k] = v

    return dict(node)
# --- Esempio n. 6 ---
  def _ReadCSV(self, file_name, cols, required, deprecated):
    """Reads lines from file_name, yielding a list of unicode values
    corresponding to the column names in cols."""
    # NOTE(review): Python 2 idioms throughout (reader.next(), StringIO,
    # str.decode) -- this method will not run unmodified on Python 3.
    contents = self._GetUtf8Contents(file_name)
    if not contents:
      return

    # Wrap the contents so inconsistent line endings are reported as problems.
    eol_checker = util.EndOfLineChecker(StringIO.StringIO(contents),
                                   file_name, self._problems)
    reader = csv.reader(eol_checker)  # Use excel dialect

    header = reader.next()
    header = map(lambda x: x.strip(), header)  # trim any whitespace
    # Count occurrences of each header name so duplicates can be reported.
    header_occurrences = util.defaultdict(lambda: 0)
    for column_header in header:
      header_occurrences[column_header] += 1

    for name, count in header_occurrences.items():
      if count > 1:
        self._problems.DuplicateColumn(
            header=name,
            file_name=file_name,
            count=count)

    # check for unrecognized columns, which are often misspellings
    header_context = (file_name, 1, [''] * len(header), header)
    valid_cols = cols + [deprecated_name for (deprecated_name, _) in deprecated]
    unknown_cols = set(header).difference(set(valid_cols))
    for col in unknown_cols:
      # this is provided in order to create a nice colored list of
      # columns in the validator output
      self._problems.UnrecognizedColumn(file_name, col, header_context)

    # check for missing required columns
    # col_index[i] is the position of cols[i] in the header, or -1 if absent.
    col_index = [-1] * len(cols)
    for i in range(len(cols)):
      if cols[i] in header:
        col_index[i] = header.index(cols[i])
      elif cols[i] in required:
        self._problems.MissingColumn(file_name, cols[i], header_context)

    # check for deprecated columns
    for (deprecated_name, new_name) in deprecated:
      if deprecated_name in header:
        self._problems.DeprecatedColumn(file_name, deprecated_name, new_name,
                                        header_context)

    row_num = 1  # the header was line 1; data rows start at line 2
    for row in reader:
      row_num += 1
      if len(row) == 0:  # skip extra empty lines in file
        continue

      # Mismatched cell counts are warnings; the row is still processed.
      if len(row) > len(header):
        self._problems.OtherProblem('Found too many cells (commas) in line '
                                    '%d of file "%s".  Every row in the file '
                                    'should have the same number of cells as '
                                    'the header (first line) does.' %
                                    (row_num, file_name), (file_name, row_num),
                                    type=problems.TYPE_WARNING)

      if len(row) < len(header):
        self._problems.OtherProblem('Found missing cells (commas) in line '
                                    '%d of file "%s".  Every row in the file '
                                    'should have the same number of cells as '
                                    'the header (first line) does.' %
                                    (row_num, file_name), (file_name, row_num),
                                    type=problems.TYPE_WARNING)

      # Build the output row in the order of cols, decoding each cell from
      # UTF-8; absent/short cells become the empty unicode string.
      result = [None] * len(cols)
      unicode_error_columns = []  # A list of column numbers with an error
      for i in range(len(cols)):
        ci = col_index[i]
        if ci >= 0:
          if len(row) <= ci:  # handle short CSV rows
            result[i] = u''
          else:
            try:
              result[i] = row[ci].decode('utf-8').strip()
            except UnicodeDecodeError:
              # Replace all invalid characters with
              # REPLACEMENT CHARACTER (U+FFFD)
              result[i] = codecs.getdecoder("utf8")(row[ci],
                                                    errors="replace")[0].strip()
              unicode_error_columns.append(i)

      # Decode errors are reported only after the whole row is converted so
      # the problem context can include the complete result row.
      for i in unicode_error_columns:
        self._problems.InvalidValue(cols[i], result[i],
                                    'Unicode error',
                                    (file_name, row_num, result, cols))
      yield (result, row_num, cols)
# --- Esempio n. 7 ---
  def _ReadCsvDict(self, file_name, cols, required, deprecated):
    """Reads lines from file_name, yielding a dict of unicode values."""
    # NOTE(review): Python 2 idioms throughout (reader.next(), StringIO,
    # str.decode) -- this method will not run unmodified on Python 3.
    assert file_name.endswith(".txt")
    table_name = file_name[0:-4]  # file name minus the ".txt" extension
    contents = self._GetUtf8Contents(file_name)
    if not contents:
      return

    # Wrap the contents so inconsistent line endings are reported as problems.
    eol_checker = util.EndOfLineChecker(StringIO.StringIO(contents),
                                   file_name, self._problems)
    # The csv module doesn't provide a way to skip trailing space, but when I
    # checked 15/675 feeds had trailing space in a header row and 120 had spaces
    # after fields. Space after header fields can cause a serious parsing
    # problem, so warn. Space after body fields can cause a problem time,
    # integer and id fields; they will be validated at higher levels.
    reader = csv.reader(eol_checker, skipinitialspace=True)

    raw_header = reader.next()
    header_occurrences = util.defaultdict(lambda: 0)
    header = []
    valid_columns = []  # Index into raw_header and raw_row
    for i, h in enumerate(raw_header):
      h_stripped = h.strip()
      if not h_stripped:
        # Blank header cell: report it and skip that column for every row.
        self._problems.CsvSyntax(
            description="The header row should not contain any blank values. "
                        "The corresponding column will be skipped for the "
                        "entire file.",
            context=(file_name, 1, [''] * len(raw_header), raw_header),
            type=problems.TYPE_ERROR)
        continue
      elif h != h_stripped:
        # Whitespace around a header name is only a warning; the stripped
        # name is still used.
        self._problems.CsvSyntax(
            description="The header row should not contain any "
                        "space characters.",
            context=(file_name, 1, [''] * len(raw_header), raw_header),
            type=problems.TYPE_WARNING)
      header.append(h_stripped)
      valid_columns.append(i)
      header_occurrences[h_stripped] += 1

    for name, count in header_occurrences.items():
      if count > 1:
        self._problems.DuplicateColumn(
            header=name,
            file_name=file_name,
            count=count)

    # Record which columns this table actually contains for later use.
    self._schedule._table_columns[table_name] = header

    # check for unrecognized columns, which are often misspellings
    header_context = (file_name, 1, [''] * len(header), header)
    valid_cols = cols + [deprecated_name for (deprecated_name, _) in deprecated]
    unknown_cols = set(header) - set(valid_cols)
    if len(unknown_cols) == len(header):
      # No known column name at all: the header row is probably missing, or
      # the file is not in CSV format.
      self._problems.CsvSyntax(
            description="The header row did not contain any known column "
                        "names. The file is most likely missing the header row "
                        "or not in the expected CSV format.",
            context=(file_name, 1, [''] * len(raw_header), raw_header),
            type=problems.TYPE_ERROR)
    else:
      for col in unknown_cols:
        # this is provided in order to create a nice colored list of
        # columns in the validator output
        self._problems.UnrecognizedColumn(file_name, col, header_context)

    # check for missing required columns
    missing_cols = set(required) - set(header)
    for col in missing_cols:
      # this is provided in order to create a nice colored list of
      # columns in the validator output
      self._problems.MissingColumn(file_name, col, header_context)

    # check for deprecated columns
    for (deprecated_name, new_name) in deprecated:
      if deprecated_name in header:
        self._problems.DeprecatedColumn(file_name, deprecated_name, new_name,
                                        header_context)

    line_num = 1  # First line read by reader.next() above
    for raw_row in reader:
      line_num += 1
      if len(raw_row) == 0:  # skip extra empty lines in file
        continue

      # Mismatched cell counts are warnings; the row is still processed.
      if len(raw_row) > len(raw_header):
        self._problems.OtherProblem('Found too many cells (commas) in line '
                                    '%d of file "%s".  Every row in the file '
                                    'should have the same number of cells as '
                                    'the header (first line) does.' %
                                    (line_num, file_name),
                                    (file_name, line_num),
                                    type=problems.TYPE_WARNING)

      if len(raw_row) < len(raw_header):
        self._problems.OtherProblem('Found missing cells (commas) in line '
                                    '%d of file "%s".  Every row in the file '
                                    'should have the same number of cells as '
                                    'the header (first line) does.' %
                                    (line_num, file_name),
                                    (file_name, line_num),
                                    type=problems.TYPE_WARNING)

      # raw_row is a list of raw bytes which should be valid utf-8. Convert each
      # valid_columns of raw_row into Unicode.
      valid_values = []
      unicode_error_columns = []  # index of valid_values elements with an error
      for i in valid_columns:
        try:
          valid_values.append(raw_row[i].decode('utf-8'))
        except UnicodeDecodeError:
          # Replace all invalid characters with REPLACEMENT CHARACTER (U+FFFD)
          valid_values.append(codecs.getdecoder("utf8")
                              (raw_row[i], errors="replace")[0])
          unicode_error_columns.append(len(valid_values) - 1)
        except IndexError:
          # Short row: remaining columns are simply absent from the dict
          # (zip below truncates to the shorter of header/valid_values).
          break

      # The error report may contain a dump of all values in valid_values so
      # problems can not be reported until after converting all of raw_row to
      # Unicode.
      for i in unicode_error_columns:
        self._problems.InvalidValue(header[i], valid_values[i],
                                    'Unicode error',
                                    (file_name, line_num,
                                     valid_values, header))


      d = dict(zip(header, valid_values))
      yield (d, line_num, header, valid_values)
# --- Esempio n. 8 ---
    def _ReadCSV(self, file_name, cols, required, deprecated):
        """Reads lines from file_name, yielding a list of unicode values
    corresponding to the column names in cols."""
        # NOTE(review): Python 2 idioms throughout (reader.next(), StringIO,
        # str.decode) -- this method will not run unmodified on Python 3.
        contents = self._GetUtf8Contents(file_name)
        if not contents:
            return

        # Wrap the contents so inconsistent line endings are reported.
        eol_checker = util.EndOfLineChecker(StringIO.StringIO(contents),
                                            file_name, self._problems)
        reader = csv.reader(eol_checker)  # Use excel dialect

        header = reader.next()
        header = map(lambda x: x.strip(), header)  # trim any whitespace
        # Count occurrences of each header name so duplicates can be reported.
        header_occurrences = util.defaultdict(lambda: 0)
        for column_header in header:
            header_occurrences[column_header] += 1

        for name, count in header_occurrences.items():
            if count > 1:
                self._problems.DuplicateColumn(header=name,
                                               file_name=file_name,
                                               count=count)

        # check for unrecognized columns, which are often misspellings
        header_context = (file_name, 1, [''] * len(header), header)
        valid_cols = cols + [
            deprecated_name for (deprecated_name, _) in deprecated
        ]
        unknown_cols = set(header).difference(set(valid_cols))
        for col in unknown_cols:
            # this is provided in order to create a nice colored list of
            # columns in the validator output
            self._problems.UnrecognizedColumn(file_name, col, header_context)

        # check for missing required columns
        # col_index[i] is the position of cols[i] in the header, -1 if absent.
        col_index = [-1] * len(cols)
        for i in range(len(cols)):
            if cols[i] in header:
                col_index[i] = header.index(cols[i])
            elif cols[i] in required:
                self._problems.MissingColumn(file_name, cols[i],
                                             header_context)

        # check for deprecated columns
        for (deprecated_name, new_name) in deprecated:
            if deprecated_name in header:
                self._problems.DeprecatedColumn(file_name, deprecated_name,
                                                new_name, header_context)

        row_num = 1  # the header was line 1; data rows start at line 2
        for row in reader:
            row_num += 1
            if len(row) == 0:  # skip extra empty lines in file
                continue

            # Mismatched cell counts are warnings; the row is still processed.
            if len(row) > len(header):
                self._problems.OtherProblem(
                    'Found too many cells (commas) in line '
                    '%d of file "%s".  Every row in the file '
                    'should have the same number of cells as '
                    'the header (first line) does.' % (row_num, file_name),
                    (file_name, row_num),
                    type=problems.TYPE_WARNING)

            if len(row) < len(header):
                self._problems.OtherProblem(
                    'Found missing cells (commas) in line '
                    '%d of file "%s".  Every row in the file '
                    'should have the same number of cells as '
                    'the header (first line) does.' % (row_num, file_name),
                    (file_name, row_num),
                    type=problems.TYPE_WARNING)

            # Build the output row in cols order, decoding each cell from
            # UTF-8; absent/short cells become the empty unicode string.
            result = [None] * len(cols)
            unicode_error_columns = [
            ]  # A list of column numbers with an error
            for i in range(len(cols)):
                ci = col_index[i]
                if ci >= 0:
                    if len(row) <= ci:  # handle short CSV rows
                        result[i] = u''
                    else:
                        try:
                            result[i] = row[ci].decode('utf-8').strip()
                        except UnicodeDecodeError:
                            # Replace all invalid characters with
                            # REPLACEMENT CHARACTER (U+FFFD)
                            result[i] = codecs.getdecoder("utf8")(
                                row[ci], errors="replace")[0].strip()
                            unicode_error_columns.append(i)

            # Decode errors are reported after the whole row is converted so
            # the problem context can include the complete result row.
            for i in unicode_error_columns:
                self._problems.InvalidValue(cols[i], result[i],
                                            'Unicode error',
                                            (file_name, row_num, result, cols))
            yield (result, row_num, cols)
# --- Esempio n. 9 ---
    def _ReadCsvDict(self, file_name, cols, required, deprecated):
        """Reads lines from file_name, yielding a dict of unicode values."""
        # NOTE(review): Python 2 idioms throughout (reader.next(), StringIO,
        # str.decode) -- this method will not run unmodified on Python 3.
        assert file_name.endswith(".txt")
        table_name = file_name[0:-4]  # file name minus the ".txt" extension
        contents = self._GetUtf8Contents(file_name)
        if not contents:
            return

        # Wrap the contents so inconsistent line endings are reported.
        eol_checker = util.EndOfLineChecker(StringIO.StringIO(contents),
                                            file_name, self._problems)
        # The csv module doesn't provide a way to skip trailing space, but when I
        # checked 15/675 feeds had trailing space in a header row and 120 had spaces
        # after fields. Space after header fields can cause a serious parsing
        # problem, so warn. Space after body fields can cause a problem time,
        # integer and id fields; they will be validated at higher levels.
        reader = csv.reader(eol_checker, skipinitialspace=True)

        raw_header = reader.next()
        header_occurrences = util.defaultdict(lambda: 0)
        header = []
        valid_columns = []  # Index into raw_header and raw_row
        for i, h in enumerate(raw_header):
            h_stripped = h.strip()
            if not h_stripped:
                # Blank header cell: report it and skip that column entirely.
                self._problems.CsvSyntax(
                    description=
                    "The header row should not contain any blank values. "
                    "The corresponding column will be skipped for the "
                    "entire file.",
                    context=(file_name, 1, [''] * len(raw_header), raw_header),
                    type=problems.TYPE_ERROR)
                continue
            elif h != h_stripped:
                # Whitespace around a header name is only a warning; the
                # stripped name is still used.
                self._problems.CsvSyntax(
                    description="The header row should not contain any "
                    "space characters.",
                    context=(file_name, 1, [''] * len(raw_header), raw_header),
                    type=problems.TYPE_WARNING)
            header.append(h_stripped)
            valid_columns.append(i)
            header_occurrences[h_stripped] += 1

        for name, count in header_occurrences.items():
            if count > 1:
                self._problems.DuplicateColumn(header=name,
                                               file_name=file_name,
                                               count=count)

        # Record which columns this table actually contains for later use.
        self._schedule._table_columns[table_name] = header

        # check for unrecognized columns, which are often misspellings
        header_context = (file_name, 1, [''] * len(header), header)
        valid_cols = cols + [
            deprecated_name for (deprecated_name, _) in deprecated
        ]
        unknown_cols = set(header) - set(valid_cols)
        if len(unknown_cols) == len(header):
            # No known column name at all: the header row is probably missing,
            # or the file is not in CSV format.
            self._problems.CsvSyntax(
                description="The header row did not contain any known column "
                "names. The file is most likely missing the header row "
                "or not in the expected CSV format.",
                context=(file_name, 1, [''] * len(raw_header), raw_header),
                type=problems.TYPE_ERROR)
        else:
            for col in unknown_cols:
                # this is provided in order to create a nice colored list of
                # columns in the validator output
                self._problems.UnrecognizedColumn(file_name, col,
                                                  header_context)

        # check for missing required columns
        missing_cols = set(required) - set(header)
        for col in missing_cols:
            # this is provided in order to create a nice colored list of
            # columns in the validator output
            self._problems.MissingColumn(file_name, col, header_context)

        # check for deprecated columns
        for (deprecated_name, new_name) in deprecated:
            if deprecated_name in header:
                self._problems.DeprecatedColumn(file_name, deprecated_name,
                                                new_name, header_context)

        line_num = 1  # First line read by reader.next() above
        for raw_row in reader:
            line_num += 1
            if len(raw_row) == 0:  # skip extra empty lines in file
                continue

            # Mismatched cell counts are warnings; the row is still processed.
            if len(raw_row) > len(raw_header):
                self._problems.OtherProblem(
                    'Found too many cells (commas) in line '
                    '%d of file "%s".  Every row in the file '
                    'should have the same number of cells as '
                    'the header (first line) does.' % (line_num, file_name),
                    (file_name, line_num),
                    type=problems.TYPE_WARNING)

            if len(raw_row) < len(raw_header):
                self._problems.OtherProblem(
                    'Found missing cells (commas) in line '
                    '%d of file "%s".  Every row in the file '
                    'should have the same number of cells as '
                    'the header (first line) does.' % (line_num, file_name),
                    (file_name, line_num),
                    type=problems.TYPE_WARNING)

            # raw_row is a list of raw bytes which should be valid utf-8. Convert each
            # valid_columns of raw_row into Unicode.
            valid_values = []
            unicode_error_columns = [
            ]  # index of valid_values elements with an error
            for i in valid_columns:
                try:
                    valid_values.append(raw_row[i].decode('utf-8'))
                except UnicodeDecodeError:
                    # Replace all invalid characters with REPLACEMENT CHARACTER (U+FFFD)
                    valid_values.append(
                        codecs.getdecoder("utf8")(raw_row[i],
                                                  errors="replace")[0])
                    unicode_error_columns.append(len(valid_values) - 1)
                except IndexError:
                    # Short row: remaining columns are simply absent from the
                    # dict (zip below truncates to the shorter sequence).
                    break

            # The error report may contain a dump of all values in valid_values so
            # problems can not be reported until after converting all of raw_row to
            # Unicode.
            for i in unicode_error_columns:
                self._problems.InvalidValue(
                    header[i], valid_values[i], 'Unicode error',
                    (file_name, line_num, valid_values, header))

            # We strip ALL whitespace from around values.  This matches the behavior
            # of both the Google and OneBusAway GTFS parser.
            valid_values = [value.strip() for value in valid_values]

            d = dict(zip(header, valid_values))
            yield (d, line_num, header, valid_values)