def test_serialize_meta(self):
    # Populate every metadata field this test checks, serialize the LGR
    # into a fresh <meta> node, then read each child element back.
    meta = Metadata()
    meta.version = Version('1.0', comment='First version')
    meta.date = '2017-09-01'
    meta.description = Description('The LGR description',
                                   description_type='text/plain')
    meta.scopes = [Scope('.', scope_type='domain')]
    self.lgr.metadata = meta

    node = etree.SubElement(self.root, 'meta')
    _serialize_meta(self.lgr, node)

    # <version>: text plus its "comment" attribute
    version_elem = node.find('version', namespaces=NSMAP)
    self.assertEqual(version_elem.text, '1.0')
    # LXML can return strings as bytestring in python2...
    # See https://mailman-mail5.webfaction.com/pipermail/lxml/2011-December/006239.html
    self.assertEqual('' + version_elem.get('comment'), 'First version')

    # <date>
    date_elem = node.find('date', namespaces=NSMAP)
    self.assertEqual(date_elem.text, '2017-09-01')

    # <description>: text plus its "type" attribute
    description_elem = node.find('description', namespaces=NSMAP)
    self.assertEqual(description_elem.text, 'The LGR description')
    self.assertEqual(description_elem.get('type'), 'text/plain')

    # <scope>: exactly one element is expected
    scope_elems = node.findall('scope', namespaces=NSMAP)
    self.assertEqual(len(scope_elems), 1)
    only_scope = scope_elems[0]
    self.assertEqual(only_scope.text, '.')
    self.assertEqual(only_scope.get('type'), 'domain')
def _process_meta(self, elem):
    """
    Process the <meta> element of an LGR XML file.

    Iterates over the children of ``elem``, fills a new Metadata object
    and a new ReferenceManager from them, records whether a Unicode
    version element was present, and finally stores a new LGR built from
    both objects in ``self._lgr``.

    :param elem: The <meta> element; it is iterated to obtain its children.
    """
    metadata = Metadata(self.rfc7940_checks)
    reference_manager = ReferenceManager()

    # Tags whose text is handed as-is to a single Metadata setter.
    simple_setters = {
        DATE_TAG: lambda d: metadata.set_date(d, force=self.force_mode),
        VALIDITY_START_TAG: lambda d: metadata.set_validity_start(d, force=self.force_mode),
        VALIDITY_END_TAG: lambda d: metadata.set_validity_end(d, force=self.force_mode),
        UNICODE_VERSION_TAG: lambda d: metadata.set_unicode_version(d, force=self.force_mode),
    }

    unicode_version_tag_found = False
    for child in elem:
        tag = child.tag
        logger.debug("Got '%s' element", tag)
        if tag in simple_setters:
            simple_setters[tag](child.text)
            if tag == UNICODE_VERSION_TAG:
                unicode_version_tag_found = True
        elif tag == VERSION_TAG:
            metadata.version = Version(child.text,
                                       child.get('comment', None))
        elif tag == LANGUAGE_TAG:
            metadata.add_language(child.text, force=self.force_mode)
        elif tag == SCOPE_TAG:
            metadata.scopes.append(Scope(child.text,
                                         child.get('type', None)))
        elif tag == DESCRIPTION_TAG:
            # Seems to be an issue with CDATA/iterparse: https://bugs.launchpad.net/lxml/+bug/1788449
            # For now, manually replace CRLF with LF
            text = child.text
            # An element without content has no text (None): leave it
            # untouched instead of failing on None.replace().
            if text is not None:
                text = text.replace('\r\n', '\n')
            metadata.description = Description(text,
                                               child.get('type', None))
        elif tag == REFERENCES_TAG:
            for reference in child:
                value = reference.text
                # Don't convert it to an int since ref_id may be a string
                ref_id = reference.get('id')
                comment = reference.get('comment', None)
                reference_manager.add_reference(value, comment=comment,
                                                ref_id=ref_id)
        else:
            logger.warning("Unhandled '%s' element in <meta> section", tag)
            self.rfc7940_checks.error('parse_xml')
        # The child (including any <reference> sub-elements) has been
        # processed at this point, let's clean-up
        child.clear()

    self.rfc7940_checks.add_test_result('explicit_unicode_version',
                                        unicode_version_tag_found)
    self._lgr = LGR(name=self.filename,
                    metadata=metadata,
                    reference_manager=reference_manager,
                    unicode_database=self._unicode_database)
def create(cls, name, unicode_version, validating_repertoire_name):
    """
    Build a new instance of ``cls`` around a newly created LGR.

    The LGR gets metadata with version '1' and the given Unicode version,
    and its Unicode database is looked up from that same version.

    :param name: Name given to both the LGR and the returned object.
    :param unicode_version: Unicode version set on the metadata and used
                            to select the Unicode database.
    :param validating_repertoire_name: Name used to look up the validating
                                       repertoire; a falsy value means no
                                       validating repertoire.
    :return: The new ``cls`` instance.
    """
    meta = Metadata()
    meta.version = Version('1')
    meta.set_unicode_version(unicode_version)

    new_lgr = LGR(name, metadata=meta)
    new_lgr.unicode_database = unidb.manager.get_db_by_version(unicode_version)

    if validating_repertoire_name:
        repertoire = get_by_name(validating_repertoire_name)
    else:
        repertoire = None

    return cls(name, lgr=new_lgr, validating_repertoire=repertoire)
def union_version(first, second):
    """
    Combine two version objects into one.

    When either argument is None the other one is returned unchanged
    (which may itself be None). Otherwise the value and the comment are
    each selected through ``let_user_choose`` and wrapped in a new object.

    :param first: First version object to union.
    :param second: Other version object to union.
    :return: New object, or the only non-None argument.
    """
    # Nothing to combine when one side is missing
    if first is None or second is None:
        return second if first is None else first

    chosen_value = let_user_choose(first.value, second.value)
    chosen_comment = let_user_choose(first.comment, second.comment)
    return Version(chosen_value, chosen_comment)
def merge_version(lgr_set):
    """
    Merge versions from LGR set.

    Distinct version values and distinct comments are collected in order
    of first appearance and each joined with '|'. LGRs whose version is
    falsy are skipped, as are empty values and comments.

    :param lgr_set: The LGRs in the set
    :return: The merged version object
    """
    # Ordered dictionaries are used as insertion-ordered sets
    seen_values = OrderedDict()
    seen_comments = OrderedDict()

    for lgr in lgr_set:
        version = lgr.metadata.version
        if not version:
            continue
        if version.value:
            seen_values.setdefault(version.value)
        if version.comment:
            seen_comments.setdefault(version.comment)

    merged_value = '|'.join(seen_values)
    merged_comment = '|'.join(seen_comments)
    return Version(merged_value, merged_comment)
def test_serialize_meta_unicode(self):
    # Non-ASCII text placed in the version, its comment and the
    # description must be read back unchanged from the serialized
    # <meta> element.
    meta = Metadata()
    meta.version = Version('1.0 日本', comment='First version (はじめて)')
    meta.description = Description(
        'The LGR description containing Unicode characters: ΘΞΠ',
        description_type='text/plain')
    self.lgr.metadata = meta

    node = etree.SubElement(self.root, 'meta')
    _serialize_meta(self.lgr, node)

    version_elem = node.find('version', namespaces=NSMAP)
    self.assertEqual(version_elem.text, '1.0 日本')
    self.assertEqual(version_elem.get('comment'), 'First version (はじめて)')

    description_elem = node.find('description', namespaces=NSMAP)
    self.assertEqual(
        description_elem.text,
        'The LGR description containing Unicode characters: ΘΞΠ')
    self.assertEqual(description_elem.get('type'), 'text/plain')
def _parse_doc(self, rule_file): """ Actual parsing of document. :param rule_file: Content of the rule, as a file-like object. """ line_num = 0 for line in rule_file: line_num += 1 line = line.strip() if len(line) == 0: continue if line[0] == '#': continue reference = REFERENCE_RE.match(line) if reference is not None: ref_id = reference.group('ref_id') value = reference.group('value') comment = reference.group('comment') try: self._lgr.add_reference(value, ref_id=ref_id, comment=comment) except LGRException: logger.error("Invalid reference '%s' on line %d", line, line_num) continue version = VERSION_RE.match(line) if version is not None: version_no = version.group('version_no') date = version.group('date') comment = version.group('comment') try: self._lgr.metadata.version = Version(version_no, comment=comment) self._lgr.metadata.date = date except LGRException: logger.error("Invalid version '%s' on line %d", line, line_num) continue if UNICODE_CODEPOINT_RE.match(line) is None: logger.debug("Skipping non-parsable line %d:\n%s", line_num, line) # Line is not starting with a valid unicode code point, skip continue # Split base character from variant(s) char_variant = line.split(';') char = char_variant[0] try: [(codepoints, references)] = parse_char(char) self._lgr.add_cp(codepoints, ref=references) except ValueError: logger.error("Invalid character '%s' at line %d", char, line_num) except LGRException as exc: logger.error("Cannot add code point '%s' at line %d: %s", format_cp(codepoints), line_num, exc) if len(char_variant) > 1: preferred_variants = char_variant[1].strip() if len(preferred_variants ) > 0 and preferred_variants[0] != '#': # From RFC7940, Section 7.3. Recommended Disposition Values: # activated The resulting string should be activated for use. (This # is the same as a Preferred Variant [RFC3743].) 
var_type = "activated" self.insert_variant(line_num, codepoints, preferred_variants, var_type) if len(char_variant) > 2: variants = char_variant[2].strip() if len(variants) > 0 and variants[0] != '#': self.insert_variant(line_num, codepoints, variants)