def parse_meta(file_content, cable):
    """\
    Extracts the reference id, date/time of creation, the classification,
    and the origin of the cable and assigns the values to the provided `cable`.

    The metadata is searched between the last ``</table>`` of `file_content`
    and the last ``<table class='cable'>`` that precedes it.

    `file_content`
        The content of the cable page (a string).
    `cable`
        The cable object to update. Its `reference_id` is compared against
        the reference id found in the table; `created`, `origin`,
        `classification` (upper-cased) and, if an "Appears in these" section
        is found, `media_uris` are assigned.

    Returns the provided `cable`.

    Raises a `ValueError` if the table markers or the metadata cannot be
    found, if the metadata does not consist of exactly four values, or if
    the reference id of the table neither equals `cable.reference_id` nor
    maps to it via `MALFORMED_CABLE_IDS` / `INVALID_CABLE_IDS`.
    """
    # str.rindex raises ValueError itself if the markers are missing
    end_idx = file_content.rindex("</table>")
    start_idx = file_content.rindex("<table class='cable'>", 0, end_idx)
    m = _META_PATTERN.search(file_content, start_idx, end_idx)
    if not m:
        raise ValueError('Cable table not found')
    groups = m.groups()
    if len(groups) != 4:
        # The tuple must be wrapped into a 1-tuple; passing it directly to
        # the % operator would spread it over the format arguments and
        # raise a TypeError instead of the intended ValueError.
        raise ValueError('Unexpected metadata result: "%r"' % (groups,))
    # Table content:
    # Reference ID | Created | Classification | Origin
    ref, created, classification, origin = groups
    if cable.reference_id != ref:
        # The page may show a malformed / invalid id; accept it only if one
        # of the correction tables maps it to the cable's reference id.
        if (MALFORMED_CABLE_IDS.get(ref) != cable.reference_id
                and INVALID_CABLE_IDS.get(ref) != cable.reference_id):
            raise ValueError(
                'cable.reference_id != ref. reference_id="%s", ref="%s"'
                % (cable.reference_id, ref))
    cable.created = created
    cable.origin = origin
    # classifications are usually written in upper case, but you never know..
    cable.classification = classification.upper()
    # Try to find media IRIs
    media_idx = file_content.rfind(u'Appears in these', start_idx, end_idx)
    # str.rfind signals "not found" with -1
    if media_idx != -1:
        cable.media_uris = _MEDIA_URLS_PATTERN.findall(file_content,
                                                       media_idx, end_idx)
    return cable
def parse_meta(file_content, cable):
    """\
    Extracts the reference id, date/time of creation, the classification,
    and the origin of the cable and assigns the values to the provided `cable`.

    The metadata is searched between the last ``</table>`` of `file_content`
    and the last ``<table class='cable'>`` that precedes it.

    `file_content`
        The content of the cable page (a string).
    `cable`
        The cable object to update. Its `reference_id` is compared against
        the reference id found in the table; `created`, `origin`,
        `classification` (upper-cased) and, if an "Appears in these" section
        is found, `media_uris` are assigned.

    Returns the provided `cable`.

    Raises a `ValueError` if the table markers or the metadata cannot be
    found, if the metadata does not consist of exactly four values, or if
    the reference id of the table neither equals `cable.reference_id` nor
    maps to it via `MALFORMED_CABLE_IDS` / `INVALID_CABLE_IDS`.
    """
    # str.rindex raises ValueError itself if the markers are missing
    end_idx = file_content.rindex("</table>")
    start_idx = file_content.rindex("<table class='cable'>", 0, end_idx)
    m = _META_PATTERN.search(file_content, start_idx, end_idx)
    if not m:
        raise ValueError('Cable table not found')
    groups = m.groups()
    if len(groups) != 4:
        # The tuple must be wrapped into a 1-tuple; passing it directly to
        # the % operator would spread it over the format arguments and
        # raise a TypeError instead of the intended ValueError.
        raise ValueError('Unexpected metadata result: "%r"' % (groups,))
    # Table content:
    # Reference ID | Created | Classification | Origin
    ref, created, classification, origin = groups
    if cable.reference_id != ref:
        # The page may show a malformed / invalid id; accept it only if one
        # of the correction tables maps it to the cable's reference id.
        if (MALFORMED_CABLE_IDS.get(ref) != cable.reference_id
                and INVALID_CABLE_IDS.get(ref) != cable.reference_id):
            raise ValueError(
                'cable.reference_id != ref. reference_id="%s", ref="%s"'
                % (cable.reference_id, ref))
    cable.created = created
    cable.origin = origin
    # classifications are usually written in upper case, but you never know..
    cable.classification = classification.upper()
    # Try to find media IRIs
    media_idx = file_content.rfind(u'Appears in these', start_idx, end_idx)
    # str.rfind signals "not found" with -1
    if media_idx != -1:
        cable.media_uris = _MEDIA_URLS_PATTERN.findall(file_content,
                                                       media_idx, end_idx)
    return cable
def canonicalize_id(reference_id):
    """\
    Returns the canonical form of the provided cable reference identifier.

    WikiLeaks provides some malformed cable identifiers; for those the valid
    equivalent is returned. Identifiers which need no correction are
    returned unchanged.

    The corrections are tried in this order:

    1. If the identifier contains ``EMBASSY``, every occurrence of it is
       removed and the result is returned.
    2. If `_C14N_PATTERN` matches, its first group (the origin) is replaced
       by the corresponding `_C14N_FIXES` entry.
    3. Otherwise the identifier is looked up in `MALFORMED_CABLE_IDS`, then
       in `INVALID_CABLE_IDS`; if neither knows it, it is returned as is.

    Note: The canonical form may not be a valid WikiLeaks identifier anymore,
    i.e. "09SECTION01OF03SANJOSE525" becomes "09SANJOSE525".

    `reference_id`
        The cable identifier to canonicalize
    """
    embassy = u'EMBASSY'
    if embassy in reference_id:
        return reference_id.replace(embassy, u'')
    hit = _C14N_PATTERN.match(reference_id)
    if not hit:
        # No origin fix applies: consult the correction tables and fall back
        # to the identifier itself.
        fallback = INVALID_CABLE_IDS.get(reference_id, reference_id)
        return MALFORMED_CABLE_IDS.get(reference_id, fallback)
    wrong_origin = hit.group(1)
    return reference_id.replace(wrong_origin, _C14N_FIXES[wrong_origin])
def canonicalize_id(reference_id):
    """\
    Returns the canonical form of the provided cable reference identifier.

    WikiLeaks provides some malformed cable identifiers; for those the valid
    equivalent is returned. Identifiers which need no correction are
    returned unchanged.

    The corrections are tried in this order:

    1. If the identifier contains ``EMBASSY``, every occurrence of it is
       removed and the result is returned.
    2. If `_C14N_PATTERN` matches, its first group (the origin) is replaced
       by the corresponding `_C14N_FIXES` entry.
    3. Otherwise the identifier is looked up in `MALFORMED_CABLE_IDS`, then
       in `INVALID_CABLE_IDS`; if neither knows it, it is returned as is.

    Note: The canonical form may not be a valid WikiLeaks identifier anymore,
    i.e. "09SECTION01OF03SANJOSE525" becomes "09SANJOSE525".

    `reference_id`
        The cable identifier to canonicalize
    """
    embassy = u'EMBASSY'
    if embassy in reference_id:
        return reference_id.replace(embassy, u'')
    hit = _C14N_PATTERN.match(reference_id)
    if not hit:
        # No origin fix applies: consult the correction tables and fall back
        # to the identifier itself.
        fallback = INVALID_CABLE_IDS.get(reference_id, reference_id)
        return MALFORMED_CABLE_IDS.get(reference_id, fallback)
    wrong_origin = hit.group(1)
    return reference_id.replace(wrong_origin, _C14N_FIXES[wrong_origin])