コード例 #1
0
ファイル: reader.py プロジェクト: Tooa/cablemap
def parse_meta(file_content, cable):
    """\
    Extracts the reference id, date/time of creation, the classification,
    and the origin of the cable and assigns the values to the provided `cable`.

    The metadata is searched between the last occurrence of
    ``<table class='cable'>`` and the last ``</table>`` of `file_content`.
    If that region contains the text "Appears in these", all matches of the
    media URL pattern from that text up to the end of the table are assigned
    to `cable.media_uris`; otherwise `cable.media_uris` is left untouched.

    `file_content`
        The text to parse; it must contain the cable table markup.
    `cable`
        The object to update. Its `reference_id` is read and compared
        with the reference id found in the table; `created`, `origin`
        and `classification` are written.

    Returns the provided `cable`.

    Raises a `ValueError` if the table markup or the metadata cannot be
    found, or if the reference id found in the table neither equals
    `cable.reference_id` nor is mapped to it by `MALFORMED_CABLE_IDS` or
    `INVALID_CABLE_IDS`.
    """
    # str.rindex raises ValueError itself if the markup is missing.
    end_idx = file_content.rindex("</table>")
    start_idx = file_content.rindex("<table class='cable'>", 0, end_idx)
    m = _META_PATTERN.search(file_content, start_idx, end_idx)
    if not m:
        raise ValueError('Cable table not found')
    groups = m.groups()
    if len(groups) != 4:
        # The tuple must be wrapped: a bare tuple on the right-hand side of
        # ``%`` is unpacked as the argument list, which would raise a
        # TypeError here instead of the intended ValueError.
        raise ValueError('Unexpected metadata result: "%r"' % (groups,))
    # Table content:
    # Reference ID | Created | Classification | Origin
    ref, created, classification, origin = groups
    if cable.reference_id != ref:
        # The table may carry a known-bad identifier; accept it only if one
        # of the correction tables maps it to the expected reference id.
        if (MALFORMED_CABLE_IDS.get(ref) != cable.reference_id
                and INVALID_CABLE_IDS.get(ref) != cable.reference_id):
            raise ValueError(
                'cable.reference_id != ref. reference_id="%s", ref="%s"'
                % (cable.reference_id, ref))
    cable.created = created
    cable.origin = origin
    # classifications are usually written in upper case, but you never know..
    cable.classification = classification.upper()
    # Try to find media IRIs
    media_idx = file_content.rfind(u'Appears in these', start_idx, end_idx)
    # str.rfind signals "not found" with -1.
    if media_idx != -1:
        cable.media_uris = _MEDIA_URLS_PATTERN.findall(file_content,
                                                       media_idx, end_idx)
    return cable
コード例 #2
0
def parse_meta(file_content, cable):
    """\
    Extracts the reference id, date/time of creation, the classification,
    and the origin of the cable and assigns the values to the provided `cable`.

    The metadata is searched between the last occurrence of
    ``<table class='cable'>`` and the last ``</table>`` of `file_content`.
    If that region contains the text "Appears in these", all matches of the
    media URL pattern from that text up to the end of the table are assigned
    to `cable.media_uris`; otherwise `cable.media_uris` is left untouched.

    `file_content`
        The text to parse; it must contain the cable table markup.
    `cable`
        The object to update. Its `reference_id` is read and compared
        with the reference id found in the table; `created`, `origin`
        and `classification` are written.

    Returns the provided `cable`.

    Raises a `ValueError` if the table markup or the metadata cannot be
    found, or if the reference id found in the table neither equals
    `cable.reference_id` nor is mapped to it by `MALFORMED_CABLE_IDS` or
    `INVALID_CABLE_IDS`.
    """
    # str.rindex raises ValueError itself if the markup is missing.
    end_idx = file_content.rindex("</table>")
    start_idx = file_content.rindex("<table class='cable'>", 0, end_idx)
    m = _META_PATTERN.search(file_content, start_idx, end_idx)
    if not m:
        raise ValueError('Cable table not found')
    groups = m.groups()
    if len(groups) != 4:
        # The tuple must be wrapped: a bare tuple on the right-hand side of
        # ``%`` is unpacked as the argument list, which would raise a
        # TypeError here instead of the intended ValueError.
        raise ValueError('Unexpected metadata result: "%r"' % (groups,))
    # Table content:
    # Reference ID | Created | Classification | Origin
    ref, created, classification, origin = groups
    if cable.reference_id != ref:
        # The table may carry a known-bad identifier; accept it only if one
        # of the correction tables maps it to the expected reference id.
        if (MALFORMED_CABLE_IDS.get(ref) != cable.reference_id
                and INVALID_CABLE_IDS.get(ref) != cable.reference_id):
            raise ValueError(
                'cable.reference_id != ref. reference_id="%s", ref="%s"'
                % (cable.reference_id, ref))
    cable.created = created
    cable.origin = origin
    # classifications are usually written in upper case, but you never know..
    cable.classification = classification.upper()
    # Try to find media IRIs
    media_idx = file_content.rfind(u'Appears in these', start_idx, end_idx)
    # str.rfind signals "not found" with -1.
    if media_idx != -1:
        cable.media_uris = _MEDIA_URLS_PATTERN.findall(file_content,
                                                       media_idx, end_idx)
    return cable
コード例 #3
0
ファイル: reader.py プロジェクト: Tooa/cablemap
def canonicalize_id(reference_id):
    """\
    Maps the provided `reference_id` to its canonical form.

    Some cable identifiers published by WikiLeaks are malformed. The
    canonical form is determined as follows (first rule that applies wins):

    1. If the identifier contains "EMBASSY", every occurrence of that
       text is removed.
    2. If the identifier matches the canonicalization pattern, the origin
       captured by the pattern is replaced with its entry in the table of
       origin fixes.
    3. Otherwise the identifier is looked up in the table of malformed
       identifiers, then in the table of invalid identifiers; if neither
       knows it, it is returned unchanged.

    Note: The result is not necessarily a valid WikiLeaks identifier
    anymore, i.e. "09SECTION01OF03SANJOSE525" is meant to become
    "09SANJOSE525".

    `reference_id`
        The cable identifier to canonicalize
    """
    embassy = u'EMBASSY'
    if embassy in reference_id:
        return reference_id.replace(embassy, u'')
    origin_match = _C14N_PATTERN.match(reference_id)
    if origin_match is not None:
        wrong_origin = origin_match.group(1)
        fixed_origin = _C14N_FIXES[wrong_origin]
        return reference_id.replace(wrong_origin, fixed_origin)
    # The invalid-id table (or the identifier itself) is only the fallback
    # for identifiers which are unknown to the malformed-id table.
    fallback = INVALID_CABLE_IDS.get(reference_id, reference_id)
    return MALFORMED_CABLE_IDS.get(reference_id, fallback)
コード例 #4
0
def canonicalize_id(reference_id):
    """\
    Maps the provided `reference_id` to its canonical form.

    Some cable identifiers published by WikiLeaks are malformed. The
    canonical form is determined as follows (first rule that applies wins):

    1. If the identifier contains "EMBASSY", every occurrence of that
       text is removed.
    2. If the identifier matches the canonicalization pattern, the origin
       captured by the pattern is replaced with its entry in the table of
       origin fixes.
    3. Otherwise the identifier is looked up in the table of malformed
       identifiers, then in the table of invalid identifiers; if neither
       knows it, it is returned unchanged.

    Note: The result is not necessarily a valid WikiLeaks identifier
    anymore, i.e. "09SECTION01OF03SANJOSE525" is meant to become
    "09SANJOSE525".

    `reference_id`
        The cable identifier to canonicalize
    """
    embassy = u'EMBASSY'
    if embassy in reference_id:
        return reference_id.replace(embassy, u'')
    origin_match = _C14N_PATTERN.match(reference_id)
    if origin_match is not None:
        wrong_origin = origin_match.group(1)
        fixed_origin = _C14N_FIXES[wrong_origin]
        return reference_id.replace(wrong_origin, fixed_origin)
    # The invalid-id table (or the identifier itself) is only the fallback
    # for identifiers which are unknown to the malformed-id table.
    fallback = INVALID_CABLE_IDS.get(reference_id, reference_id)
    return MALFORMED_CABLE_IDS.get(reference_id, fallback)