Ejemplo n.º 1
0
def parse_meta(file_content, cable):
    """\
    Extracts the reference id, date/time of creation, the classification,
    and the origin of the cable and assigns the values to the provided `cable`.

    The metadata is read from the last ``<table class='cable'>`` element of
    `file_content`. If the text "Appears in these" occurs within that element,
    the media IRIs found between that text and the end of the table are
    assigned to `cable.media_uris`.

    `file_content`
        The HTML content of the cable page.
    `cable`
        The cable to update. Its `reference_id` is compared against the
        reference id found in the table.

    Returns the provided `cable`.

    Raises a `ValueError` if the table or its metadata cannot be found, or if
    the reference id of the table matches `cable.reference_id` neither
    directly nor through `MALFORMED_CABLE_IDS` / `INVALID_CABLE_IDS`.
    """
    end_idx = file_content.rindex("</table>")
    start_idx = file_content.rindex("<table class='cable'>", 0, end_idx)
    m = _META_PATTERN.search(file_content, start_idx, end_idx)
    if not m:
        raise ValueError('Cable table not found')
    groups = m.groups()
    if len(groups) != 4:
        # Wrap the tuple: a bare tuple on the right-hand side of `%` is
        # treated as the argument list and would raise a TypeError here.
        raise ValueError('Unexpected metadata result: "%r"' % (groups,))
    # Table content:
    # Reference ID | Created | Classification | Origin
    ref, created, classification, origin = groups
    expected_id = cable.reference_id
    # The page may show a malformed or invalid id; accept it if one of the
    # lookup tables maps it to the id of the cable.
    if (expected_id != ref
            and MALFORMED_CABLE_IDS.get(ref) != expected_id
            and INVALID_CABLE_IDS.get(ref) != expected_id):
        raise ValueError('cable.reference_id != ref. reference_id="%s", ref="%s"' % (expected_id, ref))
    cable.created = created
    cable.origin = origin
    # classifications are usually written in upper case, but you never know..
    cable.classification = classification.upper()
    # Try to find media IRIs
    media_idx = file_content.rfind(u'Appears in these', start_idx, end_idx)
    # A hit cannot be at index 0 since the search starts at the table tag,
    # so this is the same as testing for "found".
    if media_idx > 0:
        cable.media_uris = _MEDIA_URLS_PATTERN.findall(file_content, media_idx, end_idx)
    return cable
Ejemplo n.º 2
0
def parse_meta(file_content, cable):
    """\
    Extracts the reference id, date/time of creation, the classification,
    and the origin of the cable and assigns the values to the provided `cable`.

    Only the last ``<table class='cable'>`` element of `file_content` is
    inspected. If that element contains the text "Appears in these", the
    media IRIs found from there up to the end of the table are assigned to
    `cable.media_uris`.

    `file_content`
        The HTML content of the cable page.
    `cable`
        The cable to update. Its `reference_id` is checked against the
        reference id found in the table.

    Returns the provided `cable`.

    Raises a `ValueError` if the table or its metadata cannot be found, or if
    the reference id of the table matches `cable.reference_id` neither
    directly nor through `MALFORMED_CABLE_IDS` / `INVALID_CABLE_IDS`.
    """
    end_idx = file_content.rindex("</table>")
    start_idx = file_content.rindex("<table class='cable'>", 0, end_idx)
    m = _META_PATTERN.search(file_content, start_idx, end_idx)
    if not m:
        raise ValueError('Cable table not found')
    meta = m.groups()
    if len(meta) != 4:
        # The tuple must be wrapped, otherwise `%` uses its items as the
        # format arguments and fails with a TypeError.
        raise ValueError('Unexpected metadata result: "%r"' % (meta,))
    # Table content:
    # Reference ID | Created | Classification | Origin
    ref, created, classification, origin = meta
    cable_id = cable.reference_id
    id_matches = (
        cable_id == ref
        # The table may show a malformed or invalid id which is mapped to
        # the id of the cable by one of the lookup tables.
        or MALFORMED_CABLE_IDS.get(ref) == cable_id
        or INVALID_CABLE_IDS.get(ref) == cable_id)
    if not id_matches:
        raise ValueError(
            'cable.reference_id != ref. reference_id="%s", ref="%s"' %
            (cable_id, ref))
    cable.created = created
    cable.origin = origin
    # classifications are usually written in upper case, but you never know..
    cable.classification = classification.upper()
    # Try to find media IRIs
    media_idx = file_content.rfind(u'Appears in these', start_idx, end_idx)
    # The search starts at the table tag, so a hit is never at index 0.
    if media_idx > 0:
        cable.media_uris = _MEDIA_URLS_PATTERN.findall(file_content, media_idx,
                                                       end_idx)
    return cable
Ejemplo n.º 3
0
def canonicalize_id(reference_id):
    """\
    Returns the canonical form of the provided `reference_id`.

    Some cable identifiers provided by WikiLeaks are malformed. For such an
    identifier the valid equivalent is returned; a valid identifier is
    returned unchanged.

    Note: The canonical form is not necessarily a valid WikiLeaks identifier.
    Usually both are identical, but a malformed identifier like
    "09SECTION01OF03SANJOSE525" is canonicalized to "09SANJOSE525".

    `reference_id`
        The cable identifier to canonicalize
    """
    # Identifiers which contain "EMBASSY" are fixed by dropping that part
    if u'EMBASSY' in reference_id:
        return reference_id.replace(u'EMBASSY', u'')
    match = _C14N_PATTERN.match(reference_id)
    if match is None:
        # No known origin fix applies: consult the lookup tables, the
        # malformed ids take precedence over the invalid ones.
        fallback = INVALID_CABLE_IDS.get(reference_id, reference_id)
        return MALFORMED_CABLE_IDS.get(reference_id, fallback)
    wrong_origin = match.group(1)
    return reference_id.replace(wrong_origin, _C14N_FIXES[wrong_origin])
Ejemplo n.º 4
0
def canonicalize_id(reference_id):
    """\
    Converts the provided `reference_id` into its canonical form.

    WikiLeaks provides some malformed cable identifiers. A `reference_id`
    which is not valid is translated into the valid equivalent, a valid one
    is returned as it is.

    Note: The result may not be a valid WikiLeaks identifier anymore. In most
    cases it is identical to the WikiLeaks identifier, but for malformed
    identifiers like "09SECTION01OF03SANJOSE525" it differs (the result is
    "09SANJOSE525").

    `reference_id`
        The cable identifier to canonicalize
    """
    embassy = u'EMBASSY'
    if embassy in reference_id:
        # Drop every occurrence of the superfluous "EMBASSY" part
        return reference_id.replace(embassy, u'')
    found = _C14N_PATTERN.match(reference_id)
    if found:
        bad_origin = found.group(1)
        good_origin = _C14N_FIXES[bad_origin]
        return reference_id.replace(bad_origin, good_origin)
    # Malformed ids take precedence over invalid ids; unknown ids are
    # returned unchanged.
    invalid_fix = INVALID_CABLE_IDS.get(reference_id, reference_id)
    return MALFORMED_CABLE_IDS.get(reference_id, invalid_fix)
Ejemplo n.º 5
0
"""\
This module reports malformed cable ids.
"""
import os
import re
from cablemap.core.consts import REFERENCE_ID_PATTERN, MALFORMED_CABLE_IDS, INVALID_CABLE_IDS


def find_malformed_ids(in_dir):
    """\
    Returns a dict which maps malformed reference ids to file paths.

    The directory `in_dir` is walked recursively. The name of every file
    which ends with ".html" (minus that extension) is treated as reference id
    and reported if it does not match `REFERENCE_ID_PATTERN`.

    `in_dir`
        The directory to search.
    """
    suffix = '.html'
    malformed = {}
    for root, _, files in os.walk(in_dir):
        for name in files:
            # Test the suffix, not a substring: a name like "x.html.bak"
            # would otherwise be reported with the bogus id "x.html".
            if not name.endswith(suffix):
                continue
            reference_id = name[:-len(suffix)]
            if not REFERENCE_ID_PATTERN.match(reference_id):
                malformed[reference_id] = os.path.join(root, name)
    return malformed


if __name__ == '__main__':
    # Compares the malformed ids found below ./cable/ with the ids which are
    # registered in MALFORMED_CABLE_IDS / INVALID_CABLE_IDS and prints every
    # id which occurs in only one of both sets.
    # (`os` comes from the module-level import; no further imports needed.)
    if not os.path.isdir('./cable/'):
        raise Exception('Expected a directory "cable"')
    current = set(MALFORMED_CABLE_IDS) | set(INVALID_CABLE_IDS)
    dct = find_malformed_ids('./cable/')
    diff = set(dct) ^ current
    if diff:
        print('difference: %r' % diff)
        for ref in diff:
            # The path is None for ids which are registered but not found
            print('%s: %s' % (ref, dct.get(ref)))
Ejemplo n.º 6
0
def _get_test_cases():
    """\
    Returns an iterator over the (key, value) pairs of `INVALID_CABLE_IDS`.
    """
    # `dict.iteritems()` exists in Python 2 only; `iter(d.items())` provides
    # an iterator under Python 2 and Python 3.
    return iter(INVALID_CABLE_IDS.items())
Ejemplo n.º 7
0
def test_c14n_illegal_ids():
    """\
    Yields one check per entry of `INVALID_CABLE_IDS`.

    Each check compares (via `eq_`) the canonical form of the value of the
    entry with the canonical form of its key.
    """
    def check(incorrect_id, correct_id):
        eq_(canonicalize_id(correct_id), canonicalize_id(incorrect_id))
    # `items()` works under Python 2 and 3, `iteritems()` is Python 2 only
    for incorrect_id, correct_id in INVALID_CABLE_IDS.items():
        yield check, incorrect_id, correct_id
Ejemplo n.º 8
0
def test_c14n_illegal_ids():
    """\
    Yields a (check, incorrect_id, correct_id) tuple for every entry of
    `INVALID_CABLE_IDS`.

    The check compares (via `eq_`) the canonicalized `correct_id` with the
    canonicalized `incorrect_id`.
    """
    def check(incorrect_id, correct_id):
        eq_(canonicalize_id(correct_id), canonicalize_id(incorrect_id))

    # `iteritems()` is not available under Python 3, `items()` works for both
    for incorrect_id, correct_id in INVALID_CABLE_IDS.items():
        yield check, incorrect_id, correct_id
Ejemplo n.º 9
0
def _get_test_cases():
    """\
    Provides the (key, value) pairs of `INVALID_CABLE_IDS` as an iterator.
    """
    # Python 3 has no `dict.iteritems()`; wrapping `items()` in `iter()`
    # returns an iterator under both Python versions.
    return iter(INVALID_CABLE_IDS.items())
Ejemplo n.º 10
0
# -*- coding: utf-8 -*-
"""\
This module reports malformed cable ids.
"""
import os
import re
from cablemap.core.consts import REFERENCE_ID_PATTERN, MALFORMED_CABLE_IDS, INVALID_CABLE_IDS

def find_malformed_ids(in_dir):
    """\
    Collects the reference ids below `in_dir` which are malformed.

    Walks `in_dir` recursively and derives a reference id from each file
    name which ends with ".html" by removing that extension. Returns a dict
    which maps every id that does not match `REFERENCE_ID_PATTERN` to the
    path of its file.

    `in_dir`
        The directory to search.
    """
    extension = '.html'
    dct = {}
    for root, _, files in os.walk(in_dir):
        # Require the extension at the end of the name; a substring test
        # would accept "x.html.bak" and report the bogus id "x.html".
        for name in (n for n in files if n.endswith(extension)):
            reference_id = name[:-len(extension)]
            if not REFERENCE_ID_PATTERN.match(reference_id):
                dct[reference_id] = os.path.join(root, name)
    return dct
    
if __name__ == '__main__':
    # Reports every id which is either found below ./cable/ but not
    # registered in MALFORMED_CABLE_IDS / INVALID_CABLE_IDS, or registered
    # but not found below ./cable/.
    # (`os` is imported at module level already; nothing else is needed.)
    if not os.path.isdir('./cable/'):
        raise Exception('Expected a directory "cable"')
    current = set(MALFORMED_CABLE_IDS) | set(INVALID_CABLE_IDS)
    dct = find_malformed_ids('./cable/')
    diff = set(dct) ^ current
    if diff:
        print('difference: %r' % diff)
        for ref in diff:
            # Registered ids which were not found have no path (None)
            print('%s: %s' % (ref, dct.get(ref)))