Example #1
0
def find_malformed_ids(in_dir):
    """\
    Returns a dict of malformed reference identifiers found below `in_dir`.

    The directory tree is walked recursively. For every file whose name ends
    with ``.html`` the name without that suffix is taken as reference
    identifier; identifiers which are not matched by `REFERENCE_ID_PATTERN`
    are collected.

    `in_dir`
        The directory to scan.

    The returned dict maps the malformed identifier to the path of the file.
    If the same identifier occurs in several directories, the path that was
    visited last wins.
    """
    suffix = '.html'
    malformed = {}
    for root, _dirs, files in os.walk(in_dir):
        for name in files:
            # Require a real ".html" suffix: a substring test would also pick
            # up names like "x.html.bak" and report the bogus id "x.html".
            if not name.endswith(suffix):
                continue
            reference_id = name[:-len(suffix)]
            if not REFERENCE_ID_PATTERN.match(reference_id):
                malformed[reference_id] = os.path.join(root, name)
    return malformed
Example #2
0
def find_malformed_ids(in_dir):
    """\
    Returns a dict of malformed reference identifiers found below `in_dir`.

    The directory tree is walked recursively. For every file whose name ends
    with ``.html`` the name without that suffix is taken as reference
    identifier; identifiers which are not matched by `REFERENCE_ID_PATTERN`
    are collected.

    `in_dir`
        The directory to scan.

    The returned dict maps the malformed identifier to the path of the file.
    If the same identifier occurs in several directories, the path that was
    visited last wins.
    """
    suffix = '.html'
    malformed = {}
    for root, _dirs, files in os.walk(in_dir):
        for name in files:
            # Require a real ".html" suffix: a substring test would also pick
            # up names like "x.html.bak" and report the bogus id "x.html".
            if not name.endswith(suffix):
                continue
            reference_id = name[:-len(suffix)]
            if not REFERENCE_ID_PATTERN.match(reference_id):
                malformed[reference_id] = os.path.join(root, name)
    return malformed
Example #3
0
def parse_references(content, year, reference_id=None, canonicalize=True):
    """\
    Returns the references to other cables as (maybe empty) list.

    The search starts after the first match of `_REF_OFFSET_PATTERN` (if
    any) and is limited to the text before the first match of
    `_REF_STOP_PATTERN` or, failing that, to `_MAX_HEADER_IDX`. References
    equal to `reference_id` and duplicates are not added to the result.

    `content`
        The content of the cable.
    `year`
        The year when the cable was created.
    `reference_id`
        The reference identifier of the cable.
    `canonicalize`
        Indicates if the cable reference origin should be canonicalized.
        (enabled by default)
    """
    from cablemap.core.models import Reference

    # Candidates containing one of these substrings are known noise; they are
    # dropped without emitting a debug message.
    noise_tokens = (
        'CORRUPTION', 'ECRET', 'PARISPOINT', 'TELCON', 'FORTHE', 'ZOCT',
        'ZSEP', 'ZMAY', 'ZNOV', 'ZAUG', 'PRIORITY', 'ZJAN', 'ZFEB', 'ZJUN',
        'ZJUL', 'PREVIO', 'SEPTEMBER', 'ZAPR', 'PART', 'ONFIDENTIAL',
        'SECRET', 'SECTION', 'TODAY', 'DAILY', 'OUTOF', 'PROVIDING',
        'NUMBER', 'APRIL', 'OCTOBER', 'MAIL', 'DECEMBER', 'FEBRUAY',
        'AUGUST', 'MARCH', 'JULY', 'JUNE', 'JANUARY', '--', 'PARAGRAPH',
        'ANDPREVIOUS', 'UNCLAS', 'ONMARCH', 'ONAPRIL', 'FEBRUARY', 'ONMAY',
        'ONJULY', 'ONJUNE', 'NOVEMBER', 'CONFIDENTIAL',
    )

    def format_year(y):
        # Normalizes a year to the short form: an empty value falls back to
        # the cable's year, "2009" becomes "09" and "009" becomes "09".
        y = str(y)
        if not y:
            y = str(year)
        if len(y) == 4:
            return y[2:]
        elif len(y) == 3 and y[0] == '0':
            return y[1:]
        return y

    offset = 0
    m_offset = _REF_OFFSET_PATTERN.search(content)
    if m_offset:
        offset = m_offset.end()
    # 1. Try to find "Classified By:"
    m_stop = _REF_STOP_PATTERN.search(content, offset)
    # If found, use it as maximum index to search for references, otherwise
    # use a constant. Note: a match at index 0 also falls back to the
    # constant since ``0 or x`` evaluates to ``x``.
    max_idx = m_stop and m_stop.start() or _MAX_HEADER_IDX
    # 2. Find references
    m_start = _REF_START_PATTERN.search(content, offset, max_idx)
    last_end = m_start.end() if m_start else 0
    # 3. Check if we have a paragraph in the references
    m_stop = _REF_NOT_REF_PATTERN.search(content, last_end, max_idx)
    # 4. Find the next max_idx
    max_idx = min(m_stop and m_stop.start() or _MAX_HEADER_IDX, max_idx)
    # Advance `last_end` to the end of the last match within the window.
    # The loop only terminates with a falsy `m_end`, so remember separately
    # whether anything matched at all.
    found_end = False
    m_end = _REF_LAST_REF_PATTERN.search(content, last_end, max_idx)
    while m_end:
        found_end = True
        last_end = m_end.end()
        m_end = _REF_LAST_REF_PATTERN.search(content, last_end, max_idx)
    res = []
    if found_end and not m_start:
        logger.warning('Found ref end but no start in "%s", content: "%s"',
                       reference_id, content)
    if m_start and last_end:
        start = m_start.start(1)
        refs = content[start:last_end].replace('\n', ' ')
        refs = _CLEAN_REFS_PATTERN.sub('', refs)
        for enum, y, origin, sn, alt_year in _REF_PATTERN.findall(refs):
            if alt_year and not y:
                y = alt_year
            y = format_year(y)
            origin = origin.replace(' ', '').replace(u"'", u'').upper()
            if origin == 'AND' and res and res[-1].is_cable():
                # A bare "AND" inherits the origin of the previous reference
                origin = _REF_ORIGIN_PATTERN.match(res[-1].value).group(1)
                enum = enum or res[-1].name
            elif origin.startswith('AND') and res and res[-1].is_cable():
                # for references like 09 FOO 1234 AND BAR 1234
                origin = origin[3:]
                enum = enum or res[-1].name
            reference = u'%s%s%d' % (y, origin, int(sn))
            if canonicalize:
                reference = canonicalize_id(reference)
            # constants.MIN_ORIGIN_LENGTH + constants.MIN_SERIAL_LENGTH
            # + length of year or constants.MAX_ORIGIN_LENGTH
            # + constants.MAX_SERIAL_LENGTH + 2 (for the year)
            if not 7 <= len(reference) <= 25:
                continue
            if not REFERENCE_ID_PATTERN.match(reference):
                if not any(token in reference for token in noise_tokens):
                    logger.debug('Ignore "%s". Not a valid reference identifier (%s)',
                                 reference, reference_id)
                continue
            if reference != reference_id:
                reference = Reference(reference, consts.REF_KIND_CABLE, enum)
                if reference not in res:
                    res.append(reference)
    return res
Example #4
0
def parse_references(content, year, reference_id=None, canonicalize=True):
    """\
    Returns the references to other cables as (maybe empty) list.

    The search starts after the first match of `_REF_OFFSET_PATTERN` (if
    any) and is limited to the text before the first match of
    `_REF_STOP_PATTERN` or, failing that, to `_MAX_HEADER_IDX`. References
    equal to `reference_id` and duplicates are not added to the result.

    `content`
        The content of the cable.
    `year`
        The year when the cable was created.
    `reference_id`
        The reference identifier of the cable.
    `canonicalize`
        Indicates if the cable reference origin should be canonicalized.
        (enabled by default)
    """
    from cablemap.core.models import Reference

    # Candidates containing one of these substrings are known noise; they are
    # dropped without emitting a debug message.
    noise_tokens = (
        'CORRUPTION', 'ECRET', 'PARISPOINT', 'TELCON', 'FORTHE', 'ZOCT',
        'ZSEP', 'ZMAY', 'ZNOV', 'ZAUG', 'PRIORITY', 'ZJAN', 'ZFEB', 'ZJUN',
        'ZJUL', 'PREVIO', 'SEPTEMBER', 'ZAPR', 'PART', 'ONFIDENTIAL',
        'SECRET', 'SECTION', 'TODAY', 'DAILY', 'OUTOF', 'PROVIDING',
        'NUMBER', 'APRIL', 'OCTOBER', 'MAIL', 'DECEMBER', 'FEBRUAY',
        'AUGUST', 'MARCH', 'JULY', 'JUNE', 'JANUARY', '--', 'PARAGRAPH',
        'ANDPREVIOUS', 'UNCLAS', 'ONMARCH', 'ONAPRIL', 'FEBRUARY', 'ONMAY',
        'ONJULY', 'ONJUNE', 'NOVEMBER', 'CONFIDENTIAL',
    )

    def format_year(y):
        # Normalizes a year to the short form: an empty value falls back to
        # the cable's year, "2009" becomes "09" and "009" becomes "09".
        y = str(y)
        if not y:
            y = str(year)
        if len(y) == 4:
            return y[2:]
        elif len(y) == 3 and y[0] == '0':
            return y[1:]
        return y

    offset = 0
    m_offset = _REF_OFFSET_PATTERN.search(content)
    if m_offset:
        offset = m_offset.end()
    # 1. Try to find "Classified By:"
    m_stop = _REF_STOP_PATTERN.search(content, offset)
    # If found, use it as maximum index to search for references, otherwise
    # use a constant. Note: a match at index 0 also falls back to the
    # constant since ``0 or x`` evaluates to ``x``.
    max_idx = m_stop and m_stop.start() or _MAX_HEADER_IDX
    # 2. Find references
    m_start = _REF_START_PATTERN.search(content, offset, max_idx)
    last_end = m_start.end() if m_start else 0
    # 3. Check if we have a paragraph in the references
    m_stop = _REF_NOT_REF_PATTERN.search(content, last_end, max_idx)
    # 4. Find the next max_idx
    max_idx = min(m_stop and m_stop.start() or _MAX_HEADER_IDX, max_idx)
    # Advance `last_end` to the end of the last match within the window.
    # The loop only terminates with a falsy `m_end`, so remember separately
    # whether anything matched at all.
    found_end = False
    m_end = _REF_LAST_REF_PATTERN.search(content, last_end, max_idx)
    while m_end:
        found_end = True
        last_end = m_end.end()
        m_end = _REF_LAST_REF_PATTERN.search(content, last_end, max_idx)
    res = []
    if found_end and not m_start:
        logger.warning('Found ref end but no start in "%s", content: "%s"',
                       reference_id, content)
    if m_start and last_end:
        start = m_start.start(1)
        refs = content[start:last_end].replace('\n', ' ')
        refs = _CLEAN_REFS_PATTERN.sub('', refs)
        for enum, y, origin, sn, alt_year in _REF_PATTERN.findall(refs):
            if alt_year and not y:
                y = alt_year
            y = format_year(y)
            origin = origin.replace(' ', '').replace(u"'", u'').upper()
            # NOTE(review): both branches below fall back to the previous
            # reference's `value` for the enumeration label, although `enum`
            # is passed to `Reference` as a separate third argument.
            # Presumably the previous reference's label was intended --
            # confirm against the `Reference` model before changing.
            if origin == 'AND' and res and res[-1].is_cable():
                # A bare "AND" inherits the origin of the previous reference
                origin = _REF_ORIGIN_PATTERN.match(res[-1].value).group(1)
                enum = enum or res[-1].value
            elif origin.startswith('AND') and res and res[-1].is_cable():
                # for references like 09 FOO 1234 AND BAR 1234
                origin = origin[3:]
                enum = enum or res[-1].value
            reference = u'%s%s%d' % (y, origin, int(sn))
            if canonicalize:
                reference = canonicalize_id(reference)
            # constants.MIN_ORIGIN_LENGTH + constants.MIN_SERIAL_LENGTH
            # + length of year or constants.MAX_ORIGIN_LENGTH
            # + constants.MAX_SERIAL_LENGTH + 2 (for the year)
            if not 7 <= len(reference) <= 25:
                continue
            if not REFERENCE_ID_PATTERN.match(reference):
                if not any(token in reference for token in noise_tokens):
                    logger.debug(
                        'Ignore "%s". Not a valid reference identifier (%s)',
                        reference, reference_id)
                continue
            if reference != reference_id:
                reference = Reference(reference, consts.REF_KIND_CABLE, enum)
                if reference not in res:
                    res.append(reference)
    return res