def find_malformed_ids(in_dir):
    """\
    Returns a dict of malformed reference identifiers found below `in_dir`.

    The directory tree is walked recursively. For every file whose name ends
    with ``.html`` the name without that extension is taken as reference
    identifier and checked against ``REFERENCE_ID_PATTERN``.

    `in_dir`
        The directory to scan.

    The result maps each identifier which does not match the pattern to the
    path of the file it was derived from. If the same identifier is found
    more than once, the path found last wins.
    """
    extension = '.html'
    malformed = {}
    for root, _dirs, files in os.walk(in_dir):
        for name in files:
            # Suffix test (not a substring test): names like "x.html.bak"
            # are no cable files and must not yield the bogus id "x.html".
            if not name.endswith(extension):
                continue
            reference_id = name[:-len(extension)]
            if not REFERENCE_ID_PATTERN.match(reference_id):
                malformed[reference_id] = os.path.join(root, name)
    return malformed
def parse_references(content, year, reference_id=None, canonicalize=True):
    """\
    Returns the references to other cables as (maybe empty) list.

    The references are searched in a limited window of the content: the
    window starts behind the match of ``_REF_OFFSET_PATTERN`` (if any) and
    ends at the match of ``_REF_STOP_PATTERN`` or ``_REF_NOT_REF_PATTERN``,
    but never extends beyond ``_MAX_HEADER_IDX`` once step 4 is done.

    `content`
        The content of the cable.
    `year`
        The year when the cable was created. It is used for references
        which do not provide a year themselves.
    `reference_id`
        The reference identifier of the cable. References which are equal
        to this identifier are not part of the result.
    `canonicalize`
        Indicates if the cable reference origin should be canonicalized.
        (enabled by default)
    """
    # NOTE(review): this file contains a second ``parse_references``
    # definition further down; being defined later, that one is the
    # definition callers of the module get.
    from cablemap.core.models import Reference

    def format_year(y):
        """Returns the year as two-digit string (falls back to `year`)."""
        y = str(y)
        if not y:
            y = str(year)
        if len(y) == 4:
            return y[2:]
        elif len(y) == 3 and y[0] == '0':
            return y[1:]
        return y

    # Substrings which identify well-known false positives. Invalid
    # references which contain one of them are dropped without logging.
    known_noise = (
        'CORRUPTION', 'ECRET', 'PARISPOINT', 'TELCON', 'FORTHE', 'ZOCT',
        'ZSEP', 'ZMAY', 'ZNOV', 'ZAUG', 'PRIORITY', 'ZJAN', 'ZFEB', 'ZJUN',
        'ZJUL', 'PREVIO', 'SEPTEMBER', 'ZAPR', 'PART', 'ONFIDENTIAL',
        'SECRET', 'SECTION', 'TODAY', 'DAILY', 'OUTOF', 'PROVIDING',
        'NUMBER', 'APRIL', 'OCTOBER', 'MAIL', 'DECEMBER', 'FEBRUAY',
        'AUGUST', 'MARCH', 'JULY', 'JUNE', 'JANUARY', '--', 'PARAGRAPH',
        'ANDPREVIOUS', 'UNCLAS', 'ONMARCH', 'ONAPRIL', 'FEBRUARY', 'ONMAY',
        'ONJULY', 'ONJUNE', 'NOVEMBER', 'CONFIDENTIAL',
    )

    offset = 0
    m_offset = _REF_OFFSET_PATTERN.search(content)
    if m_offset:
        offset = m_offset.end()
    # 1. Try to find "Classified By:"
    m_stop = _REF_STOP_PATTERN.search(content, offset)
    # If found, use it as maximum index to search for references, otherwise
    # use a constant. A conditional expression is used on purpose: the
    # ``a and b or c`` idiom picks the constant if the match starts at 0.
    max_idx = m_stop.start() if m_stop else _MAX_HEADER_IDX
    # 2. Find references
    m_start = _REF_START_PATTERN.search(content, offset, max_idx)
    last_end = m_start.end() if m_start else 0
    # 3. Check if we have a paragraph in the references
    m_stop = _REF_NOT_REF_PATTERN.search(content, last_end, max_idx)
    # 4. Find the next max_idx
    max_idx = min(m_stop.start() if m_stop else _MAX_HEADER_IDX, max_idx)
    # Advance ``last_end`` behind the last match of _REF_LAST_REF_PATTERN.
    # ``m_end`` is always None when the loop is left, so remember
    # separately whether anything matched.
    found_end = False
    m_end = _REF_LAST_REF_PATTERN.search(content, last_end, max_idx)
    while m_end:
        found_end = True
        last_end = m_end.end()
        m_end = _REF_LAST_REF_PATTERN.search(content, last_end, max_idx)
    res = []
    if found_end and not m_start:
        logger.warning('Found ref end but no start in "%s", content: "%s"',
                       reference_id, content)
    if m_start and last_end:
        start = m_start.start(1)
        refs = content[start:last_end].replace('\n', ' ')
        refs = _CLEAN_REFS_PATTERN.sub('', refs)
        for enum, y, origin, sn, alt_year in _REF_PATTERN.findall(refs):
            if alt_year and not y:
                y = alt_year
            y = format_year(y)
            origin = origin.replace(' ', '').replace(u"'", u'').upper()
            if origin == 'AND' and res and res[-1].is_cable():
                # A bare "AND": reuse origin and enumerator of the
                # previous cable reference
                origin = _REF_ORIGIN_PATTERN.match(res[-1].value).group(1)
                enum = enum or res[-1].name
            elif origin.startswith('AND') and res and res[-1].is_cable():
                # for references like 09 FOO 1234 AND BAR 1234
                origin = origin[3:]
                enum = enum or res[-1].name
            reference = u'%s%s%d' % (y, origin, int(sn))
            if canonicalize:
                reference = canonicalize_id(reference)
            # constants.MIN_ORIGIN_LENGTH + constants.MIN_SERIAL_LENGTH
            # + length of year or constants.MAX_ORIGIN_LENGTH
            # + constants.MAX_SERIAL_LENGTH + 2 (for the year)
            if not 7 <= len(reference) <= 25:
                continue
            if not REFERENCE_ID_PATTERN.match(reference):
                # Invalid identifiers are always skipped; only those which
                # are not known noise are worth a log message.
                if not any(token in reference for token in known_noise):
                    logger.debug('Ignore "%s". Not a valid reference identifier (%s)',
                                 reference, reference_id)
                continue
            if reference != reference_id:
                reference = Reference(reference, consts.REF_KIND_CABLE, enum)
                if reference not in res:
                    res.append(reference)
    return res
def parse_references(content, year, reference_id=None, canonicalize=True):
    """\
    Returns the references to other cables as (maybe empty) list.

    The references are searched in a limited window of the content: the
    window starts behind the match of ``_REF_OFFSET_PATTERN`` (if any) and
    ends at the match of ``_REF_STOP_PATTERN`` or ``_REF_NOT_REF_PATTERN``,
    but never extends beyond ``_MAX_HEADER_IDX`` once step 4 is done.

    `content`
        The content of the cable.
    `year`
        The year when the cable was created. It is used for references
        which do not provide a year themselves.
    `reference_id`
        The reference identifier of the cable. References which are equal
        to this identifier are not part of the result.
    `canonicalize`
        Indicates if the cable reference origin should be canonicalized.
        (enabled by default)
    """
    # NOTE(review): an earlier ``parse_references`` definition precedes
    # this one in the file; being defined later, this is the definition
    # callers of the module get.
    from cablemap.core.models import Reference

    def format_year(y):
        """Returns the year as two-digit string (falls back to `year`)."""
        y = str(y)
        if not y:
            y = str(year)
        if len(y) == 4:
            return y[2:]
        elif len(y) == 3 and y[0] == '0':
            return y[1:]
        return y

    # Substrings which identify well-known false positives. Invalid
    # references which contain one of them are dropped without logging.
    known_noise = (
        'CORRUPTION', 'ECRET', 'PARISPOINT', 'TELCON', 'FORTHE', 'ZOCT',
        'ZSEP', 'ZMAY', 'ZNOV', 'ZAUG', 'PRIORITY', 'ZJAN', 'ZFEB', 'ZJUN',
        'ZJUL', 'PREVIO', 'SEPTEMBER', 'ZAPR', 'PART', 'ONFIDENTIAL',
        'SECRET', 'SECTION', 'TODAY', 'DAILY', 'OUTOF', 'PROVIDING',
        'NUMBER', 'APRIL', 'OCTOBER', 'MAIL', 'DECEMBER', 'FEBRUAY',
        'AUGUST', 'MARCH', 'JULY', 'JUNE', 'JANUARY', '--', 'PARAGRAPH',
        'ANDPREVIOUS', 'UNCLAS', 'ONMARCH', 'ONAPRIL', 'FEBRUARY', 'ONMAY',
        'ONJULY', 'ONJUNE', 'NOVEMBER', 'CONFIDENTIAL',
    )

    offset = 0
    m_offset = _REF_OFFSET_PATTERN.search(content)
    if m_offset:
        offset = m_offset.end()
    # 1. Try to find "Classified By:"
    m_stop = _REF_STOP_PATTERN.search(content, offset)
    # If found, use it as maximum index to search for references, otherwise
    # use a constant. A conditional expression is used on purpose: the
    # ``a and b or c`` idiom picks the constant if the match starts at 0.
    max_idx = m_stop.start() if m_stop else _MAX_HEADER_IDX
    # 2. Find references
    m_start = _REF_START_PATTERN.search(content, offset, max_idx)
    last_end = m_start.end() if m_start else 0
    # 3. Check if we have a paragraph in the references
    m_stop = _REF_NOT_REF_PATTERN.search(content, last_end, max_idx)
    # 4. Find the next max_idx
    max_idx = min(m_stop.start() if m_stop else _MAX_HEADER_IDX, max_idx)
    # Advance ``last_end`` behind the last match of _REF_LAST_REF_PATTERN.
    # ``m_end`` is always None when the loop is left, so remember
    # separately whether anything matched.
    found_end = False
    m_end = _REF_LAST_REF_PATTERN.search(content, last_end, max_idx)
    while m_end:
        found_end = True
        last_end = m_end.end()
        m_end = _REF_LAST_REF_PATTERN.search(content, last_end, max_idx)
    res = []
    if found_end and not m_start:
        logger.warning('Found ref end but no start in "%s", content: "%s"',
                       reference_id, content)
    if m_start and last_end:
        start = m_start.start(1)
        refs = content[start:last_end].replace('\n', ' ')
        refs = _CLEAN_REFS_PATTERN.sub('', refs)
        for enum, y, origin, sn, alt_year in _REF_PATTERN.findall(refs):
            if alt_year and not y:
                y = alt_year
            y = format_year(y)
            origin = origin.replace(' ', '').replace(u"'", u'').upper()
            # NOTE(review): the enumerator is inherited from the previous
            # reference via ``.name``. This code used ``.value`` (the
            # reference id) before, which does not fit the enumerator
            # argument of ``Reference``; ``.name`` is presumably the
            # attribute for the third constructor argument -- confirm
            # against cablemap.core.models.
            if origin == 'AND' and res and res[-1].is_cable():
                # A bare "AND": reuse origin and enumerator of the
                # previous cable reference
                origin = _REF_ORIGIN_PATTERN.match(res[-1].value).group(1)
                enum = enum or res[-1].name
            elif origin.startswith('AND') and res and res[-1].is_cable():
                # for references like 09 FOO 1234 AND BAR 1234
                origin = origin[3:]
                enum = enum or res[-1].name
            reference = u'%s%s%d' % (y, origin, int(sn))
            if canonicalize:
                reference = canonicalize_id(reference)
            # constants.MIN_ORIGIN_LENGTH + constants.MIN_SERIAL_LENGTH
            # + length of year or constants.MAX_ORIGIN_LENGTH
            # + constants.MAX_SERIAL_LENGTH + 2 (for the year)
            if not 7 <= len(reference) <= 25:
                continue
            if not REFERENCE_ID_PATTERN.match(reference):
                # Invalid identifiers are always skipped; only those which
                # are not known noise are worth a log message.
                if not any(token in reference for token in known_noise):
                    logger.debug('Ignore "%s". Not a valid reference identifier (%s)',
                                 reference, reference_id)
                continue
            if reference != reference_id:
                reference = Reference(reference, consts.REF_KIND_CABLE, enum)
                if reference not in res:
                    res.append(reference)
    return res