def parse_toc_with_dots_multiple_columns(plain_text: str) \
        -> List[Tuple[str, str, int]]:
    """Parse a table of contents laid out in two dotted columns.

    Lines containing a full dotted-TOC match are collected directly;
    the text to the left and right of each match (or of the widest
    whitespace gap on non-matching lines) is accumulated into two
    synthetic single-column texts that are re-parsed afterwards.
    """
    matches = []
    left_parts = []
    right_parts = []
    for line in plain_text.split("\n"):
        hit = next(RE_TOC_WITH_DOTS.finditer(line), None)
        if hit is not None:
            matches.append(hit.groups())
            left_parts.append(squash_whitespace(line[:hit.start()]) + '\n')
            right_parts.append(squash_whitespace(line[hit.end():]) + '\n')
            continue
        # NOTE: This is the weak point. We have tried counting in also
        # the surrounding page and section numbers.
        gaps = list(re.finditer(r"\s{2,}", line))
        if gaps:
            widest = max(gaps, key=lambda g: g.end() - g.start())
            left_parts.append(squash_whitespace(line[:widest.start()]) + '\n')
            right_parts.append(squash_whitespace(line[widest.end():]) + '\n')
    result = postprocess_matches(matches)
    result += parse_toc_with_dots("".join(left_parts))
    result += parse_toc_with_dots("".join(right_parts))
    return result
def parse_dirty(plain_text: str) -> str:
    """
    Parse the title with potentially unsquashed whitespace and trailing
    garbage.

    Tries a sequence of increasingly generic heuristics against the raw
    text and returns the first plausible candidate, or "" when nothing
    matched.  (Fix: the local previously named ``iter`` shadowed the
    builtin; renamed to ``match``.)
    """
    # Explicit "title:" label anywhere in the document.
    match = re.search(r"title:?\s+([^\n]*)", plain_text,
                      re.IGNORECASE | re.MULTILINE)
    if match:
        # Accept only when the candidate recurs often enough in the
        # squashed text — guards against a one-off phrase.
        potential_title_count = squash_whitespace(plain_text).count(
            match.group(1))
        if potential_title_count > 5:
            return match.group(1)

    # Arbitrary limit, title after 1000 characters would be weird.
    plain_text = plain_text[:1000]

    # For documents starting with 4 numbers only.
    match = re.search(r"for\s\s+(.*?)\s\s+from",
                      plain_text.replace("\n", " "), re.MULTILINE)
    if match:
        return match.group(1)

    # e.g. NSCIB-CC-217812-CR2
    match = re.search(r"Version [0-9]+-[0-9]+\s*(([^\n]+\n)*)", plain_text,
                      re.MULTILINE)
    if match:
        return match.group(1)

    # e.g. nscib-cc-0229286sscdkeygen-stv1.2
    if "NXP " in plain_text:
        match = re.search(r"([^\n]+\n)+", plain_text,
                          re.MULTILINE | re.DOTALL)
        if match:
            return match.group(0)

    # e.g. 0782V5b_pdf
    match = re.search(r"security target[^\n]*(.*)common criteria",
                      plain_text, re.MULTILINE | re.IGNORECASE | re.DOTALL)
    if match and len(squash_whitespace(match.group(1))) > 5:
        return match.group(1)

    # e.g. 1110V3b_pdf
    match = re.search(r"\n\n([^\n].+?\n)\n\n", plain_text,
                      re.MULTILINE | re.DOTALL)
    if match:
        return match.group(1)

    # Last resort.
    match = re.search(r"([^\n]+\n)*", plain_text, re.MULTILINE)
    if match:
        return match.group(0)
    return ""
def parse(plain_text: str) -> str:
    """Extract and normalise the document title from raw text."""
    raw = parse_dirty(plain_text)
    cut = raw.lower().find("security target lite")
    # Drop everything from the marker onward, but only when the marker
    # is not at the very start of the candidate title.
    trimmed = raw[:cut] if cut > 0 else raw
    return squash_whitespace(trimmed)
def postprocess_match(match: Tuple[str, str, str, str]) \
        -> Optional[Tuple[str, str, int]]:
    """Clean up one raw TOC regex match.

    Returns ``(id, title, page)`` with whitespace squashed and
    end-of-line hyphenation removed, or ``None`` when the lookbehind
    group (``match[0]``) is non-empty and the match is rejected.
    (Fix: renamed local ``id``, which shadowed the builtin, and
    simplified the non-empty check to plain truthiness.)
    """
    if match[0]:
        return None
    entry_id, title, page_str = tuple(group.strip() for group in match[1:])
    title = squash_whitespace(title)
    # Re-join words hyphenated across a line break: "exam- ple" -> "example".
    title = re.sub(r"([A-Za-z])-\s", r"\1", title)
    return entry_id, title, int(page_str)
def parse_sha(plain_text: str) -> List[str]:
    """Find SHA algorithm mentions (e.g. SHA-256, SHA_1) in the text.

    Returns the deduplicated list of normalised matches.
    (Fix: replaced the ``for i in range(len(found))`` in-place mutation
    anti-pattern with a comprehension.)
    """
    versions = "512|384|256|224|3|2|1"
    # Strip spaces first so spaced-out variants still match.
    plain_text = plain_text.replace(" ", "")
    found = re.findall(
        rf"SHA[-_ ]?\n?(?:{versions})(?:[-/_ ](?:{versions}))?",
        plain_text,
        re.MULTILINE
    )
    # A match may still span a newline; squash it away before deduplicating.
    return deduplicate_list([squash_whitespace(m) for m in found])
def parse(plain_text: str) -> Dict[str, str]:
    """Map bracketed reference labels to their (truncated) definitions.

    Prefers numeric-looking labels like "[1]" or "[2-3]"; falls back to
    any bracketed token when fewer than five numeric ones are found.
    """
    refs = set(re.findall(r"\[[0-9]*-?[0-9]*?\]", plain_text))
    if len(refs) < 5:
        refs = set(re.findall(r"\[.*?\]", plain_text))
    result: Dict[str, str] = {}
    for ref in refs:
        definitions = re.findall(rf"{re.escape(ref)} +([^\[]*)", plain_text)
        if definitions:
            # Keep the last occurrence, capped at 250 characters.
            result[ref] = squash_whitespace(definitions[-1][:250])
    return result