Code example #1
0
def parse_toc_with_dots_multiple_columns(plain_text: str) \
        -> List[Tuple[str, str, int]]:
    """Parse a table of contents whose lines may hold two columns.

    Each line is first tried against RE_TOC_WITH_DOTS; on a hit the match
    itself is collected and the text left/right of it goes into the left and
    right column buffers. Lines without a match are split at their widest
    run of whitespace instead. Both reconstructed columns are then parsed
    again as single-column TOCs and all results are concatenated.

    :param plain_text: raw extracted text of the TOC pages
    :return: list of (section id, title, page number) tuples
    """
    matches = []
    # Collect per-line fragments in lists and join once at the end instead
    # of repeated ``str +=`` (which is quadratic in the number of lines).
    left_parts: List[str] = []
    right_parts: List[str] = []

    for line in plain_text.split("\n"):
        # ``search`` returns the first match or None — clearer than
        # driving ``finditer`` with next()/StopIteration inside the loop.
        match = RE_TOC_WITH_DOTS.search(line)
        if match is not None:
            # ``entry_id`` avoids shadowing the builtin ``id``.
            lookbehind, entry_id, title, page = match.groups()
            matches.append((lookbehind, entry_id, title, page))
            left_parts.append(squash_whitespace(line[:match.start()]))
            right_parts.append(squash_whitespace(line[match.end():]))
        else:
            # NOTE: This is the weak point. We have tried counting in also
            # the surrounding page and section numbers.
            spaces = list(re.finditer(r"\s{2,}", line))
            if spaces:
                widest = max(spaces, key=lambda m: m.end() - m.start())
                left_parts.append(squash_whitespace(line[:widest.start()]))
                right_parts.append(squash_whitespace(line[widest.end():]))

    # Every collected fragment is newline-terminated, matching the
    # original per-line ``+ '\n'`` accumulation exactly.
    left_column = "".join(part + "\n" for part in left_parts)
    right_column = "".join(part + "\n" for part in right_parts)

    result = postprocess_matches(matches)
    result += parse_toc_with_dots(left_column)
    result += parse_toc_with_dots(right_column)
    return result
Code example #2
0
File: title_parser.py  Project: shoracek/PySer
def parse_dirty(plain_text: str) -> str:
    """Extract a document title from raw text.

    The input may contain unsquashed whitespace and trailing garbage.
    A sequence of increasingly generic heuristics is tried; the first
    one that hits wins. The returned string may still need squashing.
    """

    # A "title:" label that repeats often enough in the squashed text is
    # the strongest signal.
    m = re.search(r"title:?\s+([^\n]*)", plain_text,
                  re.IGNORECASE | re.MULTILINE)
    if m and squash_whitespace(plain_text).count(m.group(1)) > 5:
        return m.group(1)

    # Arbitrary limit, title after 1000 characters would be weird.
    plain_text = plain_text[:1000]

    # For documents starting with 4 numbers only.
    m = re.search(r"for\s\s+(.*?)\s\s+from", plain_text.replace("\n", " "),
                  re.MULTILINE)
    if m:
        return m.group(1)

    # e.g. NSCIB-CC-217812-CR2
    m = re.search(r"Version [0-9]+-[0-9]+\s*(([^\n]+\n)*)", plain_text,
                  re.MULTILINE)
    if m:
        return m.group(1)

    # e.g. nscib-cc-0229286sscdkeygen-stv1.2
    if "NXP " in plain_text:
        m = re.search(r"([^\n]+\n)+", plain_text, re.MULTILINE | re.DOTALL)
        if m:
            return m.group(0)

    # e.g. 0782V5b_pdf
    m = re.search(r"security target[^\n]*(.*)common criteria", plain_text,
                  re.MULTILINE | re.IGNORECASE | re.DOTALL)
    if m and len(squash_whitespace(m.group(1))) > 5:
        return m.group(1)

    # e.g. 1110V3b_pdf
    m = re.search(r"\n\n([^\n].+?\n)\n\n", plain_text,
                  re.MULTILINE | re.DOTALL)
    if m:
        return m.group(1)

    # Last resort.
    m = re.search(r"([^\n]+\n)*", plain_text, re.MULTILINE)
    if m:
        return m.group(0)

    return ""
Code example #3
0
File: title_parser.py  Project: shoracek/PySer
def parse(plain_text: str) -> str:
    """Return the cleaned-up document title parsed from *plain_text*."""
    raw = parse_dirty(plain_text)

    # Drop everything from "security target lite" onwards, but leave the
    # title alone when it *starts* with that phrase (index 0) — cutting
    # there would yield an empty title.
    cut = raw.lower().find("security target lite")
    if cut > 0:
        raw = raw[:cut]

    return squash_whitespace(raw)
Code example #4
0
def postprocess_match(match: Tuple[str, str, str, str]) \
        -> Optional[Tuple[str, str, int]]:
    """Convert one raw TOC regex match into an (id, title, page) tuple.

    Returns None when the lookbehind group is non-empty, i.e. the match
    is rejected as a false positive.
    """
    # Non-empty lookbehind text disqualifies the match.
    if match[0]:
        return None

    entry_id, title, page_str = (group.strip() for group in match[1:])
    # Re-join words hyphenated across line breaks, then squash whitespace.
    title = re.sub(r"([A-Za-z])-\s", r"\1", squash_whitespace(title))
    return entry_id, title, int(page_str)
Code example #5
0
File: versions_parser.py  Project: shoracek/PySer
def parse_sha(plain_text: str) -> List[str]:
    """Collect SHA algorithm mentions (e.g. SHA-256, SHA-1) from the text.

    Returns the squashed, deduplicated list of raw matches.
    """
    versions = "512|384|256|224|3|2|1"

    # Strip spaces first so variants broken up by spacing still match.
    compact = plain_text.replace(" ", "")
    hits = [
        squash_whitespace(hit)
        for hit in re.findall(
            rf"SHA[-_ ]?\n?(?:{versions})(?:[-/_ ](?:{versions}))?",
            compact, re.MULTILINE
        )
    ]
    return deduplicate_list(hits)
Code example #6
0
File: bibliography_parser.py  Project: shoracek/PySer
def parse(plain_text: str) -> Dict[str, str]:
    """Map bibliography reference tags (e.g. "[5]") to their definitions.

    Definitions are taken from the last occurrence of each tag, truncated
    to 250 characters and whitespace-squashed.
    """
    # Prefer numeric-style tags; fall back to any bracketed tag when the
    # numeric pattern finds too few candidates.
    references = set(re.findall(r"\[[0-9]*-?[0-9]*?\]", plain_text))
    if len(references) < 5:
        references = set(re.findall(r"\[.*?\]", plain_text))

    result: Dict[str, str] = {}
    for tag in references:
        definitions = re.findall(rf"{re.escape(tag)} +([^\[]*)", plain_text)
        if definitions:
            result[tag] = squash_whitespace(definitions[-1][:250])

    return result