# Example #1
# 0
def parse_textregion(text_region_dict: dict) -> PageXMLTextRegion:
    """Turn a PageXML TextRegion dictionary into a PageXMLTextRegion doc.

    Child TextLine/TextRegion entries are parsed recursively; when the
    region has no explicit Coords, coords are derived from whichever
    child type is encountered first in the dict's key order.
    """
    region = PageXMLTextRegion(
        doc_id=text_region_dict.get('@id'),
        orientation=(float(text_region_dict['@orientation'])
                     if '@orientation' in text_region_dict else None),
        coords=(parse_coords(text_region_dict['Coords'])
                if 'Coords' in text_region_dict else None),
        metadata=(parse_custom_metadata(text_region_dict)
                  if '@custom' in text_region_dict else None),
    )
    if region.metadata and 'type' in region.metadata:
        region.add_type(region.metadata['type'])
    # Walk children in document (dict key) order, matching the original
    # precedence for deriving missing coords.
    for key in text_region_dict:
        if key == 'TextLine':
            child = text_region_dict['TextLine']
            if isinstance(child, list):
                region.lines = parse_textline_list(child)
            else:
                region.lines = [parse_textline(child)]
            if not region.coords:
                region.coords = parse_derived_coords(region.lines)
        if key == 'TextRegion':
            child = text_region_dict['TextRegion']
            if isinstance(child, list):
                region.text_regions = parse_textregion_list(child)
            else:
                region.text_regions = [parse_textregion(child)]
            if not region.coords:
                region.coords = parse_derived_coords(region.text_regions)
    return region
def get_split_columns(
        split: Dict[str, any],
        first_paragraph_num_lines: int) -> Tuple[List[dict], List[dict]]:
    """Split the paragraph's columns at a line-count boundary.

    The first `first_paragraph_num_lines` lines (counted across all of the
    paragraph's columns, in order) go to the first list, the rest to the
    second. Columns/text regions straddling the boundary are split in two,
    with coords re-derived from the lines each half keeps. All returned
    structures are deep copies; the input is not modified.
    """
    lines_seen = 0
    head_columns: List[dict] = []
    tail_columns: List[dict] = []
    for column in split['resolution'].columns:
        # skip columns that do not belong to this paragraph
        if column['metadata']['id'] not in split['paragraph'].column_ids:
            continue
        column = copy.deepcopy(column)
        head_trs: List[dict] = []
        tail_trs: List[dict] = []
        for tr in column['textregions']:
            head_lines, tail_lines = [], []
            for line in tr['lines']:
                lines_seen += 1
                bucket = head_lines if lines_seen <= first_paragraph_num_lines else tail_lines
                bucket.append(line)
            if not tail_lines:
                head_trs.append(tr)
            elif not head_lines:
                tail_trs.append(tr)
            else:
                # boundary falls inside this text region: split it in two
                tail_tr = copy.deepcopy(tr)
                tr['lines'] = head_lines
                tr['coords'] = parse_derived_coords(head_lines)
                tail_tr['lines'] = tail_lines
                tail_tr['coords'] = parse_derived_coords(tail_lines)
                head_trs.append(tr)
                tail_trs.append(tail_tr)
        if not tail_trs:
            head_columns.append(column)
        elif not head_trs:
            tail_columns.append(column)
        else:
            # boundary falls inside this column: split it in two
            tail_column = copy.deepcopy(column)
            column['textregions'] = head_trs
            column['coords'] = parse_derived_coords(head_trs)
            tail_column['textregions'] = tail_trs
            tail_column['coords'] = parse_derived_coords(tail_trs)
            head_columns.append(column)
            tail_columns.append(tail_column)
    return head_columns, tail_columns
def parse_republic_pagexml_file(pagexml_file: str) -> PageXMLScan:
    """Parse a REPUBLIC PageXML file and attach its scan metadata.

    Derives scan coords from the text regions when the XML supplies none.
    On a parsing error the offending filename is printed and the original
    exception is re-raised.
    """
    try:
        scan_doc = pagexml_parser.parse_pagexml_file(pagexml_file)
        metadata = file_parser.get_republic_scan_metadata(pagexml_file)
        for field in metadata:
            scan_doc.metadata[field] = metadata[field]
        # Attribute-style check, consistent with get_scan_pagexml; the old
        # `'coords' not in scan_doc` relied on dict-style membership on a
        # document object. Only derive when there are regions to derive from.
        if not scan_doc.coords and scan_doc.text_regions:
            scan_doc.coords = parse_derived_coords(scan_doc.text_regions)
        return scan_doc
    except (AssertionError, KeyError, TypeError, ValueError):
        print(f"Error parsing file {pagexml_file}")
        raise
def reconstruct_columns(resolution: Resolution) -> List[Dict[str, any]]:
    """Rebuild the resolution's columns keeping only paragraph lines.

    Works on deep copies, so the resolution's own columns are untouched.
    Text regions and columns left empty after filtering are dropped, and
    coords are re-derived only for the regions/columns that are kept
    (the original also called parse_derived_coords on empty lists that
    were about to be discarded).
    """
    # Set of every line id covered by a paragraph: O(1) membership tests
    # instead of a linear scan per line.
    line_ids = set()
    for para in resolution.paragraphs:
        for line_range in para.line_ranges:
            line_ids.add(line_range['line_id'])
    new_columns: List[Dict[str, any]] = []
    for column in resolution.columns:
        new_column = copy.deepcopy(column)
        for tr in new_column['textregions']:
            new_lines = [
                line for line in tr['lines']
                if line['metadata']['id'] in line_ids
            ]
            tr['lines'] = new_lines
            # Empty regions are filtered out below; no coords needed.
            if new_lines:
                tr['coords'] = parse_derived_coords(new_lines)
        new_column['textregions'] = [
            tr for tr in new_column['textregions'] if len(tr['lines']) > 0
        ]
        if len(new_column['textregions']) > 0:
            new_column['coords'] = parse_derived_coords(
                new_column['textregions'])
            new_columns.append(new_column)
    return new_columns
def generate_session_doc(session_metadata: dict,
                         session_lines: List[PageXMLTextLine],
                         session_searcher: SessionSearcher,
                         column_metadata: Dict[str, dict]) -> Session:
    """Assemble a Session document from its metadata and text lines.

    Lines are grouped into text regions by their 'column_id' metadata,
    with coords derived per group. The session's line count is recorded
    on the searcher's latest session info entry, and sessions whose next
    matched date skips more than one workday are flagged as 'multi_day'.

    NOTE(review): mutates session_metadata in place (pops 'evidence').
    """
    evidence = session_metadata['evidence']
    del session_metadata['evidence']
    text_region_lines = defaultdict(list)
    text_regions: List[PageXMLTextRegion] = []
    # Group lines per column id; each group becomes one text region.
    for line in session_lines:
        text_region_id = line.metadata['column_id']
        text_region_lines[text_region_id].append(line)
    for text_region_id in text_region_lines:
        metadata = column_metadata[text_region_id]
        coords = parse_derived_coords(text_region_lines[text_region_id])
        text_region = PageXMLTextRegion(
            doc_id=text_region_id,
            metadata=metadata,
            coords=coords,
            lines=text_region_lines[text_region_id])
        text_regions.append(text_region)
    session = Session(metadata=session_metadata,
                      text_regions=text_regions,
                      evidence=evidence)
    # session.add_page_text_region_metadata(column_metadata)
    # add number of lines to session info in session searcher
    session_info = session_searcher.sessions[
        session_metadata['session_date']][-1]
    session_info['num_lines'] = len(session_lines)
    # Rest days and unmatched dates need no date-shift check.
    if session.date.is_rest_day(
    ) or not session_searcher.has_session_date_match():
        return session
    # Check if the next session date is more than 1 workday ahead
    date_match = session_searcher.get_session_date_match()
    new_date = derive_date_from_string(date_match.phrase.phrase_string,
                                       session_searcher.year)
    if session.date.isoformat() == new_date.isoformat():
        # print('SAME DAY:', session_searcher.current_date.isoformat(), '\t', session.date.isoformat())
        return session
    workday_shift = calculate_work_day_shift(new_date, session.date)
    # print('workday_shift:', workday_shift)
    if workday_shift > 1:
        print('MEETING DOC IS MULTI DAY')
        session.metadata['date_shift_status'] = 'multi_day'
    return session
def make_paragraph_line_annotations(paragraph: RepublicParagraph, doc_text_offset: int,
                                    line_index: Dict[str, PageXMLTextLine]) -> List[Dict[str, any]]:
    """Create text_region and line annotations for a paragraph.

    Line ranges are grouped per column id; each group yields one
    text_region annotation followed by one annotation per line. All
    offsets are doc_text_offset plus the paragraph-internal start/end.
    """
    annotations: List[Dict[str, any]] = []
    column_ranges = defaultdict(list)
    for line_range in paragraph.line_ranges:
        column_id = line_index[line_range['line_id']].metadata['column_id']
        column_ranges[column_id].append(line_range)
    for column_id, ranges in column_ranges.items():
        lines = [line_index[line_range['line_id']] for line_range in ranges]
        coords = parse_derived_coords(lines)
        first_line = lines[0]
        # Derived text-region id encodes the bounding box of the group.
        tr_id = first_line.metadata['scan_id'] + f"-text_region-{coords.x}-{coords.y}-{coords.w}-{coords.h}"
        annotations.append({
            'id': tr_id,
            'type': 'text_region',
            'coords': coords.points,
            'start_offset': doc_text_offset + ranges[0]['start'],
            'end_offset': doc_text_offset + ranges[-1]['end'],
            'metadata': {
                'para_id': paragraph.metadata['id'],
                'scan_id': first_line.metadata['scan_id']
            }
        })
        for line_range, line in zip(ranges, lines):
            annotations.append({
                'id': line_range['line_id'],
                'type': 'line',
                'start_offset': doc_text_offset + line_range['start'],
                'end_offset': doc_text_offset + line_range['end'],
                "metadata": {
                    'text_region_id': tr_id,
                    'para_id': paragraph.metadata['id'],
                    'scan_id': line.metadata['scan_id']
                },
                "coords": line.coords.points
            })
    return annotations
def get_scan_pagexml(pagexml_file: str,
                     inventory_config: dict,
                     pagexml_data: Union[str, None] = None) -> PageXMLScan:
    """Parse a PageXML scan, enrich it with REPUBLIC metadata and classify it.

    Region types are mapped onto header/footer/extra/main roles, scan
    metadata fields are copied onto the document, and the scan is
    classified by its right-edge coordinate (empty/single/double/special).
    Re-raises parsing errors after printing the offending filename.
    """
    # print('Parsing file', pagexml_file)
    try:
        scan_doc = pagexml_parser.parse_pagexml_file(pagexml_file,
                                                     pagexml_data=pagexml_data)
        scan_doc.reading_order = {}
    except (AssertionError, KeyError, TypeError):
        print('Error parsing file', pagexml_file)
        raise
    if not scan_doc.coords and scan_doc.text_regions:
        # add scan coordinates if they're not in the XML
        scan_doc.coords = parse_derived_coords(scan_doc.text_regions)
    # Map PageXML region types onto layout roles.
    for text_region in scan_doc.text_regions:
        if text_region.types.intersection({'date', 'page-number'}):
            text_region.add_type(['header', 'extra'])
        elif text_region.types.intersection({'catch-word', 'signature-mark'}):
            text_region.add_type(['footer', 'extra'])
        elif text_region.types.intersection({'separator'}):
            text_region.add_type(['extra'])
        elif text_region.types.intersection({'index', 'respect',
                                             'resolution'}):
            text_region.add_type(['main'])
    metadata = file_parser.get_republic_scan_metadata(pagexml_file)
    scan_doc.id = metadata['id']
    for field in metadata:
        scan_doc.metadata[field] = metadata[field]
    # Classify by scan width. A scan with no coords at all (no text
    # regions to derive them from) is treated as empty instead of
    # crashing on `scan_doc.coords.right`.
    if not scan_doc.coords or scan_doc.coords.right == 0:
        scan_doc.metadata['scan_type'] = ['empty_scan']
    elif scan_doc.coords.right < 2500:
        scan_doc.metadata['scan_type'] = ['single_page']
    elif scan_doc.coords.right < 4900:
        scan_doc.metadata['scan_type'] = ['double_page']
    else:
        scan_doc.metadata['scan_type'] = ['special_page']
    set_document_children_derived_ids(scan_doc, scan_doc.id)
    return scan_doc
def split_column_regions(page_doc: PageXMLPage) -> PageXMLPage:
    """Regroup a page's text regions into columns based on horizontal overlap.

    Text regions with lines wider than 1200 are collected as header/extra
    material. The remaining regions are merged into an existing
    PageXMLColumn when they sufficiently overlap it, otherwise they start
    a new column. Returns a new PageXMLPage built from the derived columns.
    """
    column_metadata = {
        'page_id': page_doc.metadata['id'],
        'scan_id': page_doc.metadata['scan_id'],
        'type': ['column', 'pagexml_doc', 'text_region']
    }
    extra_metadata = copy.deepcopy(column_metadata)
    extra_metadata['type'] = 'header'
    columns: List[PageXMLColumn] = []
    extra_text_regions: List[PageXMLTextRegion] = []
    text_regions: List[PageXMLTextRegion] = []
    # Flatten one level: regions with their own lines are kept as-is,
    # otherwise their child text regions are used.
    for text_region in page_doc.text_regions:
        text_regions += [text_region
                         ] if text_region.lines else text_region.text_regions
    text_regions.sort(key=lambda x: x.coords.top)
    for text_region in text_regions:
        if text_region.lines and text_region.coords.width > 1200:
            # Wide text_regions are part of the header
            extra_text_regions += [text_region]
            continue
        # check if this text region overlaps with an existing column
        overlapping_column = None
        for column in columns:
            overlap = coords_overlap(column, text_region)
            #      column['coords']['left'], column['coords']['right'])
            tr_overlap_frac = overlap / text_region.coords.width
            cl_overlap_frac = overlap / column.coords.width
            # merge when both overlap fractions exceed 0.5 and at least
            # one exceeds 0.75
            if min(tr_overlap_frac, cl_overlap_frac) > 0.5 and max(
                    tr_overlap_frac, cl_overlap_frac) > 0.75:
                overlapping_column = column
                break
        # if there is an overlapping column, add this text region
        if overlapping_column:
            overlapping_column.text_regions += [text_region]
            overlapping_column.coords = parse_derived_coords(
                overlapping_column.text_regions)
        # if no, create a new column for this text region
        else:
            column = PageXMLColumn(coords=parse_derived_coords([text_region]),
                                   metadata=column_metadata,
                                   text_regions=[text_region])
            columns += [column]
    # Sanity check: every column must have derived coords by now.
    for column in columns:
        if not column.coords:
            print('COLUMN NO COORDS:', column)
            raise KeyError('Column has no "coords" property.')
    columns.sort(key=lambda x: x.coords.left)
    for ci, column in enumerate(columns):
        column.text_regions.sort(key=lambda x: x.coords.top)
        column.metadata = column_metadata
        column.set_derived_id(column.metadata['scan_id'])
        set_line_alignment(column)
        column.metadata['iiif_url'] = derive_pagexml_page_iiif_url(
            page_doc.metadata['jpg_url'], column.coords)
    if extra_text_regions:
        extra_coords = parse_derived_coords(extra_text_regions)
        extra = PageXMLTextRegion(metadata=extra_metadata,
                                  coords=extra_coords,
                                  text_regions=extra_text_regions)
        extra.main_type = 'extra'
        extra.metadata['iiif_url'] = derive_pagexml_page_iiif_url(
            page_doc.metadata['jpg_url'], extra.coords)
        extra.set_derived_id(extra.metadata['scan_id'])
    else:
        extra = None
    # NOTE(review): the composed `extra` region above is built (and given
    # a derived id and iiif_url) but the raw extra_text_regions list is
    # passed here instead — confirm whether `extra=extra` was intended.
    new_page = PageXMLPage(doc_id=page_doc.id,
                           doc_type=page_doc.type,
                           coords=page_doc.coords,
                           metadata=page_doc.metadata,
                           columns=columns,
                           extra=extra_text_regions)
    new_page.set_parent(page_doc.parent)
    return new_page
def split_scan_pages(scan_doc: PageXMLScan) -> List[PageXMLPage]:
    """Split a double-page scan into an even and an odd page document.

    Text regions (or, failing a typed region, their lines / child
    regions) are assigned to the even or odd page by side. Regions that
    cross the page boundary are split into two new regions, one per
    side, with coords derived from the lines/regions each side keeps.
    Returns [even_page, odd_page] (empty list when the scan has no
    text regions).
    """
    pages: List[PageXMLPage] = []
    if not scan_doc.text_regions:
        return pages
    page_odd = initialize_pagexml_page(scan_doc, 'odd')
    page_even = initialize_pagexml_page(scan_doc, 'even')
    # page_extra = initialize_pagexml_page(scan_doc, 'extra')
    for text_region in scan_doc.text_regions:
        text_region.metadata['scan_id'] = scan_doc.id
        # Typed regions are assigned wholesale by their own side.
        if text_region.metadata and 'type' in text_region.metadata:
            if is_even_side(text_region):
                page_even.add_child(text_region)
                # print("stats after adding child", page_even.stats)
            elif is_odd_side(text_region):
                page_odd.add_child(text_region)
                # print("stats after adding child", page_odd.stats)
        elif text_region.lines:
            even_lines = [
                line for line in text_region.lines if is_even_side(line)
            ]
            odd_lines = [
                line for line in text_region.lines if is_odd_side(line)
            ]
            if len(even_lines) == 0:
                page_odd.add_child(text_region)
                # print("stats after adding child", page_odd.stats)
            elif len(odd_lines) == 0:
                page_even.add_child(text_region)
                # print("stats after adding child", page_even.stats)
            else:
                # The text region crosses the page boundary. Split the lines into new text regions per
                # page, and create new text regions
                odd_region = PageXMLTextRegion(
                    lines=odd_lines,
                    coords=parse_derived_coords(odd_lines),
                    metadata=text_region.metadata)
                even_region = PageXMLTextRegion(
                    lines=even_lines,
                    coords=parse_derived_coords(even_lines),
                    metadata=text_region.metadata)
                page_even.add_child(even_region)
                # print("stats after adding child", page_even.stats)
                page_odd.add_child(odd_region)
                # print("stats after adding child", page_odd.stats)
        elif text_region.text_regions:
            # Same side-assignment, one level down: partition the child
            # text regions instead of lines.
            even_text_regions = [
                text_region for text_region in text_region.text_regions
                if is_even_side(text_region)
            ]
            odd_text_regions = [
                text_region for text_region in text_region.text_regions
                if is_odd_side(text_region)
            ]
            if len(even_text_regions) == 0:
                page_odd.add_child(text_region)
                # print("stats after adding child", page_odd.stats)
            elif len(odd_text_regions) == 0:
                page_even.add_child(text_region)
                # print("stats after adding child", page_even.stats)
            else:
                # The text region crosses the page boundary. Split the text_regions into new text regions per
                # page, and create new text regions
                odd_region = PageXMLTextRegion(
                    text_regions=odd_text_regions,
                    metadata=text_region.metadata,
                    coords=parse_derived_coords(odd_text_regions))
                even_region = PageXMLTextRegion(
                    text_regions=even_text_regions,
                    metadata=text_region.metadata,
                    coords=parse_derived_coords(even_text_regions))
                page_even.add_child(even_region)
                # print("stats after adding child", page_even.stats)
                page_odd.add_child(odd_region)
                # print("stats after adding child", page_odd.stats)
    for page_doc in [page_even, page_odd]:
        # Derive page coords from children when missing; fall back to the
        # scan's iiif_url when no coords can be derived at all.
        if not page_doc.coords:
            if len(page_doc.columns):
                page_doc.coords = parse_derived_coords(page_doc.columns)
            elif len(page_doc.text_regions):
                page_doc.coords = parse_derived_coords(page_doc.text_regions)
            if page_doc.coords:
                page_doc.metadata['iiif_url'] = derive_pagexml_page_iiif_url(
                    page_doc.metadata['jpg_url'], page_doc.coords)
            else:
                page_doc.metadata['iiif_url'] = scan_doc.metadata['iiif_url']
        pages += [page_doc]
    return pages
# Example #10
# 0
def add_missing_dates(prev_date: RepublicDate, session: Session):
    """Yield filler Session docs for the days between prev_date and session.date.

    For each missing calendar day, a deep copy of the current session is
    re-dated and trimmed: lines are kept (per column/text region, in
    order) only until every evidence line of the copied session has been
    seen, after which remaining lines are dropped. Coords, line/word
    counts and scan versions are recomputed for what is kept.

    Generator: yields one Session per missing day.
    """
    missing = (session.date - prev_date).days - 1
    if missing > 0:
        print('missing days:', missing)
    for diff in range(1, missing + 1):
        # create a new meeting doc for the missing date, with data copied from the current meeting
        # as most likely the missing date is a non-meeting date with 'nihil actum est'
        missing_date = prev_date.date + datetime.timedelta(days=diff)
        missing_date = RepublicDate(missing_date.year, missing_date.month,
                                    missing_date.day)
        missing_session = copy.deepcopy(session)
        missing_session.metadata[
            'id'] = f'session-{missing_date.isoformat()}-session-1'
        missing_session.id = missing_session.metadata['id']
        missing_session.metadata['session_date'] = missing_date.isoformat()
        missing_session.metadata['year'] = missing_date.year
        missing_session.metadata['session_month'] = missing_date.month
        missing_session.metadata['session_day'] = missing_date.day
        missing_session.metadata['session_weekday'] = missing_date.day_name
        missing_session.metadata['is_workday'] = missing_date.is_work_day()
        # No actual meeting took place, so clear meeting-specific fields.
        missing_session.metadata['session'] = None
        missing_session.metadata['president'] = None
        missing_session.metadata['attendants_list_id'] = None
        # Ids of evidence lines still to be encountered; shrinks as the
        # line scan below consumes them.
        evidence_lines = set(
            [evidence['line_id'] for evidence in missing_session.evidence])
        keep_columns = []
        num_lines = 0
        num_words = 0
        missing_session.lines = []
        for column in missing_session.columns:
            keep_textregions = []
            for textregion in column['textregions']:
                keep_lines = []
                for line in textregion['lines']:
                    # Keep lines while evidence lines remain unseen; once
                    # the last evidence line has been consumed, drop the
                    # rest of this text region (and, by the same check,
                    # all later lines too).
                    if len(evidence_lines) > 0:
                        keep_lines += [line]
                        missing_session.lines += [line]
                        num_lines += 1
                        if line['text']:
                            # count whitespace/punctuation-separated words
                            num_words += len([
                                word
                                for word in re.split(r'\W+', line['text'])
                                if word != ''
                            ])
                    else:
                        break
                    if line['metadata']['id'] in evidence_lines:
                        evidence_lines.remove(line['metadata']['id'])
                textregion['lines'] = keep_lines
                if len(textregion['lines']) > 0:
                    textregion['coords'] = parse_derived_coords(
                        textregion['lines'])
                    keep_textregions += [textregion]
            column['textregions'] = keep_textregions
            if len(column['textregions']) > 0:
                column['coords'] = parse_derived_coords(column['textregions'])
                keep_columns += [column]
        missing_session.columns = keep_columns
        missing_session.metadata['num_columns'] = len(missing_session.columns)
        missing_session.metadata['num_lines'] = num_lines
        missing_session.metadata['num_words'] = num_words
        missing_session.scan_versions = get_session_scans_version(
            missing_session)
        session_parser.clean_lines(missing_session.lines, clean_copy=False)
        print('missing session:', missing_session.id)
        yield missing_session