Beispiel #1
0
def load_pyramids(eval_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the GuidedSumm2011 pyramids stored inside an evaluation tarball.

    Every regular file whose archive path starts with
    ``GuidedSumm2011_eval/manual/pyramids`` is parsed with
    ``Pyramid.from_xml``. The part of the file name before the first ``.``
    is split on ``-``: its first piece (lower-cased) is the instance id and
    its second piece is the group.

    Args:
        eval_tar: Path of the tar archive to read.

    Returns:
        A ``defaultdict`` mapping instance id -> group -> ``Pyramid``.
        Files whose group is ``'AB'`` are not loaded.
    """
    pyramid_prefix = 'GuidedSumm2011_eval/manual/pyramids'
    loaded = defaultdict(dict)

    with tarfile.open(eval_tar, 'r') as archive:
        for entry in archive.getmembers():
            if not entry.isfile():
                continue
            if not entry.name.startswith(pyramid_prefix):
                continue

            stem = entry.name.split('/')[-1].split('.')[0]
            id_and_group = stem.split('-')
            instance_id = id_and_group[0].lower()
            group = id_and_group[1]

            # The AB pyramids are skipped on purpose. They appear to contain
            # an encoding error: a stray byte sequence (b'\xef\xbf\xbds') whose
            # byte length (len(b'\xef\xbf\xbds') == 4) differs from its decoded
            # length (len(b'\xef\xbf\xbds'.decode()) == 2), which is believed
            # to break locating the summary index from the offsets. Repairing
            # this would mean replacing the character in the summaries
            # (probably with "'") and recomputing all of the offsets.
            if group == 'AB':
                continue

            raw = archive.extractfile(entry).read()
            loaded[instance_id][group] = Pyramid.from_xml(
                f'{instance_id}-{group}', raw.decode())

    return loaded
def load_pyramids(eval_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the UpdateSumm09 pyramids stored inside an evaluation tarball.

    Every regular file whose archive path starts with
    ``UpdateSumm09_eval/manual/pyramids`` is parsed with
    ``Pyramid.from_xml``. The part of the file name before the first ``.``
    is split on ``-``: its first piece (lower-cased) is the instance id and
    its second piece is the group.

    Args:
        eval_tar: Path of the tar archive to read.

    Returns:
        A ``defaultdict`` mapping instance id -> group -> ``Pyramid``.
    """
    pyramid_prefix = 'UpdateSumm09_eval/manual/pyramids'
    loaded = defaultdict(dict)

    with tarfile.open(eval_tar, 'r') as archive:
        pyramid_entries = (
            entry for entry in archive.getmembers()
            if entry.isfile() and entry.name.startswith(pyramid_prefix)
        )
        for entry in pyramid_entries:
            stem = entry.name.split('/')[-1].split('.')[0]
            id_and_group = stem.split('-')
            instance_id = id_and_group[0].lower()
            group = id_and_group[1]

            raw = archive.extractfile(entry).read()
            loaded[instance_id][group] = Pyramid.from_xml(
                f'{instance_id}-{group}', raw.decode())

    return loaded
def load_pyramids(pyramid_tar: str) -> Dict[str, Pyramid]:
    """Load every pyramid stored under ``allpyramids/`` in a tarball.

    Each regular file whose archive path starts with ``allpyramids/`` is
    parsed with ``Pyramid.from_xml``. The instance id is the lower-cased
    part of the file name before its first ``.``.

    Args:
        pyramid_tar: Path of the tar archive to read.

    Returns:
        A plain ``dict`` mapping instance id -> ``Pyramid``. If two files
        yield the same instance id, the later archive member wins.
    """
    # The pattern is built from adjacent literals so that the regex escapes
    # (``\s``, ``\.``) live in a raw string while the line breaks stay real
    # newline characters. The resulting string is identical to the previous
    # single non-raw literal, but no longer relies on invalid string escape
    # sequences (a SyntaxWarning on recent Python versions).
    document_regex = (
        '[-]*\n'
        r'(\s*)D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = {}
    with tarfile.open(pyramid_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.startswith('allpyramids/')):
                continue

            filename = member.name.split('/')[-1]
            instance_id = filename.split('.')[0].lower()

            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id] = Pyramid.from_xml(
                f'{instance_id}',
                xml,
                default_document_regex=document_regex)

    return pyramids
Beispiel #4
0
def load_update_pyramids(update_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the update-task pyramids stored inside a tarball.

    Each regular file whose archive path starts with
    ``updateEval/Pyramid/allpyramids`` is parsed with ``Pyramid.from_xml``.
    The file name is split on ``-``: the first piece (lower-cased) is the
    instance id and the second piece is the group.

    Args:
        update_tar: Path of the tar archive to read.

    Returns:
        A ``defaultdict`` mapping instance id -> group -> ``Pyramid``.
    """
    # The pattern is built from adjacent literals so that the regex escapes
    # (``\.``) live in a raw string while the line breaks stay real newline
    # characters. The resulting string is identical to the previous single
    # non-raw literal, but no longer relies on invalid string escape
    # sequences (a SyntaxWarning on recent Python versions).
    document_regex = (
        '[-]*\n'
        r' D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = defaultdict(dict)
    with tarfile.open(update_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.startswith(
                    'updateEval/Pyramid/allpyramids')):
                continue

            filename = member.name.split('/')[-1]
            parts = filename.split('-')
            instance_id = parts[0].lower()
            # NOTE(review): the group is taken verbatim from the second
            # '-'-separated piece, so it keeps any file extension when the
            # name contains only one '-' - confirm against the archive layout.
            group = parts[1]

            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id][group] = Pyramid.from_xml(
                f'{instance_id}-{group}',
                xml,
                default_document_regex=document_regex)

    return pyramids
Beispiel #5
0
def load_pyramids(pyramid_tar: str) -> Dict[str, Pyramid]:
    """Load one pyramid per instance from the ``pans/`` files of a tarball.

    Each regular file whose archive path starts with ``pans/`` is considered.
    The instance id is the lower-cased second ``.``-separated piece of the
    file name. Only the first file seen for each instance id is parsed
    (with ``Pyramid.from_xml(..., is_combined_file=True)``); later files for
    the same instance are skipped without being read.

    Args:
        pyramid_tar: Path of the tar archive to read.

    Returns:
        A plain ``dict`` mapping instance id -> ``Pyramid``.
    """
    # The pattern is built from adjacent literals so that the regex escapes
    # (``\s``, ``\.``) live in a raw string while the line breaks stay real
    # newline characters. The resulting string is identical to the previous
    # single non-raw literal, but no longer relies on invalid string escape
    # sequences (a SyntaxWarning on recent Python versions).
    document_regex = (
        '[-]*\n'
        r'(\s*)D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = {}
    with tarfile.open(pyramid_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.startswith('pans/')):
                continue

            filename = member.name.split('/')[-1]
            instance_id = filename.split('.')[1].lower()

            # For this dataset, the pyramid and annotations are all in the
            # same file, one per annotation. Therefore, we only need to load
            # the pyramid once from the first file.
            if instance_id in pyramids:
                continue

            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id] = Pyramid.from_xml(
                instance_id,
                xml,
                default_document_regex=document_regex,
                is_combined_file=True)

    return pyramids