def load_pyramids(eval_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the pyramids stored under ``GuidedSumm2011_eval/manual/pyramids``.

    Each regular file below that prefix is expected to be named
    ``<instance>-<group>.<ext>``. The instance id is lowercased; the group is
    kept as written.

    Args:
        eval_tar: Path to the evaluation tar archive.

    Returns:
        A nested mapping ``{instance_id: {group: Pyramid}}``. Pyramids whose
        group is ``AB`` are not loaded (see the comment in the loop).
    """
    prefix = 'GuidedSumm2011_eval/manual/pyramids'
    by_instance = defaultdict(dict)

    with tarfile.open(eval_tar, 'r') as archive:
        for entry in archive.getmembers():
            if not entry.isfile() or not entry.name.startswith(prefix):
                continue

            # "<instance>-<group>" is the part of the file name before the first '.'
            stem = entry.name.split('/')[-1].split('.')[0]
            pieces = stem.split('-')
            instance_id = pieces[0].lower()
            group = pieces[1]

            if group == 'AB':
                # The AB pyramids are skipped on purpose. They contain errors that
                # appear to come from a badly encoded character in the files
                # (b'\xef\xbf\xbds'). It seems to break locating the summary index
                # from the offsets (len(b'\xef\xbf\xbds') == 4, but
                # len(b'\xef\xbf\xbds'.decode()) == 2). Fixing this would require
                # rewriting the summaries without that character (it should probably
                # be replaced with "'") and updating all of the offsets.
                continue

            xml = archive.extractfile(entry).read().decode()
            by_instance[instance_id][group] = Pyramid.from_xml(
                f'{instance_id}-{group}', xml)

    return by_instance
def load_pyramids(eval_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the pyramids stored under ``UpdateSumm09_eval/manual/pyramids``.

    Each regular file below that prefix is expected to be named
    ``<instance>-<group>.<ext>``. The instance id is lowercased; the group is
    kept as written.

    Args:
        eval_tar: Path to the evaluation tar archive.

    Returns:
        A nested mapping ``{instance_id: {group: Pyramid}}``.
    """
    prefix = 'UpdateSumm09_eval/manual/pyramids'
    loaded = defaultdict(dict)

    with tarfile.open(eval_tar, 'r') as archive:
        for entry in archive.getmembers():
            if not entry.isfile() or not entry.name.startswith(prefix):
                continue

            # Everything before the first '.' of the file name is "<instance>-<group>".
            stem = entry.name.split('/')[-1].split('.')[0]
            pieces = stem.split('-')
            instance_id = pieces[0].lower()
            group = pieces[1]

            contents = archive.extractfile(entry).read().decode()
            loaded[instance_id][group] = Pyramid.from_xml(
                f'{instance_id}-{group}', contents)

    return loaded
def load_pyramids(pyramid_tar: str) -> Dict[str, Pyramid]:
    """Load one pyramid per file found under ``allpyramids/`` in the archive.

    The instance id is the lowercased part of the file name before the first
    ``.``. If several files map to the same instance id, the last one read
    wins.

    Args:
        pyramid_tar: Path to the tar archive that holds the pyramid files.

    Returns:
        A flat mapping ``{instance_id: Pyramid}``.
    """
    # Only the middle piece is a raw string: it carries the regex escapes, which
    # are invalid escape sequences in a normal literal. The newlines in the outer
    # pieces stay real newline characters, so the resulting value is exactly the
    # one that was passed before.
    document_regex = (
        '[-]*\n'
        r'(\s*)D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = {}
    with tarfile.open(pyramid_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.startswith('allpyramids/')):
                continue

            filename = member.name.split('/')[-1]
            instance_id = filename.split('.')[0].lower()

            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id] = Pyramid.from_xml(
                instance_id, xml, default_document_regex=document_regex)
    return pyramids
def load_update_pyramids(update_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the pyramids stored under ``updateEval/Pyramid/allpyramids``.

    File names are split on ``-``: the first piece, lowercased, is the
    instance id and the second piece is the group.

    Args:
        update_tar: Path to the tar archive that holds the pyramid files.

    Returns:
        A nested mapping ``{instance_id: {group: Pyramid}}``.
    """
    # Only the middle piece is a raw string: it carries the regex escapes, which
    # are invalid escape sequences in a normal literal. The newlines in the outer
    # pieces stay real newline characters, so the resulting value is exactly the
    # one that was passed before.
    document_regex = (
        '[-]*\n '
        r'D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )
    prefix = 'updateEval/Pyramid/allpyramids'

    pyramids = defaultdict(dict)
    with tarfile.open(update_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.startswith(prefix)):
                continue

            filename = member.name.split('/')[-1]
            parts = filename.split('-')
            instance_id = parts[0].lower()
            group = parts[1]

            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id][group] = Pyramid.from_xml(
                f'{instance_id}-{group}', xml,
                default_document_regex=document_regex)
    return pyramids
def load_pyramids(pyramid_tar: str) -> Dict[str, Pyramid]:
    """Load one pyramid per instance from the files under ``pans/``.

    The instance id is the lowercased second ``.``-separated piece of the
    file name.

    Args:
        pyramid_tar: Path to the tar archive that holds the files.

    Returns:
        A flat mapping ``{instance_id: Pyramid}``.
    """
    # Only the middle piece is a raw string: it carries the regex escapes, which
    # are invalid escape sequences in a normal literal. The newlines in the outer
    # pieces stay real newline characters, so the resulting value is exactly the
    # one that was passed before.
    document_regex = (
        '[-]*\n'
        r'(\s*)D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = {}
    with tarfile.open(pyramid_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.startswith('pans/')):
                continue

            filename = member.name.split('/')[-1]
            instance_id = filename.split('.')[1].lower()

            # For this dataset, the pyramid and annotations are all in the same
            # file, one per annotation. Therefore, the pyramid only needs to be
            # loaded once, from the first file seen for each instance.
            if instance_id in pyramids:
                continue

            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id] = Pyramid.from_xml(
                instance_id, xml,
                default_document_regex=document_regex,
                is_combined_file=True)
    return pyramids