def load_pyramids(eval_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the pyramids stored under ``GuidedSumm2011_eval/manual/pyramids``.

    Every regular file in ``eval_tar`` whose member name starts with that
    prefix is parsed with ``Pyramid.from_xml``. The file name (the text before
    the first ``.``) is expected to look like ``<instance>-<group>``.

    Args:
        eval_tar: Path to the evaluation tar archive.

    Returns:
        A mapping ``{lowercased instance id: {group: Pyramid}}``. Pyramids
        whose group is ``'AB'`` are skipped (see the comment below).
    """
    prefix = 'GuidedSumm2011_eval/manual/pyramids'
    pyramids = defaultdict(dict)
    with tarfile.open(eval_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.startswith(prefix)):
                continue

            # "<instance>-<group>.<ext>" -> ("<instance>", "<group>")
            stem = member.name.split('/')[-1].split('.')[0]
            pieces = stem.split('-')
            instance_id = pieces[0].lower()
            group = pieces[1]

            if group == 'AB':
                # There are some errors with the AB pyramids, which I think are due to
                # errors with encoding some of the characters. There is a weird character
                # in the files (b'\xef\xbf\xbds'). I think it messes up identifying the
                # summary index based on the offsets (len(b'\xef\xbf\xbds') == 4, but
                # len(b'\xef\xbf\xbds'.decode()) == 2). It will take some work to update
                # the summaries to remove this character (I think it should be replaced
                # with "'") and update all of the offsets.
                continue

            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id][group] = Pyramid.from_xml(f'{instance_id}-{group}', xml)
    return pyramids
def load_pyramids(eval_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the pyramids stored under ``UpdateSumm09_eval/manual/pyramids``.

    Every regular file in ``eval_tar`` whose member name starts with that
    prefix is parsed with ``Pyramid.from_xml``. The file name (the text before
    the first ``.``) is expected to look like ``<instance>-<group>``.

    Args:
        eval_tar: Path to the evaluation tar archive.

    Returns:
        A mapping ``{lowercased instance id: {group: Pyramid}}``.
    """
    prefix = 'UpdateSumm09_eval/manual/pyramids'
    pyramids = defaultdict(dict)
    with tarfile.open(eval_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.startswith(prefix)):
                continue

            # "<instance>-<group>.<ext>" -> ("<instance>", "<group>")
            stem = member.name.split('/')[-1].split('.')[0]
            pieces = stem.split('-')
            instance_id = pieces[0].lower()
            group = pieces[1]

            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id][group] = Pyramid.from_xml(f'{instance_id}-{group}', xml)
    return pyramids
def load_pyramids(pyramid_tar: str) -> Dict[str, Pyramid]:
    """Load every pyramid stored under ``allpyramids/`` in ``pyramid_tar``.

    Each regular file whose member name starts with ``allpyramids/`` is parsed
    with ``Pyramid.from_xml``. The instance id is the file name up to the
    first ``.``, lowercased.

    Args:
        pyramid_tar: Path to the tar archive containing the pyramid files.

    Returns:
        A flat mapping ``{instance id: Pyramid}``.
    """
    # Built from adjacent literals so the runtime value is unchanged: "\n" must
    # stay a real newline character (plain literal), while the regex escapes
    # "\s" and "\." live in a raw literal, where they are not invalid string
    # escape sequences.
    document_regex = (
        '[-]*\n'
        r'(\s*)D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = {}
    with tarfile.open(pyramid_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.startswith('allpyramids/')):
                continue
            filename = member.name.split('/')[-1]
            instance_id = filename.split('.')[0].lower()
            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id] = Pyramid.from_xml(
                f'{instance_id}', xml, default_document_regex=document_regex)
    return pyramids
def load_update_pyramids(update_tar: str) -> Dict[str, Dict[str, Pyramid]]:
    """Load the pyramids stored under ``updateEval/Pyramid/allpyramids``.

    Each regular file in ``update_tar`` whose member name starts with that
    prefix is parsed with ``Pyramid.from_xml``. The file name is split on
    ``-``: the first piece (lowercased) is the instance id and the second
    piece is the group.

    Args:
        update_tar: Path to the tar archive containing the pyramid files.

    Returns:
        A mapping ``{instance id: {group: Pyramid}}``.
    """
    # Built from adjacent literals so the runtime value is unchanged: "\n" must
    # stay a real newline character (plain literal), while the regex escape
    # "\." lives in a raw literal, where it is not an invalid string escape
    # sequence.
    document_regex = (
        '[-]*\n '
        r'D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = defaultdict(dict)
    with tarfile.open(update_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile()
                    and member.name.startswith('updateEval/Pyramid/allpyramids')):
                continue
            filename = member.name.split('/')[-1]
            parts = filename.split('-')
            instance_id = parts[0].lower()
            group = parts[1]
            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id][group] = Pyramid.from_xml(
                f'{instance_id}-{group}', xml, default_document_regex=document_regex)
    return pyramids
def score(self, annotation: PyramidAnnotation, pyramid: Pyramid) -> MetricsDict:
    """Compute the modified pyramid score of ``annotation`` against ``pyramid``.

    The score is the total weight of the annotation's SCUs divided by the
    weight of an ideal summary, i.e. the best weight achievable with as many
    SCUs as the pyramid's summaries contain on average (rounded up).

    Args:
        annotation: The annotated summary; its ``scus`` are looked up by
            ``scu_id`` in the pyramid.
        pyramid: The pyramid providing SCU weights and per-summarizer SCU sets.

    Returns:
        A ``MetricsDict`` with one entry, ``{self.name: score}``. The score is
        0.0 when the pyramid has no summarizers or the ideal weight is 0,
        instead of raising ``ZeroDivisionError``.
    """
    # Create a mapping from the SCU id to its weight and count how many are at each weight
    scu_id_to_weight = {}
    weight_to_num_scus = Counter()
    for scu in pyramid.scus:
        weight = scu.get_weight()
        scu_id_to_weight[scu.scu_id] = weight
        weight_to_num_scus[weight] += 1

    # Calculate the total weight of the SCUs in the annotation.
    # It's possible the SCU id isn't in the Pyramid, for example, if we are
    # doing jackknifing and the reference corresponding to an SCU of weight 1
    # was removed; such SCUs contribute nothing.
    total_weight = sum(scu_id_to_weight.get(scu.scu_id, 0) for scu in annotation.scus)

    # Calculate the average number of SCUs in the pyramid summaries
    num_summarizers = len(pyramid.summarizer_ids)
    if num_summarizers == 0:
        # No reference summaries: the average (and so the ideal weight) is undefined
        return MetricsDict({self.name: 0.0})
    total_scus = sum(len(pyramid.get_scu_id_set(i)) for i in range(num_summarizers))
    average_num_scus = total_scus / num_summarizers

    # Calculate the weight of an ideal summary with `average_num_scus` SCUs by
    # greedily taking the heaviest SCUs first
    ideal_weight = 0
    scus_remaining = int(math.ceil(average_num_scus))
    for weight in sorted(weight_to_num_scus, reverse=True):
        if scus_remaining <= 0:
            break
        num_scus_taken = min(scus_remaining, weight_to_num_scus[weight])
        ideal_weight += num_scus_taken * weight
        scus_remaining -= num_scus_taken

    if ideal_weight == 0:
        # Nothing could be earned from this pyramid; avoid dividing by zero
        return MetricsDict({self.name: 0.0})

    # The modified pyramid score is the ratio of the weight to the ideal weight
    return MetricsDict({self.name: total_weight / ideal_weight})
def load_pyramids(pyramid_tar: str) -> Dict[str, Pyramid]:
    """Load one pyramid per instance from the files under ``pans/``.

    Each regular file whose member name starts with ``pans/`` is a candidate.
    The instance id is the second ``.``-separated piece of the file name,
    lowercased.

    Args:
        pyramid_tar: Path to the tar archive containing the combined files.

    Returns:
        A mapping ``{instance id: Pyramid}``.
    """
    # Built from adjacent literals so the runtime value is unchanged: "\n" must
    # stay a real newline character (plain literal), while the regex escapes
    # "\s" and "\." live in a raw literal, where they are not invalid string
    # escape sequences.
    document_regex = (
        '[-]*\n'
        r'(\s*)D[0-9]*\.M\.250\.[A-Z]\.[A-Z]'
        '\n[-]*\n'
    )

    pyramids = {}
    with tarfile.open(pyramid_tar, 'r') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and member.name.startswith('pans/')):
                continue
            filename = member.name.split('/')[-1]
            instance_id = filename.split('.')[1].lower()

            # For this dataset, the pyramid and annotations are all in the same file, one per
            # annotation. Therefore, we only need to load the pyramid once from the first file
            if instance_id in pyramids:
                continue

            xml = tar.extractfile(member).read().decode()
            pyramids[instance_id] = Pyramid.from_xml(
                instance_id, xml,
                default_document_regex=document_regex,
                is_combined_file=True)
    return pyramids
def _get_scu_intersection(self, annotation: PyramidAnnotation, pyramid: Pyramid, index: int) -> Set[int]: annotation_scus = annotation.get_scu_id_set() reference_scus = pyramid.get_scu_id_set(index) return annotation_scus & reference_scus