def validate_responses(args):
    """Validate responses in args.input and write the valid ones to args.output."""
    logger = Logger(args.log, args.log_specifications, sys.argv)
    logger.record_event('DEFAULT_INFO', 'validation started')
    # Load document metadata and per-modality document boundaries.
    document_mappings = DocumentMappings(logger,
                                         args.parent_children,
                                         Encodings(logger, args.encodings),
                                         CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }
    # TA3 queries are optional; load them only when provided.
    queries = TA3QuerySet(logger, args.queries) if args.queries else None
    responses = ResponseSet(logger,
                            document_mappings,
                            document_boundaries,
                            args.input,
                            args.runid,
                            args.task,
                            queries=queries)
    responses.write_valid_responses(args.output)
    num_warnings, num_errors = logger.get_stats()
    closing_message = 'validation finished (warnings:{}, errors:{})'.format(num_warnings, num_errors)
    logger.record_event('DEFAULT_INFO', closing_message)
    print(closing_message)
    # Signal failure to the caller if any errors were recorded.
    if num_errors > 0:
        exit(ERROR_EXIT_CODE)
    exit(ALLOK_EXIT_CODE)
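# Illustrative usage sketch for validate_responses (not part of the original
# module): the caller is assumed to supply an argparse-style namespace whose
# attribute names mirror the ones read above. All file paths and values
# below are hypothetical placeholders.
def example_validate_responses_usage():
    from types import SimpleNamespace
    example_args = SimpleNamespace(
        log='validate.log',                           # event log written by Logger
        log_specifications='log_specifications.txt',  # event-type definitions
        encodings='encodings.tab',
        core_documents='coredocs.txt',
        parent_children='parent_children.tab',
        sentence_boundaries='sentence_boundaries.txt',
        image_boundaries='image_boundaries.txt',
        video_boundaries='video_boundaries.txt',
        keyframe_boundaries='keyframe_boundaries.txt',
        queries=None,                # optional TA3 queries file
        runid='example-run',
        task='task1',                # hypothetical task name
        input='responses/',          # directory of responses to validate
        output='valid_responses/')   # where valid responses are written
    validate_responses(example_args)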
def generate_confidence_intervals(args):
    """Generate confidence intervals over the scores in args.input."""
    logger = Logger(args.log, args.log_specifications, sys.argv)
    if not args.input.endswith('.tab'):
        logger.record_event('DEFAULT_CRITICAL_ERROR', 'input filename should be a *.tab.')
    # Parse the aggregate specification, a comma-separated list of
    # 'column:value' pairs, into {column: [values]}.
    aggregate = {}
    for element in args.aggregate.split(','):
        key, value = element.split(':')
        if key not in aggregate:
            aggregate[key] = []
        aggregate[key].append(value)
    confidence_interval = ConfidenceIntervals(logger,
                                              macro=args.macro,
                                              input=args.input,
                                              primary_key_col=args.primary_key,
                                              score=args.score,
                                              aggregate=aggregate,
                                              document_id_col=args.document_id,
                                              run_id_col=args.run_id,
                                              sizes=args.sizes,
                                              seed_value=args.seed)
    # Write both renderings of the confidence intervals.
    output = {'pretty': args.pretty_output, 'tab': args.tab_output}
    for output_format in output:
        with open(output[output_format], 'w') as fh:
            fh.write(confidence_interval.get('output', output_format))
    exit(ALLOK_EXIT_CODE)
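# Illustrative sketch of the aggregate parsing performed above, pulled out as
# a standalone helper with a worked example (not part of the original module;
# the values shown are hypothetical).
def example_parse_aggregate(spec):
    """Parse 'key:value,key:value,...' into {key: [values]}."""
    aggregate = {}
    for element in spec.split(','):
        key, value = element.split(':')
        aggregate.setdefault(key, []).append(value)
    return aggregate

# example_parse_aggregate('metatype:Entity,metatype:Relation,language:ENG')
# returns {'metatype': ['Entity', 'Relation'], 'language': ['ENG']}:
# repeated keys accumulate their values into a list.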
def align_clusters(args):
    """Align gold and system clusters for each gold KB, writing similarity and alignment tables."""
    logger = Logger(args.log, args.log_specifications, sys.argv)
    document_mappings = DocumentMappings(logger,
                                         args.parent_children,
                                         Encodings(logger, args.encodings),
                                         CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }
    annotated_regions = AnnotatedRegions(logger, document_mappings, document_boundaries, args.regions)
    os.mkdir(args.similarities)
    os.mkdir(args.alignment)
    for entry in sorted(os.scandir(args.gold), key=str):
        if entry.is_dir() and entry.name.endswith('.ttl'):
            kb = entry.name
            message = 'aligning clusters in {}'.format(kb)
            logger.record_event('DEFAULT_INFO', message)
            print('At {}: {}'.format(time.strftime("%m/%d/%Y %H:%M:%S", time.localtime()), message))
            document_id = kb.replace('.ttl', '')
            # Mention and edge files produced by the SPARQL queries; any of
            # them may be missing, in which case None is passed to Clusters.
            gold_mentions = '{}/{}/AIDA_P2_TA1_CM_A0001.rq.tsv'.format(args.gold, kb)
            gold_edges = '{}/{}/AIDA_P2_TA1_AM_A0001.rq.tsv'.format(args.gold, kb)
            system_mentions = '{}/{}/AIDA_P2_TA1_CM_A0001.rq.tsv'.format(args.system, kb)
            system_edges = '{}/{}/AIDA_P2_TA1_AM_A0001.rq.tsv'.format(args.system, kb)
            gold_mentions = gold_mentions if os.path.exists(gold_mentions) else None
            gold_edges = gold_edges if os.path.exists(gold_edges) else None
            system_mentions = system_mentions if os.path.exists(system_mentions) else None
            system_edges = system_edges if os.path.exists(system_edges) else None
            similarities = '{}/{}.tab'.format(args.similarities, document_id)
            alignment = '{}/{}.tab'.format(args.alignment, document_id)
            check_for_paths_non_existance([similarities, alignment])
            clusters = Clusters(logger, document_mappings, document_boundaries,
                                annotated_regions, gold_mentions, gold_edges,
                                system_mentions, system_edges)
            clusters.print_similarities(similarities)
            clusters.print_alignment(alignment)
    exit(ALLOK_EXIT_CODE)
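# check_for_paths_non_existance is defined elsewhere in this package; the
# sketch below is a plausible reading of its contract as used above (abort
# rather than overwrite existing output files), not its actual implementation.
def example_check_for_paths_non_existance(paths):
    for path in paths:
        if os.path.exists(path):
            # Abort so a rerun never silently clobbers earlier output.
            sys.exit('ERROR: path {} already exists'.format(path))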
def filter_responses(args):
    """Filter responses, keeping only entries that fall within annotated regions."""
    logger = Logger(args.log, args.log_specifications, sys.argv)
    ontology_type_mappings = OntologyTypeMappings(logger, args.ontology_type_mappings)
    slot_mappings = SlotMappings(logger, args.slot_mappings)
    document_mappings = DocumentMappings(logger,
                                         args.parent_children,
                                         Encodings(logger, args.encodings),
                                         CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }
    responses = ResponseSet(logger, ontology_type_mappings, slot_mappings,
                            document_mappings, document_boundaries,
                            args.input, args.runid)
    annotated_regions = AnnotatedRegions(logger, document_mappings, document_boundaries, args.regions)
    run_filter_on_all_responses(responses, annotated_regions, document_mappings, document_boundaries)
    # Mirror the input directory structure under args.output, writing only
    # the entries that passed the filter.
    os.mkdir(args.output)
    for input_filename in responses:
        output_filename = input_filename.replace(responses.get('path'), args.output)
        dirname = os.path.dirname(output_filename)
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        with open(output_filename, 'w') as output_fh:
            header_printed = False
            for linenum in sorted(responses.get(input_filename), key=int):
                entry = responses.get(input_filename).get(str(linenum))
                # Print the header once, taken from the first entry.
                if not header_printed:
                    output_fh.write('{}\n'.format(entry.get('header').get('line')))
                    header_printed = True
                if not entry.get('valid'):
                    logger.record_event('EXPECTING_VALID_ENTRY', entry.get('where'))
                    continue
                if entry.get('passes_filter'):
                    output_fh.write(str(entry))
    exit(ALLOK_EXIT_CODE)
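# run_filter_on_all_responses is defined elsewhere in this package. The
# sketch below captures its presumed contract as consumed above: visit every
# entry in the response set and record a 'passes_filter' flag indicating
# whether the entry falls inside an annotated region. The method names
# annotated_regions.contains() and entry.set() are hypothetical.
def example_run_filter_on_all_responses(responses, annotated_regions):
    for input_filename in responses:
        for linenum in responses.get(input_filename):
            entry = responses.get(input_filename).get(linenum)
            # hypothetical containment test against the annotated regions
            passes = bool(entry.get('valid')) and annotated_regions.contains(entry)
            entry.set('passes_filter', passes)  # hypothetical setter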
def main(args):
    """Normalize LDC region annotations into a sorted, de-duplicated tab-separated file."""
    logger = Logger(args.log, args.log_specifications, sys.argv)
    # Map ontology types from the annotation to their propercased full types.
    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.ontology_type_mappings):
        type_mappings.add(key=entry.get('full_type_ov'), value=entry.get('full_type'))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }
    output = []
    for entry in FileHandler(logger, args.input):
        document_id = entry.get('root_doc_id')
        document_element_id = entry.get('doc_element_id')
        modality = entry.get('media_type')
        type = entry.get('type')
        subtype = entry.get('subtype')
        subsubtype = entry.get('subsubtype')
        # Apply patch to correct LDC's mistake in annotation.
        if type == 'personalsocial' and subtype == 'unspecified':
            subtype = 'relationship'
        full_type = '{type}.{subtype}.{subsubtype}'.format(type=type, subtype=subtype, subsubtype=subsubtype)
        full_type_cleaned = full_type.replace('.unspecified', '')
        propercased_full_type = type_mappings.get(full_type_cleaned, None)
        if propercased_full_type is None:
            logger.record_event('DEFAULT_CRITICAL_ERROR',
                                'propercased_full_type is None for full_type: {}'.format(full_type))
        # Normalize the span: whole-element spans take the element's boundary,
        # text offset ranges become '(start,0)-(end,0)', and keyframe IDs take
        # the keyframe's boundary.
        span_string = entry.get('span')
        keyframe_id = None
        keyframe_num = 0
        if span_string == 'ENTIRE_DOCUMENT_ELEMENT':
            document_boundary = document_boundaries.get(modality).get(document_element_id)
            span_string = str(document_boundary)
        elif '-' in span_string:
            start, end = span_string.split('-')
            span_string = '({start},0)-({end},0)'.format(start=start, end=end)
        elif '_' in span_string:
            keyframe_id = span_string
            keyframe_num = span_string.split('_')[1]
            document_boundary = document_boundaries.get('keyframe').get(keyframe_id)
            span_string = str(document_boundary)
        else:
            span_string = None
        output_object = {
            'document_id': document_id,
            'document_element_id': document_element_id,
            'keyframe_id': keyframe_id,
            'keyframe_num': int(keyframe_num),
            'modality': modality,
            'region': span_string,
            'type': propercased_full_type,
        }
        output.append(output_object)
    # Write the sorted output, skipping duplicate lines.
    printed = {}
    header = ['document_id', 'document_element_or_keyframe_id', 'modality', 'region', 'type']
    with open(args.output, 'w') as fh:
        fh.write('{}\n'.format('\t'.join(header)))
        for output_object in multisort(output, (('document_id', False),
                                                ('modality', False),
                                                ('document_element_id', False),
                                                ('keyframe_num', False),
                                                ('region', False),
                                                ('type', False))):
            line = get_line(output_object, header)
            if line not in printed:
                fh.write('{}\n'.format(line))
                printed[line] = 1
    exit(ALLOK_EXIT_CODE)
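# multisort and get_line are defined elsewhere in this package; the sketches
# below are plausible readings of their behavior as used in main(), not the
# actual implementations. multisort is assumed to sort dictionaries on
# several keys (each paired with a reverse flag), exploiting the stability of
# Python's sort by applying the keys from least to most significant.
def example_multisort(items, specs):
    for key, reverse in reversed(specs):
        items = sorted(items, key=lambda item: item[key], reverse=reverse)
    return items

# get_line is assumed to render one output row in header-column order; the
# composite 'document_element_or_keyframe_id' column presumably resolves to
# the keyframe ID when present and to the document element ID otherwise.
def example_get_line(output_object, columns):
    values = []
    for column in columns:
        if column == 'document_element_or_keyframe_id':
            value = output_object['keyframe_id'] or output_object['document_element_id']
        else:
            value = output_object[column]
        values.append(str(value))
    return '\t'.join(values)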