예제 #1
0
def validate_responses(args):
    """Validate the responses in args.input and write the valid ones to args.output.

    Exits the process with ERROR_EXIT_CODE if any errors were logged during
    validation, ALLOK_EXIT_CODE otherwise.
    """
    logger = Logger(args.log, args.log_specifications, sys.argv)

    logger.record_event('DEFAULT_INFO', 'validation started')
    document_mappings = DocumentMappings(
        logger, args.parent_children, Encodings(logger, args.encodings),
        CoreDocuments(logger, args.core_documents))
    # Per-modality document boundaries used when validating spans.
    document_boundaries = {
        'text': TextBoundaries(logger, args.sentence_boundaries),
        'image': ImageBoundaries(logger, args.image_boundaries),
        'keyframe': KeyFrameBoundaries(logger, args.keyframe_boundaries),
        'video': VideoBoundaries(logger, args.video_boundaries)
    }

    # Queries are optional; TA3 validation supplies them, other tasks do not.
    queries = TA3QuerySet(logger, args.queries) if args.queries else None
    responses = ResponseSet(logger,
                            document_mappings,
                            document_boundaries,
                            args.input,
                            args.runid,
                            args.task,
                            queries=queries)
    responses.write_valid_responses(args.output)
    num_warnings, num_errors = logger.get_stats()
    closing_message = 'validation finished (warnings:{}, errors:{})'.format(
        num_warnings, num_errors)
    logger.record_event('DEFAULT_INFO', closing_message)
    print(closing_message)
    # sys.exit is preferred over the site-module exit() in program code.
    sys.exit(ERROR_EXIT_CODE if num_errors > 0 else ALLOK_EXIT_CODE)
예제 #2
0
def generate_confidence_intervals(args):
    """Compute confidence intervals over the scores in args.input (a *.tab file)
    and write them to args.pretty_output and args.tab_output.

    Exits the process with ALLOK_EXIT_CODE on completion.
    """
    logger = Logger(args.log, args.log_specifications, sys.argv)
    if not args.input.endswith('.tab'):
        logger.record_event('DEFAULT_CRITICAL_ERROR',
                            'input filename should be a *.tab.')
    # Parse 'key:value,key:value' into {key: [value, ...]}.
    aggregate = {}
    for element in args.aggregate.split(','):
        key, value = element.split(':')
        aggregate.setdefault(key, []).append(value)
    confidence_interval = ConfidenceIntervals(logger,
                                              macro=args.macro,
                                              input=args.input,
                                              primary_key_col=args.primary_key,
                                              score=args.score,
                                              aggregate=aggregate,
                                              document_id_col=args.document_id,
                                              run_id_col=args.run_id,
                                              sizes=args.sizes,
                                              seed_value=args.seed)
    output = {'pretty': args.pretty_output, 'tab': args.tab_output}
    for output_format, output_filename in output.items():
        # 'with' guarantees the handle is closed even if the write raises.
        with open(output_filename, 'w') as fh:
            fh.write(confidence_interval.get('output', output_format))
    sys.exit(ALLOK_EXIT_CODE)
예제 #3
0
def align_clusters(args):
    """Align gold and system clusters for every KB under args.gold, writing one
    similarity table and one alignment table per document.

    Exits the process with ALLOK_EXIT_CODE on completion.
    """

    def _path_or_none(path):
        # A missing query-output file means that input is absent, not an error.
        return path if os.path.exists(path) else None

    logger = Logger(args.log, args.log_specifications, sys.argv)

    document_mappings = DocumentMappings(
        logger, args.parent_children, Encodings(logger, args.encodings),
        CoreDocuments(logger, args.core_documents))
    # Per-modality document boundaries used when comparing spans.
    document_boundaries = {
        'text': TextBoundaries(logger, args.sentence_boundaries),
        'image': ImageBoundaries(logger, args.image_boundaries),
        'keyframe': KeyFrameBoundaries(logger, args.keyframe_boundaries),
        'video': VideoBoundaries(logger, args.video_boundaries)
    }

    annotated_regions = AnnotatedRegions(logger, document_mappings,
                                         document_boundaries, args.regions)

    os.mkdir(args.similarities)
    os.mkdir(args.alignment)
    # Gold KBs are directories named '<document_id>.ttl'; sort for determinism.
    for entry in sorted(os.scandir(args.gold), key=str):
        if not (entry.is_dir() and entry.name.endswith('.ttl')):
            continue
        kb = entry.name
        message = 'aligning clusters in {}'.format(entry.name)
        logger.record_event('DEFAULT_INFO', message)
        print('At {}: {}'.format(
            time.strftime("%m/%d/%Y %H:%M:%S", time.localtime()), message))
        document_id = kb.replace('.ttl', '')

        gold_mentions = _path_or_none(
            '{}/{}/AIDA_P2_TA1_CM_A0001.rq.tsv'.format(args.gold, kb))
        gold_edges = _path_or_none(
            '{}/{}/AIDA_P2_TA1_AM_A0001.rq.tsv'.format(args.gold, kb))
        system_mentions = _path_or_none(
            '{}/{}/AIDA_P2_TA1_CM_A0001.rq.tsv'.format(args.system, kb))
        system_edges = _path_or_none(
            '{}/{}/AIDA_P2_TA1_AM_A0001.rq.tsv'.format(args.system, kb))

        similarities = '{}/{}.tab'.format(args.similarities, document_id)
        alignment = '{}/{}.tab'.format(args.alignment, document_id)
        check_for_paths_non_existance([similarities, alignment])
        clusters = Clusters(logger, document_mappings, document_boundaries,
                            annotated_regions, gold_mentions, gold_edges,
                            system_mentions, system_edges)
        clusters.print_similarities(similarities)
        clusters.print_alignment(alignment)
    sys.exit(ALLOK_EXIT_CODE)
예제 #4
0
def filter_responses(args):
    """Filter responses against the annotated regions and write the surviving
    entries, mirroring the input directory layout under args.output.

    Invalid entries are logged (EXPECTING_VALID_ENTRY) and skipped; only
    entries that pass the filter are written. Exits with ALLOK_EXIT_CODE.
    """
    logger = Logger(args.log, args.log_specifications, sys.argv)

    ontology_type_mappings = OntologyTypeMappings(logger,
                                                  args.ontology_type_mappings)
    slot_mappings = SlotMappings(logger, args.slot_mappings)
    document_mappings = DocumentMappings(
        logger, args.parent_children, Encodings(logger, args.encodings),
        CoreDocuments(logger, args.core_documents))
    # Per-modality document boundaries used when filtering spans.
    document_boundaries = {
        'text': TextBoundaries(logger, args.sentence_boundaries),
        'image': ImageBoundaries(logger, args.image_boundaries),
        'keyframe': KeyFrameBoundaries(logger, args.keyframe_boundaries),
        'video': VideoBoundaries(logger, args.video_boundaries)
    }

    responses = ResponseSet(logger, ontology_type_mappings, slot_mappings,
                            document_mappings, document_boundaries, args.input,
                            args.runid)
    annotated_regions = AnnotatedRegions(logger, document_mappings,
                                         document_boundaries, args.regions)
    run_filter_on_all_responses(responses, annotated_regions,
                                document_mappings, document_boundaries)

    os.mkdir(args.output)
    for input_filename in responses:
        # Map the input path onto the output directory, creating parents.
        output_filename = input_filename.replace(responses.get('path'),
                                                 args.output)
        dirname = os.path.dirname(output_filename)
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        # 'with' closes the file even if an entry lookup or write raises.
        with open(output_filename, 'w') as output_fh:
            header_printed = False
            for linenum in sorted(responses.get(input_filename), key=int):
                entry = responses.get(input_filename).get(str(linenum))
                # Emit the header once, taken from the first entry seen.
                if not header_printed:
                    output_fh.write('{}\n'.format(
                        entry.get('header').get('line')))
                    header_printed = True
                if not entry.get('valid'):
                    logger.record_event('EXPECTING_VALID_ENTRY',
                                        entry.get('where'))
                    continue
                if entry.get('passes_filter'):
                    output_fh.write(str(entry))
    sys.exit(ALLOK_EXIT_CODE)
예제 #5
0
def main(args):
    """Convert LDC annotation spans in args.input into normalized region rows,
    then write them sorted and deduplicated to args.output.

    Exits the process with ALLOK_EXIT_CODE on completion.
    """
    logger = Logger(args.log, args.log_specifications, sys.argv)

    # Map overlay ('_ov') type strings to their proper-cased canonical form.
    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.ontology_type_mappings):
        type_mappings.add(key=entry.get('full_type_ov'),
                          value=entry.get('full_type'))

    # Per-modality document boundaries used to expand whole-element spans.
    document_boundaries = {
        'text': TextBoundaries(logger, args.sentence_boundaries),
        'image': ImageBoundaries(logger, args.image_boundaries),
        'keyframe': KeyFrameBoundaries(logger, args.keyframe_boundaries),
        'video': VideoBoundaries(logger, args.video_boundaries)
    }

    output = []
    for entry in FileHandler(logger, args.input):
        document_id = entry.get('root_doc_id')
        document_element_id = entry.get('doc_element_id')
        modality = entry.get('media_type')
        # 'type_' avoids shadowing the builtin 'type'.
        type_ = entry.get('type')
        subtype = entry.get('subtype')
        subsubtype = entry.get('subsubtype')
        # apply patch to correct LDC's mistake in annotation
        if type_ == 'personalsocial' and subtype == 'unspecified':
            subtype = 'relationship'
        full_type = '{type}.{subtype}.{subsubtype}'.format(
            type=type_, subtype=subtype, subsubtype=subsubtype)
        full_type_cleaned = full_type.replace('.unspecified', '')
        propercased_full_type = type_mappings.get(full_type_cleaned, None)
        if propercased_full_type is None:
            logger.record_event(
                'DEFAULT_CRITICAL_ERROR',
                'propercased_full_type is None for full_type: {}'.format(
                    full_type))
        span_string = entry.get('span')
        keyframe_id = None
        keyframe_num = 0
        if span_string == 'ENTIRE_DOCUMENT_ELEMENT':
            # Expand to the full boundary of the element for its modality.
            document_boundary = document_boundaries.get(modality).get(
                document_element_id)
            span_string = str(document_boundary)
        elif '-' in span_string:
            # Text offsets 'start-end' become '(start,0)-(end,0)'.
            start, end = span_string.split('-')
            span_string = '({start},0)-({end},0)'.format(start=start, end=end)
        elif '_' in span_string:
            # Keyframe ids look like '<element>_<num>'.
            keyframe_id = span_string
            keyframe_num = span_string.split('_')[1]
            document_boundary = document_boundaries.get('keyframe').get(
                keyframe_id)
            span_string = str(document_boundary)
        else:
            span_string = None
        output.append({
            'document_id': document_id,
            'document_element_id': document_element_id,
            'keyframe_id': keyframe_id,
            'keyframe_num': int(keyframe_num),
            'modality': modality,
            'region': span_string,
            'type': propercased_full_type,
        })

    header = [
        'document_id', 'document_element_or_keyframe_id', 'modality', 'region',
        'type'
    ]
    # Track already-written lines so duplicates collapse to one row.
    printed = set()
    with open(args.output, 'w') as fh:
        fh.write('{}\n'.format('\t'.join(header)))
        for output_object in multisort(
                output, (('document_id', False), ('modality', False),
                         ('document_element_id', False),
                         ('keyframe_num', False), ('region', False),
                         ('type', False))):
            line = get_line(output_object, header)
            if line not in printed:
                fh.write('{}\n'.format(line))
                printed.add(line)
    sys.exit(ALLOK_EXIT_CODE)