Example #1
def generate_confidence_intervals(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    if not args.input.endswith('.tab'):
        logger.record_event('DEFAULT_CRITICAL_ERROR',
                            'input filename should be a *.tab.')
    aggregate = {}
    for element in args.aggregate.split(','):
        key, value = element.split(':')
        if key not in aggregate:
            aggregate[key] = []
        aggregate[key].append(value)
    confidence_interval = ConfidenceIntervals(logger,
                                              macro=args.macro,
                                              input=args.input,
                                              primary_key_col=args.primary_key,
                                              score=args.score,
                                              aggregate=aggregate,
                                              document_id_col=args.document_id,
                                              run_id_col=args.run_id,
                                              sizes=args.sizes,
                                              seed_value=args.seed)
    output = {'pretty': args.pretty_output, 'tab': args.tab_output}
    for output_format in output:
        fh = open(output[output_format], 'w')
        fh.write(confidence_interval.get('output', output_format))
        fh.close()
    exit(ALLOK_EXIT_CODE)
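
For reference, the key-grouping loop above (splitting the aggregate argument on ',' and ':') is equivalent to collecting values into a collections.defaultdict; a minimal sketch, with the 'key:value,key:value' argument format inferred from the split logic:

from collections import defaultdict

def parse_aggregate(spec):
    # Equivalent of the loop above: 'key1:a,key1:b,key2:c' (format
    # inferred from the splits above) becomes
    # {'key1': ['a', 'b'], 'key2': ['c']}.
    aggregate = defaultdict(list)
    for element in spec.split(','):
        key, value = element.split(':')
        aggregate[key].append(value)
    return dict(aggregate)
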
Example #2
def align_clusters(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)

    document_mappings = DocumentMappings(
        logger, args.parent_children, Encodings(logger, args.encodings),
        CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }

    annotated_regions = AnnotatedRegions(logger, document_mappings,
                                         document_boundaries, args.regions)

    os.mkdir(args.similarities)
    os.mkdir(args.alignment)
    for entry in sorted(os.scandir(args.gold), key=str):
        if entry.is_dir() and entry.name.endswith('.ttl'):
            kb = entry.name
            message = 'aligning clusters in {}'.format(entry.name)
            logger.record_event('DEFAULT_INFO', message)
            print('At {}: {}'.format(
                time.strftime("%m/%d/%Y %H:%M:%S", time.localtime()), message))
            document_id = kb.replace('.ttl', '')

            gold_mentions = '{}/{}/AIDA_P2_TA1_CM_A0001.rq.tsv'.format(
                args.gold, kb)
            gold_edges = '{}/{}/AIDA_P2_TA1_AM_A0001.rq.tsv'.format(
                args.gold, kb)
            system_mentions = '{}/{}/AIDA_P2_TA1_CM_A0001.rq.tsv'.format(
                args.system, kb)
            system_edges = '{}/{}/AIDA_P2_TA1_AM_A0001.rq.tsv'.format(
                args.system, kb)

            gold_mentions = gold_mentions if os.path.exists(
                gold_mentions) else None
            gold_edges = gold_edges if os.path.exists(gold_edges) else None
            system_mentions = system_mentions if os.path.exists(
                system_mentions) else None
            system_edges = system_edges if os.path.exists(
                system_edges) else None

            similarities = '{}/{}.tab'.format(args.similarities, document_id)
            alignment = '{}/{}.tab'.format(args.alignment, document_id)
            check_for_paths_non_existance([similarities, alignment])
            clusters = Clusters(logger, document_mappings, document_boundaries,
                                annotated_regions, gold_mentions, gold_edges,
                                system_mentions, system_edges)
            clusters.print_similarities(similarities)
            clusters.print_alignment(alignment)
    exit(ALLOK_EXIT_CODE)
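
check_for_paths_non_existance is not defined in these snippets; presumably it guards against clobbering existing output. A minimal sketch of such a guard, with the exact error handling assumed:

import os
import sys

def check_for_paths_non_existance(paths):
    # Hypothetical sketch: refuse to proceed if any output path already
    # exists, so results from a previous run are never overwritten.
    for path in paths:
        if path is not None and os.path.exists(path):
            sys.exit('ERROR: path already exists: {}'.format(path))
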
Example #3
def filter_responses(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)

    ontology_type_mappings = OntologyTypeMappings(logger,
                                                  args.ontology_type_mappings)
    slot_mappings = SlotMappings(logger, args.slot_mappings)
    document_mappings = DocumentMappings(
        logger, args.parent_children, Encodings(logger, args.encodings),
        CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }

    responses = ResponseSet(logger, ontology_type_mappings, slot_mappings,
                            document_mappings, document_boundaries, args.input,
                            args.runid)
    annotated_regions = AnnotatedRegions(logger, document_mappings,
                                         document_boundaries, args.regions)
    run_filter_on_all_responses(responses, annotated_regions,
                                document_mappings, document_boundaries)

    os.mkdir(args.output)
    for input_filename in responses:
        output_filename = input_filename.replace(responses.get('path'),
                                                 args.output)
        dirname = os.path.dirname(output_filename)
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        output_fh = open(output_filename, 'w')
        header_printed = False
        for linenum in sorted(responses.get(input_filename), key=int):
            entry = responses.get(input_filename).get(str(linenum))
            if not header_printed:
                output_fh.write('{}\n'.format(entry.get('header').get('line')))
                header_printed = True
            if not entry.get('valid'):
                logger.record_event('EXPECTING_VALID_ENTRY',
                                    entry.get('where'))
                continue
            if entry.get('passes_filter'):
                output_fh.write(str(entry))
        output_fh.close()
    exit(ALLOK_EXIT_CODE)
Example #4
 def __init__(self, log, batch_id, kit_size, previous_pools,
              log_specifications, encodings, core_documents,
              parent_children, sentence_boundaries, image_boundaries,
              keyframe_boundaries, video_boundaries, runs_to_pool, queries,
              input_dir, output_dir):
     check_for_paths_existance([
         log_specifications, encodings, core_documents, parent_children,
         sentence_boundaries, image_boundaries, keyframe_boundaries,
         video_boundaries, runs_to_pool, queries, input_dir
     ])
     check_for_paths_non_existance(
         ['{}-{}'.format(output_dir, batch_id)])
     self.log_filename = log
     self.batch_id = batch_id
     self.kit_size = kit_size
     self.previous_pools = previous_pools
     self.log_specifications = log_specifications
     self.encodings = encodings
     self.core_documents = core_documents
     self.parent_children = parent_children
     self.sentence_boundaries = sentence_boundaries
     self.image_boundaries = image_boundaries
     self.keyframe_boundaries = keyframe_boundaries
     self.video_boundaries = video_boundaries
     self.runs_to_pool = runs_to_pool
     self.queries = queries
     self.input = input_dir
     self.output = output_dir
     self.logger = Logger(self.get('log_filename'),
                          self.get('log_specifications'), sys.argv)
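
These constructors read their own attributes back through self.get('...'); the accessor itself is not shown. A minimal sketch of a base-class get() that falls back to instance attributes (the get_<name>() dispatch is an assumption):

class Object(object):
    # Hypothetical accessor: prefer a dedicated get_<name>() method when
    # one exists, otherwise return the attribute of the same name
    # (None if missing).
    def get(self, name, *args):
        method = getattr(self, 'get_{}'.format(name), None)
        if callable(method):
            return method(*args)
        return getattr(self, name, None)
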
Example #5
def main(args):
    """
    The main program for generating AIF
    """
    check_paths(args)
    logger = Logger(args.log, args.log_specifications_filename, sys.argv)
    core_documents = CoreDocuments(logger, args.core_documents_filename)
    encodings = Encodings(logger, args.encodings_filename)
    document_mappings = DocumentMappings(logger, args.parent_children_filename,
                                         encodings, core_documents)
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries_filename)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries_filename)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries_filename)
    keyframe_boundaries = KeyFrameBoundaries(logger,
                                             args.keyframe_boundaries_filename)
    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.type_mappings_filename):
        type_mappings.add(key=entry.get('full_type_ov'),
                          value=entry.get('full_type'))
    slot_mappings = SlotMappings(logger, args.slot_mappings_filename)
    annotations = Annotations(logger,
                              slot_mappings,
                              document_mappings,
                              text_boundaries,
                              image_boundaries,
                              video_boundaries,
                              keyframe_boundaries,
                              type_mappings,
                              args.annotations,
                              load_video_time_offsets_flag=args.notime)
    generator = AIFGenerator(logger, annotations, args.nochannel,
                             args.reference_kb_id)
    generator.write_output(args.output)
    exit(ALLOK_EXIT_CODE)
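
The type-mappings file loaded here is a two-column TSV keyed by full_type_ov; Example #8 below writes a file in exactly that layout. A minimal stand-alone sketch of reading it into a plain dict, without the FileHandler/Container helpers:

import csv

def load_type_mappings(path):
    # Read the full_type_ov -> full_type TSV (as written by Example #8)
    # into an ordinary dictionary.
    with open(path, newline='') as fh:
        return {row['full_type_ov']: row['full_type']
                for row in csv.DictReader(fh, delimiter='\t')}
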
Example #6
def score_submission(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)

    ontology_type_mappings = OntologyTypeMappings(logger, args.ontology_type_mappings)
    slot_mappings = SlotMappings(logger, args.slot_mappings)
    document_mappings = DocumentMappings(logger,
                                         args.parent_children,
                                         Encodings(logger, args.encodings),
                                         CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }

    gold_responses = ResponseSet(logger, ontology_type_mappings, slot_mappings,
                                 document_mappings, document_boundaries,
                                 args.gold, 'gold')
    system_responses = ResponseSet(logger, ontology_type_mappings,
                                   slot_mappings, document_mappings,
                                   document_boundaries, args.system,
                                   args.runid)
    cluster_alignment = ClusterAlignment(logger, args.alignment)
    cluster_self_similarities = ClusterSelfSimilarities(logger,
                                                        args.similarities)
    scores = ScoresManager(logger, gold_responses, system_responses,
                           cluster_alignment, cluster_self_similarities,
                           args.separator)
    scores.print_scores(args.scores)
    exit(ALLOK_EXIT_CODE)
Example #7
def main(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)

    os.mkdir(args.sparql)

    columns = ['query_id', 'entrypoint_type', 'entrypoint', 'num_clusters', 'depth']

    queries_fh = open(args.queries, 'w')
    queries_fh.write('{}\n'.format('\t'.join(columns)))
    query_num = 0
    for entry in FileHandler(logger, args.input):
        query_num += 1
        values = {
            'depth'          : args.depth,
            'entrypoint_type': entry.get('entrypoint_type'),
            'entrypoint'     : entry.get('entrypoint'),
            'num_clusters'   : entry.get('num_clusters'),
            'query_id'       : '{prefix}{query_num}'.format(
                prefix=args.prefix, query_num=augment(query_num))
            }
        line = '\t'.join([values[column] for column in columns])
        queries_fh.write('{}\n'.format(line))

        sparql_query_fh = open('{dir}/{query_id}.rq'.format(
            dir=args.sparql, query_id=values['query_id']), 'w')
        sparql_query_fh.write(get_sparql(logger,
                                         values['query_id'],
                                         values['entrypoint_type'],
                                         values['entrypoint']))
        sparql_query_fh.close()
        
    queries_fh.close()

    exit(ALLOK_EXIT_CODE)
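
augment() is not shown; judging by its use in query_id, it presumably zero-pads the running query number so that generated IDs sort lexicographically. A minimal sketch under that assumption:

def augment(num, width=4):
    # Hypothetical: left-pad with zeros, e.g. 7 -> '0007', so that
    # '<prefix>0007' sorts before '<prefix>0012'.
    return str(num).zfill(width)
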
Example #8
def main(args):
    check_if_path_exists(args)
    logger = Logger(args.log, args.log_specifications_filename, sys.argv)
    ontology = Ontology(logger, args.entities_ontology_filename,
                        args.relations_ontology_filename,
                        args.events_ontology_filename)
    mapping = {}
    for ere_container in [
            ontology.get('entities'),
            ontology.get('relations'),
            ontology.get('events')
    ]:
        for spec in ere_container.values():
            full_type = spec.get('cleaned_full_type')
            full_type_ov = spec.get('cleaned_full_type_ov')
            if full_type is None or full_type_ov is None:
                continue
            mapping[full_type_ov] = full_type

    program_output = open(args.output_filename, 'w')
    program_output.write('full_type_ov\tfull_type\n')
    for full_type_ov, full_type in mapping.items():
        program_output.write('{}\t{}\n'.format(full_type_ov, full_type))
    program_output.close()
    exit(ALLOK_EXIT_CODE)
Example #9
def validate_responses(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)

    logger.record_event('DEFAULT_INFO', 'validation started')
    document_mappings = DocumentMappings(
        logger, args.parent_children, Encodings(logger, args.encodings),
        CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }

    queries = TA3QuerySet(logger, args.queries) if args.queries else None
    responses = ResponseSet(logger,
                            document_mappings,
                            document_boundaries,
                            args.input,
                            args.runid,
                            args.task,
                            queries=queries)
    responses.write_valid_responses(args.output)
    num_warnings, num_errors = logger.get_stats()
    closing_message = 'validation finished (warnings:{}, errors:{})'.format(
        num_warnings, num_errors)
    logger.record_event('DEFAULT_INFO', closing_message)
    print(closing_message)
    if num_errors > 0:
        exit(ERROR_EXIT_CODE)
    exit(ALLOK_EXIT_CODE)
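
ALLOK_EXIT_CODE and ERROR_EXIT_CODE are module-level constants in these tools; the snippets only require that the first signals success and the second signals failure, e.g. (values assumed):

ALLOK_EXIT_CODE = 0    # assumed conventional success status
ERROR_EXIT_CODE = 255  # assumed non-zero status reported on errors
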
Example #10
 def __init__(self, log, log_specifications, queries, output):
     check_for_paths_existance([log_specifications, queries, output])
     check_for_paths_non_existance([])
     self.log_filename = log
     self.log_specifications = log_specifications
     self.queries = queries
     self.output = output
     self.logger = Logger(self.get('log_filename'),
                          self.get('log_specifications'), sys.argv)
Example #11
 def __init__(self, log_filename, log_specifications, task, input_dir,
              output_dir):
     check_for_paths_existance([log_specifications, input_dir])
     check_for_paths_non_existance([output_dir])
     self.log_filename = log_filename
     self.log_specifications = log_specifications
     self.task = task
     self.input_dir = input_dir
     self.output_dir = output_dir
     self.logger = Logger(self.get('log_filename'),
                          self.get('log_specifications'), sys.argv)
Example #12
 def __init__(self, log, batch_id, previous_pools, log_specifications,
              queries_to_pool, runs_to_pool, input_dir, output_dir):
     check_for_paths_existance(
         [log_specifications, runs_to_pool, queries_to_pool, input_dir])
     check_for_paths_non_existance(['{}-{}'.format(output_dir, batch_id)])
     self.log_filename = log
     self.batch_id = batch_id
     self.previous_pools = previous_pools
     self.log_specifications = log_specifications
     self.runs_to_pool = runs_to_pool
     self.queries_to_pool = queries_to_pool
     self.input = input_dir
     self.output = output_dir
     self.logger = Logger(self.get('log_filename'),
                          self.get('log_specifications'), sys.argv)
Example #13
def clean_sparql_output(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)
    filenames = []
    for root, dirs, files in os.walk(args.input):
        filenames.extend([
            os.path.join(root, file) for file in files if file.endswith('.tsv')
        ])
    os.mkdir(args.output)
    for input_filename in filenames:
        output_root = args.output
        output_basename = os.path.basename(input_filename)
        output_subdir = input_filename.replace(args.input, '').replace(
            output_basename, '').rstrip('/').lstrip('/')
        output_directory = '{}/{}'.format(output_root, output_subdir)
        output_filename = '{}/{}'.format(output_directory, output_basename)
        os.makedirs(output_directory, exist_ok=True)
        clean_a_sparql_output_file(logger, input_filename, output_filename)
    exit(ALLOK_EXIT_CODE)
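
The replace/strip manipulation above rebuilds each input file's subdirectory under args.output; an equivalent and arguably clearer sketch using os.path.relpath (assuming, as os.walk guarantees here, that the input root is a prefix of every walked path):

import os

def mirrored_output_path(input_filename, input_root, output_root):
    # Keep the file's path relative to the input root and re-anchor it
    # under the output root, e.g. in/a/b.tsv -> out/a/b.tsv.
    relative = os.path.relpath(input_filename, input_root)
    return os.path.join(output_root, relative)
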
Example #14
def validate_responses(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)

    ontology_type_mappings = OntologyTypeMappings(logger, args.ontology_type_mappings)
    slot_mappings = SlotMappings(logger, args.slot_mappings)
    document_mappings = DocumentMappings(logger,
                                         args.parent_children,
                                         Encodings(logger, args.encodings),
                                         CoreDocuments(logger, args.core_documents))
    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }

    responses = ResponseSet(logger, ontology_type_mappings, slot_mappings,
                            document_mappings, document_boundaries, args.input,
                            args.runid)
    responses.write_valid_responses(args.output)
    exit(ALLOK_EXIT_CODE)
Example #15
def main(args):
    logger = Logger(args.log, args.log_specifications, sys.argv)

    type_mappings = Container(logger)
    for entry in FileHandler(logger, args.ontology_type_mappings):
        type_mappings.add(key=entry.get('full_type_ov'),
                          value=entry.get('full_type'))

    text_boundaries = TextBoundaries(logger, args.sentence_boundaries)
    image_boundaries = ImageBoundaries(logger, args.image_boundaries)
    video_boundaries = VideoBoundaries(logger, args.video_boundaries)
    keyframe_boundaries = KeyFrameBoundaries(logger, args.keyframe_boundaries)
    document_boundaries = {
        'text': text_boundaries,
        'image': image_boundaries,
        'keyframe': keyframe_boundaries,
        'video': video_boundaries
    }

    output = []
    for entry in FileHandler(logger, args.input):
        document_id = entry.get('root_doc_id')
        document_element_id = entry.get('doc_element_id')
        modality = entry.get('media_type')
        type = entry.get('type')
        subtype = entry.get('subtype')
        subsubtype = entry.get('subsubtype')
        full_type = '{type}.{subtype}.{subsubtype}'.format(
            type=type, subtype=subtype, subsubtype=subsubtype)
        full_type_cleaned = full_type.replace('.unspecified', '')
        propercased_full_type = type_mappings.get(full_type_cleaned, None)
        span_string = entry.get('span')
        keyframe_id = None
        keyframe_num = 0
        if span_string == 'ENTIRE_DOCUMENT_ELEMENT':
            document_boundary = document_boundaries.get(modality).get(
                document_element_id)
            span_string = str(document_boundary)
        elif '-' in span_string:
            start, end = span_string.split('-')
            span_string = '({start},0)-({end},0)'.format(start=start, end=end)
        elif '_' in span_string:
            keyframe_id = span_string
            keyframe_num = span_string.split('_')[1]
            document_boundary = document_boundaries.get('keyframe').get(
                keyframe_id)
            span_string = str(document_boundary)
        else:
            span_string = None
        output_object = {
            'document_id': document_id,
            'document_element_id': document_element_id,
            'keyframe_id': keyframe_id,
            'keyframe_num': int(keyframe_num),
            'modality': modality,
            'region': span_string,
            'type': propercased_full_type,
        }
        output.append(output_object)

    printed = {}
    fh = open(args.output, 'w')
    header = [
        'document_id', 'document_element_or_keyframe_id', 'modality', 'region',
        'type'
    ]
    fh.write('{}\n'.format('\t'.join(header)))
    for output_object in multisort(
            output, (('document_id', False), ('modality', False),
                     ('document_element_id', False), ('keyframe_num', False),
                     ('region', False), ('type', False))):
        line = get_line(output_object, header)
        if line not in printed:
            fh.write('{}\n'.format(line))
            printed[line] = 1
    fh.close()
    exit(ALLOK_EXIT_CODE)
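
multisort() is not defined in the snippet; from its call site it sorts the output records on several keys in order of significance, each with a reverse flag. A minimal sketch relying on Python's stable sort (pushing None values to the end is an assumption, since 'region' and 'type' can be None above):

def multisort(items, specs):
    # Hypothetical: sort on each key from least to most significant;
    # because sorted() is stable, earlier passes act as tie-breakers.
    # None values sort after everything else instead of raising TypeError.
    for key, reverse in reversed(specs):
        items = sorted(items,
                       key=lambda item: (item[key] is None, item[key]),
                       reverse=reverse)
    return items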