Example #1
# Imports assumed from the brat code base and the pynlpl FoLiA library;
# convert() is defined in Example #2 below.
from annotation import Annotations
from pynlpl.formats import folia

def compare(path, doc):
    convert(path,doc)
    ann = Annotations(path+doc)
    fdoc = folia.Document(file=path+doc+".xml")
    #test entities
    for ent in ann.get_textbounds():
        try:
            found = fdoc[ent.id]
            text = [str(a) for a in found.wrefs()]
            if ent.tail.strip() != " ".join(text):
                print("error: entity not found")
                print(ent)
                return False
        except KeyError:
            print("error: entity not found")
            print(ent)
            return False
    #test relations
    for rel in ann.get_relations():
        try:
            found = fdoc[rel.id]
            # select() may return a generator; materialise it so both
            # any() scans below see all references.
            arefs = list(found.select(folia.AlignReference))
            if not (any(a.id == rel.arg1 for a in arefs) and any(a.id == rel.arg2 for a in arefs)):
                print("error: relation not found")
                print(rel)
                return False
        except KeyError:
            print("error: relation not found")
            print(rel)
            return False
    #test events
    for event in ann.get_events():
        try:
            found = fdoc[event.id]
            # Materialise the references: they are re-scanned for every argument.
            arefs = list(found.select(folia.AlignReference))
            for role, rid in event.args:
                if not any(a.id == rid for a in arefs):
                    print("error: event argument not found")
                    print(event)
                    return False
        except KeyError:
            print("error: event not found")
            print(event)
            return False
    #test attributes
    for attr in ann.get_attributes():
        try:
            found = fdoc[attr.target]
            if not any(fattr.cls == str(attr.value) and fattr.subset == attr.type for fattr in found.select(folia.Feature)):
                print("error: attribute not found")
                print(attr)
                return False
        except KeyError:
            print("error: attribute not found")
            print(attr)
            return False

    print "file "+path+doc+" is OK"
    return True
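
A minimal usage sketch for compare(); the corpus directory and document name are hypothetical placeholders. Note the trailing slash: compare() joins path and doc by plain string concatenation.

# Hypothetical paths: check that the FoLiA conversion of one brat
# document preserves all entities, relations, events and attributes.
if compare("/data/corpus/", "doc01"):
    print("conversion verified")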
Example #2
def convert(path, doc):
    # path is the project directory; doc is the document name without extension
    projectconf = ProjectConfiguration(path)
    path = path_join(path,doc)
    ann = Annotations(path+".ann")
    doc = build_text_structure(ann,path+".txt")
    add_relations(doc,ann)
    add_comments(doc,ann)
    #~ ent_set=xml(build_entity_set(doc))
    #~ rel_set=xml(build_relations_set(doc))
    #~ temp=open ("entiteit_set.xml",'w')
    #~ temp.write(ent_set)
    #~ temp.close()
    #~ rel=open ("relation_set.xml",'w')
    #~ rel.write(rel_set)
    #~ rel.close()
    doc.save(path+".xml")
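
A matching usage sketch with the same hypothetical paths. Unlike compare(), convert() joins path and doc with path_join, so no trailing slash is needed; the FoLiA XML is written next to the brat source files.

# Hypothetical paths: writes /data/corpus/doc01.xml from the
# doc01.ann/doc01.txt pair.
convert("/data/corpus", "doc01")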
Example #3
def convert(data, src):
    # Fail early if we don't have a converter
    try:
        conv_text, conv_ann = CONV_BY_SRC[src]
    except KeyError:
        raise InvalidSrcFormat

    # Note: Due to a lack of refactoring we need to write to disk to read
    #   annotations; once this is fixed, the code below needs some clean-up
    tmp_dir = None
    try:
        tmp_dir = mkdtemp()
        doc_base = path_join(tmp_dir, 'tmp')
        with open_textfile(doc_base + '.txt', 'w') as txt_file:
            txt_file.write(conv_text(data))
        # Create an empty .ann file so that Annotations below can open it
        with open(doc_base + '.ann', 'w'):
            pass

        with Annotations(doc_base) as ann_obj:
            for ann in conv_ann(data):
                ann_obj.add_annotation(ann)

        json_dic = _document_json_dict(doc_base)
        # Note: Blank the comments, they rarely do anything good but whine
        #   about configuration when we use the tool solely for visualisation
        #   purposes
        json_dic['comments'] = []

        # Note: This is an ugly hack... we want to ride along with the
        #   Stanford tokenisation and sentence splits when returning their
        #   output rather than relying on the ones generated by brat.
        if src.startswith('stanford-'):
            json_dic['token_offsets'] = stanford_token_offsets(data)
            json_dic['sentence_offsets'] = stanford_sentence_offsets(data)

        return json_dic
    finally:
        if tmp_dir is not None:
            rmtree(tmp_dir)
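
A usage sketch for this converter. The 'stanford-pos' key is only an assumption based on the src.startswith('stanford-') check above; the valid keys are whatever CONV_BY_SRC actually defines.

# Hypothetical input file and source-format key.
with open('parse_output.xml') as xml_file:
    json_dic = convert(xml_file.read(), 'stanford-pos')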
Example #4
def main(args):
    # Imports are here so we don't need to wait for them to load unnecessarily.
    import os
    import tkinter

    from video import Video
    from annotation import Annotations
    from state import State
    import gui

    # Parameters
    video_file_path = args.video_file_path
    annotation_file_path = args.annotation_file_path

    if annotation_file_path is None:
        # Expect the following dir structure:
        # dataset/
        # - videos/
        # - annotations/
        split_path = os.path.split(video_file_path)
        annotation_file_name = split_path[-1].split('.')[0] + '.pkl'
        annotation_file_dir = list(split_path[:-1]) + ['..', 'annotations']
        annotation_file_dir = os.path.join(*annotation_file_dir)
        if not os.path.isdir(annotation_file_dir):
            print('Invalid directory structure.')
            return
        annotation_file_path = os.path.join(annotation_file_dir,
                                            annotation_file_name)

    # Load Video
    video = Video(video_file_path)
    annotations = Annotations(annotation_file_path, video)
    state = State(video, annotations)

    # Create GUI
    gui.App(tkinter.Tk(), state)

    # When everything is done, release the video capture object
    video.close()
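
main() expects its arguments pre-parsed; below is a sketch of the argparse wiring it appears to assume. The positional and flag names mirror the attributes the function reads; everything else is hypothetical.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='video annotation tool')
    parser.add_argument('video_file_path', help='path to the video file')
    parser.add_argument('--annotation-file-path', default=None,
                        help='path to the .pkl annotation file; derived '
                             'from the dataset layout when omitted')
    main(parser.parse_args())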
Example #5
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"), ("Events", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])

                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
Example #6
def get_statistics(directory, base_names, use_cache=True):
    # Check if we have a cache of the costly statistics generation.
    # Only use it if no file in the directory is newer than the cache itself.
    cache_file_path = get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        if e.errno == 2:
            cache_mtime = -1
        else:
            raise

    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating' %
                        cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating' %
                    cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError as e:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating'
        )
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("实体", "int"), ("关系", "int"), ("事件", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("观点", "int"))

    stat_types.append(("修改者", "string"))
    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                # Fetch entities, relations, events and the modifying user here.
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])

                    try:
                        user = get_session().get('user')
                    except KeyError:
                        user = None

                    if user is None:
                        user = '******'

                    if options_get_validation(directory) == 'none':
                        docstats.append(
                            [tb_count, rel_count, event_count, user])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except BaseException:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append([
                            tb_count, rel_count, event_count, issue_count, user
                        ])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        try:
            user = get_session().get('user')
        except KeyError:
            user = None
        if user is None:
            user = '******'
        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s" %
                (directory, e))

    return stat_types, docstats
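
A usage sketch for get_statistics(); the directory and document base names are placeholders. Each row of docstats lines up positionally with stat_types.

# Hypothetical collection directory and document base names
# (file names without their .ann/.txt suffixes).
stat_types, docstats = get_statistics('/data/corpus', ['doc01', 'doc02'])
for row in docstats:
    print(row)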
Example #7
        full_name = temp_paths[1].replace("/", "")
        temp = open(app_path + full_name, 'wb')
        pickle_dump(sann, temp)
        temp.close()
    except Exception as e:
        Messager.error("Error while caching changes in the annotation file: " +
                       str(e))


def update_dump(j_dic, file_path):
    app_path = WORK_DIR + "/application/"
    temp_paths = file_path.split("/data/")
    try:
        full_name = temp_paths[1].replace("/", "")
        temp = open(app_path + full_name, 'wb')
        pickle_dump(j_dic, temp)
        temp.close()
    except Exception as e:
        Messager.error("Error while caching changes in the annotation file: " +
                       str(e))
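
Note that update_dump() derives the cache file name from everything after the first '/data/' segment of file_path (with slashes stripped); a path without that segment fails inside the try and is only reported through Messager. A sketch, assuming j_dic is a document JSON dictionary and WORK_DIR points at an existing work directory:

# Pickles j_dic to WORK_DIR + '/application/corpusdoc01'
# (the part after '/data/', slashes removed).
update_dump(j_dic, '/home/user/brat/data/corpus/doc01')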


if __name__ == '__main__':
    millis = int(round(time.time() * 1000))
    print(millis)
    ann = Annotations("/home/sander/Documents/Masterproef/brat/data/test")
    sann = SimpleAnnotations(ann)
    print(filter_folia(sann))
    millis = int(round(time.time() * 1000)) - millis
    print(millis)
Example #8
                    temp.append(i)
                    ids.add(i)
        if isinstance(temp_ann, BinaryRelationAnnotation):
            for i in temp_ann.get_deps()[1]:
                if i not in ids:
                    temp.append(i)
                    ids.add(i)
    for i in temp:
        temp_ann = ann.get_ann_by_id(i)
        if isinstance(temp_ann, TextBoundAnnotation):
            recursive_ann(temp_ann, con, ids, ann)


if __name__ == "__main__":
    from annotation import TextBoundAnnotation, TextAnnotations, EventAnnotation, BinaryRelationAnnotation
    proj = ProjectConfiguration(
        "/home/sander/Documents/Masterproef/brat/data/brat_vb/sentiment")
    ann = Annotations(
        "/home/sander/Documents/Masterproef/brat/data/brat_vb/sentiment/sporza"
    )

    # SPEED TEST
    import time
    millis = int(round(time.time() * 1000))
    print(millis)
    vrules = ValidationRules(proj)
    for i in vrules.validate(ann)[0]:
        print(str(i))
    millis = int(round(time.time() * 1000)) - millis
    print(millis)