def correct_annotations(orig_fn, ann_fn, change_fn):
    with annotation.TextAnnotations(ann_fn) as anns:
        orig_text = anns.get_document_text()
        with annotation.open_textfile(change_fn, 'r') as f:
            changed_text = f.read()
        diffs = diff_match_patch().diff_main(orig_text, changed_text)
        orig_offset = 0
        change_offset = 0
        offsets = []
        for diff in diffs:
            kind = diff[0]
            text = diff[1]
            size = len(text)
            delta = size * kind
            offsets.append((orig_offset, delta))
            if kind != 1:
                orig_offset += size
        offsets = offsets[::-1]        
        tbs = list(anns.get_textbounds())
        indices = []
        for tbi, tb in enumerate(tbs):
            for spani, span in enumerate(tb.spans):
                indices.append((span[0], tbi, spani, 0))
                indices.append((span[1], tbi, spani, 1))
        indices.sort(reverse=True)
        for orig_offset, delta in offsets:
            for index in indices:
                if index[0] < orig_offset: break
                frag = list(tbs[index[1]].spans[index[2]])
                frag[index[3]] += delta
                tbs[index[1]].spans[index[2]] = tuple(frag)
        for tb in tbs:
            if isinstance(tb, annotation.TextBoundAnnotationWithText):
                tb.text = annotation.DISCONT_SEP.join((changed_text[start:end] for start, end in tb.spans))
    copy(change_fn, orig_fn)
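The offset arithmetic in correct_annotations comes straight from diff_match_patch: diff_main() returns (kind, text) pairs where kind is -1 for a deletion, 0 for an equal run and 1 for an insertion, so len(text) * kind is the signed length change anchored at the current original offset. A minimal standalone sketch of that remapping, with made-up texts and a made-up span (requires the pip-installable diff-match-patch package):

from diff_match_patch import diff_match_patch

orig_text = 'Obama visited Paris today.'
changed_text = 'Barack Obama visited Paris yesterday.'

# Build the same (original_offset, signed_delta) table as correct_annotations above.
diffs = diff_match_patch().diff_main(orig_text, changed_text)
orig_offset = 0
offsets = []
for kind, text in diffs:
    offsets.append((orig_offset, len(text) * kind))
    if kind != 1:               # insertions consume no original characters
        orig_offset += len(text)

# Remap an annotation span over 'Paris' (original offsets 14-19): every change
# anchored at or before a boundary shifts that boundary by its delta.
start, end = 14, 19
new_start = start + sum(delta for at, delta in offsets if at <= start)
new_end = end + sum(delta for at, delta in offsets if at <= end)
print(changed_text[new_start:new_end])    # expected output: Paris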
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)
    #hdrs = [('Content-Type', 'text/plain; charset=utf-8'), ('Content-Disposition', 'inline; filename=%s' % fname)]
    hdrs = [('Content-Type', 'application/octet-stream'), ('Content-Disposition', 'inline; filename=%s' % fname)]
    if allowed_to_read(fpath):
        if not exists(fpath):
            data = ""
            if extension == "zip":
                import zipfile
                zipf = zipfile.ZipFile(fpath, 'w')
                zipf.close()
                with open(fpath, 'rb') as txt_file:
                    data = txt_file.read()
        else:
            if extension != "zip":
                with open_textfile(fpath, 'r') as txt_file:
                    data = txt_file.read().encode('utf-8')
            else:
                with open(fpath, 'rb') as txt_file:
                    data = txt_file.read()
    else:
        data = "Access Denied"
    raise NoPrintJSONError(hdrs, data)
Example #3
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, "r") as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error("Error reading text file: nonstandard encoding or binary?", -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic["text"] = text

    from logging import info as log_info

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == "mecab":
        from tokenise import jp_token_boundary_gen

        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == "whitespace":
        from tokenise import whitespace_token_boundary_gen

        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == "ptblike":
        from tokenise import gtb_token_boundary_gen

        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning("Unrecognized tokenisation option " ", reverting to whitespace tokenisation.")
        from tokenise import whitespace_token_boundary_gen

        tok_offset_gen = whitespace_token_boundary_gen
    j_dic["token_offsets"] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == "newline":
        from ssplit import newline_sentence_boundary_gen

        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == "regex":
        from ssplit import regex_sentence_boundary_gen

        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning("Unrecognized sentence splitting option " ", reverting to newline sentence splitting.")
        from ssplit import newline_sentence_boundary_gen

        ss_offset_gen = newline_sentence_boundary_gen
    j_dic["sentence_offsets"] = [o for o in ss_offset_gen(text)]

    return True
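For orientation, _enrich_json_with_text only adds three keys to the dictionary it is given. A hedged sketch of a call and of the rough shape of the result (the path is made up, and the exact offsets depend on the configured tokeniser and sentence splitter):

j_dic = {}
_enrich_json_with_text(j_dic, '/data/example/doc.txt',
                       raw_text='Hello brat users.\nSecond sentence.\n')
# j_dic now holds, roughly:
#   j_dic['text']             -> the raw text passed in above
#   j_dic['token_offsets']    -> [(0, 5), (6, 10), ...]   one (start, end) pair per token
#   j_dic['sentence_offsets'] -> [(0, 17), (18, 34), ...] one (start, end) pair per sentence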
Example #4
def __read_or_default(filename, default):
    try:
        f = open_textfile(filename, 'r')
        r = f.read()
        f.close()
        return r
    except:
        # TODO: specific exception handling and reporting
        return default
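The TODO in the example above asks for specific exception handling; one possible sketch of what that could look like (the function name is illustrative; open_textfile is brat's reader, as in the other examples):

from annotation import open_textfile

def _read_or_default(filename, default):
    # Illustrative variant of __read_or_default with narrower exception handling.
    try:
        with open_textfile(filename, 'r') as f:
            return f.read()
    except IOError:
        # missing or unreadable file: fall back to the default, as before
        return default
    except UnicodeDecodeError:
        # unexpected encoding: still fall back, though this may be worth reporting
        return default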
Example #6
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = "%s.%s" % (document, extension)
    fpath = path_join(real_dir, fname)

    hdrs = [("Content-Type", "text/plain; charset=utf-8"), ("Content-Disposition", "inline; filename=%s" % fname)]
    with open_textfile(fpath, "r") as txt_file:
        data = txt_file.read().encode("utf-8")
    raise NoPrintJSONError(hdrs, data)
Example #7
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    from logging import info as log_info

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option %r'
                         ', reverting to whitespace tokenisation.' % tokeniser)
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option %r'
                         ', reverting to newline sentence splitting.' % ssplitter)
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
Example #8
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)

    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]
    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read().encode('utf-8')
    raise NoPrintJSONError(hdrs, data)
Example #9
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)

    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition',
                'inline; filename=%s' % fname)]
    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read()
    raise NoPrintJSONError(hdrs, data)
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option %r'
                ', reverting to whitespace tokenisation.' % tokeniser)
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option %r'
                ', reverting to newline sentence splitting.' % ssplitter)
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
Example #11
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry, the client currently crashing on
    # chrome for two or more consecutive space, so replace every
    # second with literal non-breaking space. Note that this is just
    # for the client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' ' + unichr(0x00A0))

    j_dic['text'] = text

    from logging import info as log_info

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(offsets))
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(sentence_offsets))
    j_dic['sentence_offsets'] = sentence_offsets

    return True
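The non-breaking-space workaround above is safe for the offset computations that follow because the replacement keeps the string length unchanged (each two-character run of spaces becomes two characters again). A quick Python 2 check with a made-up string:

text = u'badly  spaced    text'
fixed = text.replace(u'  ', u' ' + unichr(0x00A0))
assert len(fixed) == len(text)    # offsets computed on the fixed text still line up
assert u'  ' not in fixed         # no run of two ASCII spaces survives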
Example #12
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry, the client currently crashing on
    # chrome for two or more consecutive space, so replace every
    # second with literal non-breaking space. Note that this is just
    # for the client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' '+unichr(0x00A0))

    j_dic['text'] = text
    
    from logging import info as log_info

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(offsets))
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(sentence_offsets))
    j_dic['sentence_offsets'] = sentence_offsets

    return True
Example #13
def __create_span(ann_obj, mods, type, start, end, txt_file_path,
        projectconf, attributes):
    # TODO: Rip this out!
    start = int(start)
    end = int(end)

    # Before we add a new trigger, does it already exist?
    found = None
    for tb_ann in ann_obj.get_textbounds():
        try:
            if (tb_ann.start == start and tb_ann.end == end
                    and tb_ann.type == type):
                found = tb_ann
                break
        except AttributeError:
            # Not a trigger then
            pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T') #XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()[start:end]

        #TODO: Data tail should be optional
        if '\n' not in text:
            ann = TextBoundAnnotationWithText(start, end, new_id, type, text)
            ann_obj.add_annotation(ann)
            mods.addition(ann)
        else:
            ann = None
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E') #XXX: Cons
            event = EventAnnotation(ann.id, [], unicode(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
Example #14
def save_import(title, text, docid, collection=None):
    '''
    TODO: DOC:
    '''

    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)

        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF
    dat_path = base_path + '.' + DATA_FILE_SUFFIX

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    if text != "" and text[-1] != '\n':
        text = text + '\n'

    with open_textfile(txt_path, 'w') as txt_file:
        txt_file.write(title + '\n' + text)

    # Touch the ann file so that we can edit the file later
    with open(ann_path, 'w') as _:
        pass

    # Touch the dat file so that we can edit the file later
    with open(dat_path, 'w') as _:
        pass

    return { 'document': docid }
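A hedged usage sketch for save_import; the collection name and document id are made up, and the file suffixes are assumed to be brat's usual txt/ann (plus the dat file this variant also touches):

# Illustrative call: 'news' must be an existing, writable collection directory.
result = save_import('A title line', 'Body of the imported document.',
                     'doc-0001', collection='news')
# result -> {'document': 'doc-0001'}
# Side effects: a doc-0001.txt file is written with the title on its first line,
# and empty doc-0001.ann / doc-0001.dat files are touched so that they can be
# edited later. Importing the same docid again raises FileExistsError.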
Example #15
def __create_span(ann_obj, mods, type, offsets, txt_file_path,
        projectconf, attributes):
    # Before we add a new trigger, does an equivalent one already exist?
    found = None
    for tb_ann in ann_obj.get_textbounds():
        try:
            if _offsets_equal(tb_ann.spans, offsets) and tb_ann.type == type:
                found = tb_ann
                break
        except AttributeError:
            # Not a trigger then
            pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T') #XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            # TODO discont: use offsets instead (note need for int conversion)
            text = _text_for_offsets(txt_file.read(), offsets)

        #TODO: Data tail should be optional
        if '\n' not in text:
            ann = TextBoundAnnotationWithText(offsets[:], new_id, type, text)
            ann_obj.add_annotation(ann)
            mods.addition(ann)
        else:
            ann = None
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E') #XXX: Cons
            event = EventAnnotation(ann.id, [], unicode(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
Example #16
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)

    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]

    #Folia conversion added by Sander Naert
    from brat2folia import convert
    if extension == 'xml':
        convert(real_dir, document)
        #convert to folia

    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read().encode('utf-8')
    raise NoPrintJSONError(hdrs, data)
Example #17
def save_import(title, text, docid, collection=None):
    '''
    TODO: DOC:
    '''

    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)

        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    if text != "" and text[-1] != '\n':
        text = text + '\n'

    with open_textfile(txt_path, 'w') as txt_file:
        txt_file.write(title + '\n' + text)

    # Touch the ann file so that we can edit the file later
    with open(ann_path, 'w') as _:
        pass

    return { 'document': docid }
Example #18
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)

    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition',
                'inline; filename=%s' % fname)]

    #Folia conversion added by Sander Naert
    from brat2folia import convert
    if extension == 'xml':
        convert(real_dir, document)
        # convert to folia

    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read().encode('utf-8')
    raise NoPrintJSONError(hdrs, data)
Example #19
def convert(data, src):
    # Fail early if we don't have a converter
    try:
        conv_text, conv_ann = CONV_BY_SRC[src]
    except KeyError:
        raise InvalidSrcFormat

    # Note: Due to a lack of refactoring we need to write to disk to read
    #   annotions, once this is fixed, the below code needs some clean-up
    tmp_dir = None
    try:
        tmp_dir = mkdtemp()
        doc_base = path_join(tmp_dir, 'tmp')
        with open_textfile(doc_base + '.txt', 'w') as txt_file:
            txt_file.write(conv_text(data))
        with open(doc_base + '.ann', 'w'):
            pass

        with Annotations(doc_base) as ann_obj:
            for ann in conv_ann(data):
                ann_obj.add_annotation(ann)

        json_dic = _document_json_dict(doc_base)
        # Note: Blank the comments, they rarely do anything good but whine
        #   about configuration when we use the tool solely for visualisation
        #   purposes
        json_dic['comments'] = []

        # Note: This is an ugly hack... we want to ride along with the
        #   Stanford tokenisation and sentence splits when returning their
        #   output rather than relying on the ones generated by brat.
        if src.startswith('stanford-'):
            json_dic['token_offsets'] = stanford_token_offsets(data)
            json_dic['sentence_offsets'] = stanford_sentence_offsets(data)

        return json_dic
    finally:
        if tmp_dir is not None:
            rmtree(tmp_dir)
def export_document(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, 'txt')
    fpath = path_join(real_dir, fname)
    rr = None
    if allowed_to_read(fpath):
        rr = ReadProject()
        owlfile, ttlfile = rr.read_project(real_dir, document, extension)
        fpaths = owlfile if extension[0:3] == 'owl' else ttlfile
        if extension[-1] == 's':
            fname = '%s.%s' % (document, "zip")
            hdrs = [('Content-Type', 'application/octet-stream'), ('Content-Disposition', 'inline; filename=%s' % fname)]
            from zipfile import ZipFile
            with ZipFile(path_join(real_dir, document) + ".zip", "w") as outfile:
                for f in fpaths:
                    with open(f) as infile:
                        outfile.writestr(f.split('/')[-1], infile.read())
            with open(path_join(real_dir, document) + ".zip", 'rb') as tmp_file:
                data = tmp_file.read()
        else:
            fname = '%s.%s' % (document, extension)
            #hdrs = [('Content-Type', 'text/plain; charset=utf-8'), ('Content-Disposition', 'inline; filename=%s' % fname)]
            hdrs = [('Content-Type', 'application/octet-stream'), ('Content-Disposition', 'inline; filename=%s' % fname)]
            with open_textfile(fpaths, 'r') as txt_file:
                data = txt_file.read().encode('utf-8')
    else:
        data = "Access Denied"

    try:
        raise NoPrintJSONError(hdrs, data)
    finally:
        if rr:
            rr.clean_up()
            if isfile(path_join(real_dir, '%s.%s' % (document, 'zip'))):
                os.remove(path_join(real_dir, '%s.%s' % (document, 'zip')))
Example #21
def build_text_structure(ann, txt_file_path):
    '''
    Split a text file into paragraphs, sentences and words and return the FoLiA document.
    For every word it checks two main things:
    1) Is the word part of some entities? If so, the word is added to a list of lists of words.
    2) Is there an entity that ends with this word? If so, the entity is created from the right
       words in that list, and the list element is deleted once its words have been taken out.
    After every sentence and paragraph, all the entities that started and ended within that
    structure are added to the EntitiesLayer.
    '''
    from annotation import open_textfile
    from tokenise import gtb_token_boundary_gen
    def add_list_entities(struct, folia_entities):
        # check whether any entities have to be added, and add them if needed
        if folia_entities:
            layer = struct.append(folia.EntitiesLayer)
            for folia_entity in folia_entities:
                
                layer.append(folia_entity)
                for attr in attributes[folia_entity.id]:
                    folia_entity.append(folia.Feature(doc,subset=attr.type, cls=str(attr.value)))
            
    try:
            #Sort entities on offset instead of id        
            entities = sorted(ann.get_textbounds(), key=lambda entity: (entity.start, -entity.end))
            index = 0
            doc = folia.Document(id='brat')
            
            attributes = build_entities_attr(ann)
                    
            folia_text = doc.append(folia.Text)
            paragraph = folia_text.append(folia.Paragraph)
            folia_sentence = 0
            par_start = 0
            #fictive sets
            doc.annotationdefaults[folia.AnnotationType.ENTITY] = {"entiteit_set.xml": {} }
            doc.annotations.append( (folia.AnnotationType.ENTITY, "entiteit_set.xml" ) ) 
            doc.annotationdefaults[folia.AnnotationType.MORPHOLOGICAL] = {"morph_set.xml": {} }
            doc.annotations.append( (folia.AnnotationType.MORPHOLOGICAL, "morph_set.xml" ) ) 
    
            entity = entities[index]
            entities_words=[]
            inner_index=0
            entities_words.append([])
            
            folia_entitiesLayer_par=[]
            folia_entitiesLayer_sen=[]
            folia_entitiesLayer_txt=[]

            
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            offsets = [o for o in regex_sentence_boundary_gen(text)]
            for start, end, sentence in _text_by_offsets_gen(text, offsets):
                if start == end and text[start-1] == '\n':
                    add_list_entities(paragraph, folia_entitiesLayer_par)
                    folia_entitiesLayer_par = []
                    paragraph = folia_text.append(folia.Paragraph)
                    par_start = start
                elif sentence != "" :
                    add_list_entities(folia_sentence, folia_entitiesLayer_sen)
                    folia_entitiesLayer_sen = []
                    folia_sentence = paragraph.append(folia.Sentence,sentence)
                offsetsw = [o for o in gtb_token_boundary_gen(sentence)]
                for tok in _text_by_offsets_gen(sentence, offsetsw):
                    entity = entities[index]
                    inner_index=0
                    folia_word = folia_sentence.append(folia.Word, tok[2])
                    morph_layer= ""                
                    #check if word is part of the entity and if so remember folia word
                    while entity.start <= entities[index].end :
                        while( len(entities_words) <= inner_index ):
                                entities_words.append([])
                        for span_start, span_end in entity.spans:                                    
                            if ( span_start <= tok[0]+start and tok[1]+start <= span_end):
                                entities_words[inner_index].append(doc[folia_word.id])
                            #entity ends within the word
                            elif (tok[1]+start >= span_end and span_end > tok[0]+start) :
                                offset_start = span_start-(start+tok[0])
                                if offset_start < 0:  # entity started before this word
                                    offset_start = 0
                                offset_end = span_end-(start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                            #entity starts within the word
                            elif (tok[1]+start > span_start and span_start >= tok[0]+start) :
                                offset_start = span_start-(start+tok[0])
                                offset_end = span_end-(start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                        inner_index = inner_index + 1
                        if len(entities) > index + inner_index :
                            entity = entities[index+inner_index]    
                        else:
                            break    
                    entity = entities[index]
                    inner_index = 0    
                    #check for end of an entity and append entity to either text, paragraph or sentece depending on start of the entity    
                    current_index = index
                    while entity.start <= entities[current_index].end :
                        if entity.end <= start + tok[1] and entity.start <= start + tok[0] :
                            if (entity.start >= start):
                                folia_entitiesLayer = folia_entitiesLayer_sen                
                            elif (entity.start >= par_start):
                                folia_entitiesLayer = folia_entitiesLayer_par
                            else:
                                folia_entitiesLayer = folia_entitiesLayer_txt                    
                            if entities_words[inner_index]:        
                                folia_entity = folia.Entity(doc, cls=entity.type, id=entity.id , contents=entities_words[inner_index])
                                folia_entitiesLayer.append(folia_entity)
                            elif not any(x.id == entity.id for x in folia_entitiesLayer):
                                #see if entity is already added
                                try:
                                    doc[entity.id]
                                except KeyError:
                                    raise EntityNotFoundError(entity)
                            if(inner_index == 0):
                                entities_words.pop(0)
                                if len(entities) > index+1 :
                                    index = index + 1
                                    for i in range(0, len(entities_words)):
                                        if(not entities_words[0]):
                                            entities_words.pop(0)
                                            index = index + 1
                                else:
                                    break
                                    
                            elif(inner_index > 0):
                                entities_words[inner_index]=[]
                                inner_index = inner_index + 1                            
                        else:
                            inner_index = inner_index + 1
                        if len(entities) > index + inner_index:
                            entity = entities[index+inner_index]
                        else:
                            break    
            add_list_entities(paragraph, folia_entitiesLayer_par)    
            add_list_entities(folia_sentence, folia_entitiesLayer_sen)
            add_list_entities(folia_text, folia_entitiesLayer_txt)        
            return doc
    except IOError:
        pass # Most likely a broken pipe
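build_text_structure relies on a _text_by_offsets_gen helper that is not shown with this example; judging from the call sites above it yields (start, end, slice) triples, whereas the variant defined in the __main__ snippet further down yields only the slice. A sketch of the shape this example appears to assume:

def _text_by_offsets_gen(text, offsets):
    # Assumed shape, inferred from the calls above (not the original helper):
    # yield each span's boundaries together with the matching slice of text.
    for start, end in offsets:
        yield start, end, text[start:end]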
Example #22
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry, the client currently crashing on
    # chrome for two or more consecutive space, so replace every
    # second with literal non-breaking space. Note that this is just
    # for the client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' ' + unichr(0x00A0))

    j_dic['text'] = text

    from logging import info as log_info

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option %r'
                         ', reverting to whitespace tokenisation.' % tokeniser)
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option %r'
                         ', reverting to newline sentence splitting.' % ssplitter)
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry, the client currently crashing on
    # chrome for two or more consecutive space, so replace every
    # second with literal non-breaking space. Note that this is just
    # for the client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' '+unichr(0x00A0))

    j_dic['text'] = text
    
    from logging import info as log_info

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option %r'
                ', reverting to whitespace tokenisation.' % tokeniser)
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option %r'
                ', reverting to newline sentence splitting.' % ssplitter)
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
def get_directory_information(collection):
    directory = collection

    real_dir = real_directory(directory)
    
    assert_allowed_to_read(real_dir)
    
    # Get the document names
    base_names = [fn[0:-4] for fn in _listdir(real_dir)
            if fn.endswith('txt')]

    doclist = base_names[:]
    doclist_header = [("Document", "string")]

    # Then get the modification times
    doclist_with_time = []
    for file_name in doclist:
        file_path = path_join(DATA_DIR, real_dir,
            file_name + "." + JOINED_ANN_FILE_SUFF)
        doclist_with_time.append([file_name, _getmtime(file_path)])
    doclist = doclist_with_time
    doclist_header.append(("Modified", "time"))

    try:
        stats_types, doc_stats = get_statistics(real_dir, base_names)
    except OSError:
        # something like missing access permissions?
        raise CollectionNotAccessibleError
                
    doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))]
    doclist_header += stats_types

    dirlist = [dir for dir in _listdir(real_dir)
            if isdir(path_join(real_dir, dir))]
    # just in case, and for generality
    dirlist = [[dir] for dir in dirlist]

    # check whether at root, ignoring e.g. possible trailing slashes
    if normpath(real_dir) != normpath(DATA_DIR):
        parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:]
        # to get consistent processing client-side, add explicitly to list
        dirlist.append([".."])
    else:
        parent = None

    # combine document and directory lists, adding a column
    # differentiating files from directories and an unused column (can
    # point to a specific annotation) required by the protocol.  The
    # values filled here for the first are "c" for "collection"
    # (i.e. directory) and "d" for "document".
    combolist = []
    for i in dirlist:
        combolist.append(["c", None]+i)
    for i in doclist:
        combolist.append(["d", None]+i)

    # plug in the search config too
    search_config = get_search_config(real_dir)

    # ... and the disambiguator config ... this is getting a bit much
    disambiguator_config = get_disambiguator_config(real_dir)

    # ... and the normalization config (TODO: rethink)
    normalization_config = get_normalization_config(real_dir)

    # read in README (if any) to send as a description of the
    # collection
    try:
        with open_textfile(path_join(real_dir, "README")) as txt_file:
            readme_text = txt_file.read()
    except IOError:
        readme_text = None

    # fill in a flag for whether annotator logging is active so that
    # the client knows whether to invoke timing actions
    ann_logging = annotation_logging_active(real_dir)

    # fill in NER services, if any
    ner_taggers = get_annotator_config(real_dir)

    return _inject_annotation_type_conf(real_dir, json_dic={
            'items': combolist,
            'header' : doclist_header,
            'parent': parent,
            'messages': [],
            'description': readme_text,
            'search_config': search_config,
            'disambiguator_config' : disambiguator_config,
            'normalization_config' : normalization_config,
            'annotation_logging': ann_logging,
            'ner_taggers': ner_taggers,
            })
Example #25
if __name__ == '__main__':
    from sys import argv

    from annotation import open_textfile

    def _text_by_offsets_gen(text, offsets):
        for start, end in offsets:
            yield text[start:end]

    if len(argv) > 1:
        try:
            for txt_file_path in argv[1:]:
                print
                print '### Splitting:', txt_file_path
                with open_textfile(txt_file_path, 'r') as txt_file:
                    text = txt_file.read()
                print '# Original text:'
                print text.replace('\n', '\\n')
                offsets = [o for o in newline_sentence_boundary_gen(text)]
                print '# Offsets:'
                print offsets
                print '# Sentences:'
                for sentence in _text_by_offsets_gen(text, offsets):
                    # These should only be allowed when coming from original
                    #   explicit newlines.
                    #assert sentence, 'blank sentences disallowed'
                    #assert not sentence[0].isspace(), (
                    #        'sentence may not start with white-space "%s"' % sentence)
                    print '"%s"' % sentence.replace('\n', '\\n')
        except IOError:
            pass # Most likely a broken pipe
def __create_span(ann_obj, mods, type, offsets, txt_file_path,
        projectconf, attributes):
    # For event types, reuse trigger if a matching one exists.
    found = None
    if projectconf.is_event_type(type):
        for tb_ann in ann_obj.get_textbounds():
            try:
                if (_offsets_equal(tb_ann.spans, offsets)
                    and tb_ann.type == type):
                    found = tb_ann
                    break
            except AttributeError:
                # Not a trigger then
                pass
        
    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T') #XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            # TODO discont: use offsets instead (note need for int conversion)
            text = _text_for_offsets(txt_file.read(), offsets)

        # The below code resolves cases where there are newlines in the
        #   offsets by creating discontinuous annotations for each span
        #   separated by newlines. For most cases it preserves the offsets.
        seg_offsets = []
        for o_start, o_end in offsets:
            pos = o_start
            for text_seg in text.split('\n'):
                if not text_seg:
                    # Double new-line, skip ahead
                    pos += 1
                    continue
                end = pos + len(text_seg)
                seg_offsets.append((pos, end))
                # Our current position is after the newline
                pos = end + 1

        ann = TextBoundAnnotationWithText(seg_offsets, new_id, type,
                # Replace any newlines with the discontinuous separator
                MUL_NL_REGEX.sub(DISCONT_SEP, text))
        ann_obj.add_annotation(ann)
        mods.addition(ann)
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E') #XXX: Cons
            event = EventAnnotation(ann.id, [], unicode(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
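The segment-splitting loop above is what turns a selection that crosses newlines into a discontinuous annotation. A standalone trace of just that step, with made-up text and offsets:

text = 'severe acute\nrespiratory syndrome'   # span text for offsets [(10, 43)]
offsets = [(10, 43)]
seg_offsets = []
for o_start, o_end in offsets:
    pos = o_start
    for text_seg in text.split('\n'):
        if not text_seg:
            # empty segment comes from a doubled newline; just step over it
            pos += 1
            continue
        end = pos + len(text_seg)
        seg_offsets.append((pos, end))
        pos = end + 1                          # resume after the newline
# seg_offsets -> [(10, 22), (23, 43)], i.e. one fragment per line of the selection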
Example #27
def get_directory_information(collection):
    directory = collection

    real_dir = real_directory(directory)

    assert_allowed_to_read(real_dir)

    # Get the document names
    base_names = [fn[0:-4] for fn in _listdir(real_dir) if fn.endswith('txt')]

    doclist = base_names[:]
    doclist_header = [("Document", "string")]

    # Then get the modification times
    from os.path import getmtime, join
    doclist_with_time = []
    for file in doclist:
        try:
            from annotation import JOINED_ANN_FILE_SUFF
            mtime = getmtime(
                join(DATA_DIR, join(real_dir,
                                    file + "." + JOINED_ANN_FILE_SUFF)))
        except:
            # The file did not exist (or similar problem)
            mtime = -1
        doclist_with_time.append([file, mtime])
    doclist = doclist_with_time
    doclist_header.append(("Modified", "time"))

    try:
        stats_types, doc_stats = get_statistics(real_dir, base_names)
    except OSError:
        # something like missing access permissions?
        raise CollectionNotAccessibleError

    doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))]
    doclist_header += stats_types

    dirlist = [
        dir for dir in _listdir(real_dir) if isdir(path_join(real_dir, dir))
    ]
    # just in case, and for generality
    dirlist = [[dir] for dir in dirlist]

    if real_dir != DATA_DIR:
        parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:]
        # to get consistent processing client-side, add explicitly to list
        dirlist.append([".."])
    else:
        parent = None

    # combine document and directory lists, adding a column
    # differentiating files from directories and an unused column (can
    # point to a specific annotation) required by the protocol.  The
    # values filled here for the first are "c" for "collection"
    # (i.e. directory) and "d" for "document".
    combolist = []
    for i in dirlist:
        combolist.append(["c", None] + i)
    for i in doclist:
        combolist.append(["d", None] + i)

    event_types, entity_types, attribute_types, relation_types, unconf_types = get_span_types(
        real_dir)

    # read in README (if any) to send as a description of the
    # collection
    try:
        with open_textfile(path_join(real_dir, "README")) as txt_file:
            readme_text = txt_file.read()
    except IOError:
        readme_text = None

    json_dic = {
        'items': combolist,
        'header': doclist_header,
        'parent': parent,
        'messages': [],
        'event_types': event_types,
        'entity_types': entity_types,
        'attribute_types': attribute_types,
        'relation_types': relation_types,
        'unconfigured_types': unconf_types,
        'description': readme_text,
    }
    return json_dic
Example #28
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            (filepath, tempfilename) = os.path.split(txt_file_path)
            (filename, extension) = os.path.splitext(tempfilename)
            r = RepoModel(filepath)
            r.save_xml(filepath)
            # xml_save(filepath, filename, filename)
            xml_file_path = os.path.join(filepath, filename+'.xml')
            # print("xml_file_path::::", r, file=sys.stderr)
            # if xml_file_path:
            #     pass
            # else:
            #     xml_save(filepath, filename, filename)
            with open(xml_file_path, 'r') as xml_file:
                xml = xml_file.read()
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            j_dic['xml'] = xml

        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text


    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option %r'
                         ', reverting to whitespace tokenisation.' % tokeniser)
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option %r'
                         ', reverting to newline sentence splitting.' % ssplitter)
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
Example #29
def get_directory_information(collection):
    directory = collection
    real_dir = real_directory(directory)
    assert_allowed_to_read(real_dir)

    # Get the document names
    user = get_session().get('user')
    if user is None or user == 'guest':
        base_names = []
    # Users can also be taken from the configuration file.
    elif user in USER_PASSWORD:
        base_names = [fn[0:-4] for fn in _listdir(real_dir) if fn.endswith('txt')]
    else:
        db = DBlite()
        base_names = db.get_AnnNull_files(directory)
        names_ING = db.get_AnnING_files(directory, user)
        print("names_ING", names_ING, file=sys.stderr)
        base_names.extend(names_ING)


    doclist = base_names[:]
    doclist_header = [("文档", "string")]

    # Then get the modification times
    doclist_with_time = []
    for file_name in doclist:
        file_path = path_join(DATA_DIR, real_dir,
                              file_name + "." + JOINED_ANN_FILE_SUFF)
        doclist_with_time.append([file_name, _getmtime(file_path)])
    doclist = doclist_with_time
    doclist_header.append(("修改时间", "time"))

    """
        stats_types: [('Entities', 'int'), ('Relations', 'int'), ('Events', 'int')]
        doc_stats: [[29, 0, 0], [97, 0, 0], [22, 0, 0], [8, 0, 0], [17, 0, 0], [22, 0, 0], [14, 0, 0], [24, 0, 0], [22, 0, 0], [21, 0, 0]]
        doclist: [['ned.train-doc-184', 1555259780.624325, 29, 0, 0], ['ned.train-doc-181', 1555259780.623239, 97, 0, 0], ['ned.train-doc-236'
    """
    try:
        stats_types, doc_stats = get_statistics(real_dir, base_names)
        print("stats_types:", stats_types, file=sys.stderr)
        print("doc_stats:", doc_stats, file=sys.stderr)
    except OSError:
        # something like missing access permissions?
        raise CollectionNotAccessibleError

    doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))]
    print("doclist:", doclist, file=sys.stderr)
    doclist_header += stats_types
    # doclist_header.append(("修改者", "string"))
    print("doclist_header:", doclist_header, file=sys.stderr)

    if user is None or user == 'guest':
        dirlist = []
    elif user in USER_PASSWORD:
        dirlist = [dir for dir in _listdir(real_dir) if isdir(path_join(real_dir, dir))]
    else: # for user ACL
        dirlist = [dir for dir in _listdir(real_dir) if isdir(path_join(real_dir, dir))]
    # just in case, and for generality
    dirlist = [[dir] for dir in dirlist]
    # print("---------------dirlist------------------", dirlist, file=sys.stderr)
    # Shown when opening the final file/directory listing:
    # columns: file name, modified time, entities, relations, events
    # [['esp.train-doc-46', 1555259780.6167455, 104, 0, 0], ['esp.train-doc-989', 1555259780.6174483, 34, 0, 0],
    # print(doclist, file=sys.stderr)

    # check whether at root, ignoring e.g. possible trailing slashes
    if normpath(real_dir) != normpath(DATA_DIR):
        parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:]
        # to get consistent processing client-side, add explicitly to list
        dirlist.append([".."])
    else:
        parent = None

    # combine document and directory lists, adding a column
    # differentiating files from directories and an unused column (can
    # point to a specific annotation) required by the protocol.  The
    # values filled here for the first are "c" for "collection"
    # (i.e. directory) and "d" for "document".
    combolist = []
    for i in dirlist:
        combolist.append(["c", None] + i)
    for i in doclist:
        combolist.append(["d", None] + i)

    # plug in the search config too
    search_config = get_search_config(real_dir)

    # ... and the disambiguator config ... this is getting a bit much
    disambiguator_config = get_disambiguator_config(real_dir)

    # ... and the normalization config (TODO: rethink)
    normalization_config = get_normalization_config(real_dir)

    # read in README (if any) to send as a description of the
    # collection
    try:
        with open_textfile(path_join(real_dir, "README")) as txt_file:
            readme_text = txt_file.read()
    except IOError:
        readme_text = None

    # fill in a flag for whether annotator logging is active so that
    # the client knows whether to invoke timing actions
    ann_logging = annotation_logging_active(real_dir)

    # fill in NER services, if any
    ner_taggers = get_annotator_config(real_dir)

    return _inject_annotation_type_conf(real_dir, json_dic={
        'items': combolist,
        'header': doclist_header,
        'parent': parent,
        'messages': [],
        'description': readme_text,
        'search_config': search_config,
        'disambiguator_config': disambiguator_config,
        'normalization_config': normalization_config,
        'annotation_logging': ann_logging,
        'ner_taggers': ner_taggers,
    })
Example #30
def build_text_structure(ann, txt_file_path):
    '''
    Split a text file into paragraphs, sentences and words and return the FoLiA document.
    For every word it checks two main things:
    1) Is the word part of some entities? If so, they are added to a list of lists of words.
    2) Is there an entity that ends with this word? If so, the entity is created with the
       right words from the list, and that element is deleted once the words are taken out.
    After every sentence or paragraph, all the entities that started and ended within that
    structure are added to the EntitiesLayer.
    '''
    from annotation import open_textfile
    from tokenise import gtb_token_boundary_gen
    def add_list_entities(struct, folia_entities):
        # Check whether any entities have to be added and add them if needed
        if folia_entities:
            layer = struct.append(folia.EntitiesLayer)
            for folia_entity in folia_entities:

                layer.append(folia_entity)
                for attr in attributes[folia_entity.id]:
                    folia_entity.append(folia.Feature(doc,subset=attr.type, cls=str(attr.value)))

    try:
            #Sort entities on offset instead of id
            entities = sorted(ann.get_textbounds(), key=lambda entity: (entity.start, -entity.end))
            index = 0
            doc = folia.Document(id='brat')

            attributes = build_entities_attr(ann)

            folia_text = doc.append(folia.Text)
            paragraph = folia_text.append(folia.Paragraph)
            folia_sentence = 0
            par_start = 0
            #fictive sets
            doc.annotationdefaults[folia.AnnotationType.ENTITY] = {"entiteit_set.xml": {} }
            doc.annotations.append( (folia.AnnotationType.ENTITY, "entiteit_set.xml" ) )
            doc.annotationdefaults[folia.AnnotationType.MORPHOLOGICAL] = {"morph_set.xml": {} }
            doc.annotations.append( (folia.AnnotationType.MORPHOLOGICAL, "morph_set.xml" ) )

            entity = entities[index]
            entities_words=[]
            inner_index=0
            entities_words.append([])

            folia_entitiesLayer_par=[]
            folia_entitiesLayer_sen=[]
            folia_entitiesLayer_txt=[]


            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            offsets = [o for o in regex_sentence_boundary_gen(text)]
            for start, end, sentence in _text_by_offsets_gen(text, offsets):
                if start == end and text[start-1] == '\n':
                    add_list_entities(paragraph, folia_entitiesLayer_par)
                    folia_entitiesLayer_par = []
                    paragraph = folia_text.append(folia.Paragraph)
                    par_start = start
                elif sentence != "" :
                    add_list_entities(folia_sentence, folia_entitiesLayer_sen)
                    folia_entitiesLayer_sen = []
                    folia_sentence = paragraph.append(folia.Sentence,sentence)
                offsetsw = [o for o in gtb_token_boundary_gen(sentence)]
                for tok in _text_by_offsets_gen(sentence, offsetsw):
                    entity = entities[index]
                    inner_index=0
                    folia_word = folia_sentence.append(folia.Word, tok[2])
                    morph_layer= ""
                    #check if word is part of the entity and if so remember folia word
                    while entity.start <= entities[index].end :
                        while( len(entities_words) <= inner_index ):
                                entities_words.append([])
                        for span_start, span_end in entity.spans:
                            if ( span_start <= tok[0]+start and tok[1]+start <= span_end):
                                entities_words[inner_index].append(doc[folia_word.id])
                            #entity ends within the word
                            elif (tok[1]+start >= span_end and span_end > tok[0]+start) :
                                offset_start = span_start-(start+tok[0])
                                if offset_start < 0:  # entity started before this word
                                    offset_start = 0
                                offset_end = span_end-(start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                            #entity starts within the word
                            elif (tok[1]+start > span_start and span_start >= tok[0]+start) :
                                offset_start = span_start-(start+tok[0])
                                offset_end = span_end-(start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                        inner_index = inner_index + 1
                        if len(entities) > index + inner_index :
                            entity = entities[index+inner_index]
                        else:
                            break
                    entity = entities[index]
                    inner_index = 0
                    #check for end of an entity and append entity to either text, paragraph or sentence depending on start of the entity
                    current_index = index
                    while entity.start <= entities[current_index].end :
                        if entity.end <= start + tok[1] and entity.start <= start + tok[0] :
                            if (entity.start >= start):
                                folia_entitiesLayer = folia_entitiesLayer_sen
                            elif (entity.start >= par_start):
                                folia_entitiesLayer = folia_entitiesLayer_par
                            else:
                                folia_entitiesLayer = folia_entitiesLayer_txt
                            if entities_words[inner_index]:
                                folia_entity = folia.Entity(doc, cls=entity.type, id=entity.id , contents=entities_words[inner_index])
                                folia_entitiesLayer.append(folia_entity)
                            elif not any(x.id == entity.id for x in folia_entitiesLayer):
                                #see if entity is already added
                                try:
                                    doc[entity.id]
                                except KeyError:
                                    raise EntityNotFoundError(entity)
                            if(inner_index == 0):
                                entities_words.pop(0)
                                if len(entities) > index+1 :
                                    index = index + 1
                                    for i in range(0, len(entities_words)):
                                        if(not entities_words[0]):
                                            entities_words.pop(0)
                                            index = index + 1
                                else:
                                    break

                            elif(inner_index > 0):
                                entities_words[inner_index]=[]
                                inner_index = inner_index + 1
                        else:
                            inner_index = inner_index + 1
                        if len(entities) > index + inner_index:
                            entity = entities[index+inner_index]
                        else:
                            break
            add_list_entities(paragraph, folia_entitiesLayer_par)
            add_list_entities(folia_sentence, folia_entitiesLayer_sen)
            add_list_entities(folia_text, folia_entitiesLayer_txt)
            return doc
    except IOError:
        pass # Most likely a broken pipe
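# Minimal usage sketch (assumes `ann` is an annotation.TextAnnotations object for the
# same document as txt_file_path; the FoLiA output file name is hypothetical):
#   doc = build_text_structure(ann, '/path/to/doc.txt')
#   doc.save('/path/to/doc.folia.xml')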
Example #31
if __name__ == "__main__":
    from sys import argv

    from annotation import open_textfile

    def _text_by_offsets_gen(text, offsets):
        for start, end in offsets:
            yield text[start:end]

    if len(argv) > 1:
        try:
            for txt_file_path in argv[1:]:
                print
                print "### Splitting:", txt_file_path
                with open_textfile(txt_file_path, "r") as txt_file:
                    text = txt_file.read()
                print "# Original text:"
                print text.replace("\n", "\\n")
                offsets = [o for o in newline_sentence_boundary_gen(text)]
                print "# Offsets:"
                print offsets
                print "# Sentences:"
                for sentence in _text_by_offsets_gen(text, offsets):
                    # These should only be allowed when coming from original
                    #   explicit newlines.
                    # assert sentence, 'blank sentences disallowed'
                    # assert not sentence[0].isspace(), (
                    #        'sentence may not start with white-space "%s"' % sentence)
                    print '"%s"' % sentence.replace("\n", "\\n")
        except IOError:
            pass  # Most likely a broken pipe
Example #32
def save_import(text, docid, collection=None, anntext=None):
    '''
    Save a new document (text and, optionally, pre-existing annotation text)
    into the given collection, refusing to overwrite existing files.
    '''

    if len(docid) > 4 and docid[-4] == '.':
        docid = docid[:-4]

    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)

        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    ############################
    from session import get_session
    try:
        username = get_session()['user']
    except KeyError:
        username = None
    if username != 'admin':
        if (not username) or username + '/' not in dir_path:
            raise NoWritePermissionError(dir_path)
    ############################

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    newtext = ''
    for line in text.splitlines():
        if line:
            newtext += line + '\n'
    text = newtext
    if text != "" and text[-1] != '\n':
        text = text + '\n'
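    # e.g. (illustration): "line1\n\nline2" -> "line1\nline2\n"
    # (blank lines are dropped above and a trailing newline is guaranteed)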

    with open_textfile(txt_path, 'w') as txt_file:
        txt_file.write(text)

    if anntext:
        with open(ann_path, 'w') as ann_file:
            ann_file.write(anntext)
    else:
        # Touch the ann file so that we can edit the file later
        with open(ann_path, 'w') as _:
            pass

    return { 'document': docid }
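# Minimal usage sketch (collection name and annotation text are hypothetical; the
# session user must own the collection, or be 'admin', for the write to be allowed):
#   save_import("Barack Obama visited Paris.\n", "doc-1", collection="mycorpus",
#               anntext="T1\tPerson 0 12\tBarack Obama\n")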
Example #33
def __create_span(ann_obj, mods, type, offsets, txt_file_path,
                  projectconf, attributes):
    # For event types, reuse trigger if a matching one exists.
    found = None
    if projectconf.is_event_type(type):
        for tb_ann in ann_obj.get_textbounds():
            try:
                if (_offsets_equal(tb_ann.spans, offsets)
                        and tb_ann.type == type):
                    found = tb_ann
                    break
            except AttributeError:
                # Not a trigger then
                pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T')  # XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()
            text_span = _text_for_offsets(text, offsets)

        # The below code resolves cases where there are newlines in the
        #   offsets by creating discontinuous annotations for each span
        #   separated by newlines. For most cases it preserves the offsets.
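        # Worked illustration (hypothetical values): with text_span == "foo\nbar"
        # and offsets == [(0, 7)], the loop yields seg_offsets == [(0, 3), (4, 7)].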
        seg_offsets = []
        for o_start, o_end in offsets:
            pos = o_start
            for text_seg in text_span.split('\n'):
                if not text_seg and o_start != o_end:
                    # Double new-line, skip ahead
                    pos += 1
                    continue
                start = pos
                end = start + len(text_seg)

                # For the next iteration the position is after the newline.
                pos = end + 1

                # Adjust the offsets to compensate for any potential leading
                #   and trailing whitespace.
                start += len(text_seg) - len(text_seg.lstrip())
                end -= len(text_seg) - len(text_seg.rstrip())

                # If there is any segment left, add it to the offsets.
                if start != end:
                    seg_offsets.append((start, end, ))

        # if we're dealing with a null-span
        if not seg_offsets:
            seg_offsets = offsets

        ann_text = DISCONT_SEP.join((text[start:end]
                                     for start, end in seg_offsets))
        ann = TextBoundAnnotationWithText(seg_offsets, new_id, type, ann_text)
        ann_obj.add_annotation(ann)
        mods.addition(ann)
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E')  # XXX: Cons
            event = EventAnnotation(
                ann.id, [], str(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
Example #34
def __create_span(ann_obj, mods, type, offsets, txt_file_path, projectconf,
                  attributes):
    # For event types, reuse trigger if a matching one exists.
    found = None
    if projectconf.is_event_type(type):
        for tb_ann in ann_obj.get_textbounds():
            try:
                if (_offsets_equal(tb_ann.spans, offsets)
                        and tb_ann.type == type):
                    found = tb_ann
                    break
            except AttributeError:
                # Not a trigger then
                pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T')  #XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            # TODO discont: use offsets instead (note need for int conversion)
            text = _text_for_offsets(txt_file.read(), offsets)

        # The below code resolves cases where there are newlines in the
        #   offsets by creating discontinuous annotations for each span
        #   separated by newlines. For most cases it preserves the offsets.
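        # Worked illustration (hypothetical values): with text == "foo\n\nbar" and
        # offsets == [(0, 8)], the empty segment from the double newline is skipped
        # and the loop yields seg_offsets == [(0, 3), (5, 8)].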
        seg_offsets = []
        for o_start, o_end in offsets:
            pos = o_start
            for text_seg in text.split('\n'):
                if not text_seg and o_start != o_end:
                    # Double new-line, skip ahead
                    pos += 1
                    continue
                end = pos + len(text_seg)
                seg_offsets.append((pos, end))
                # Our current position is after the newline
                pos = end + 1

        ann = TextBoundAnnotationWithText(
            seg_offsets,
            new_id,
            type,
            # Replace any newlines with the discontinuous separator
            MUL_NL_REGEX.sub(DISCONT_SEP, text))
        ann_obj.add_annotation(ann)
        mods.addition(ann)
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E')  #XXX: Cons
            event = EventAnnotation(ann.id, [], unicode(new_event_id), type,
                                    '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
Example #35
def get_directory_information(collection):
    directory = collection

    real_dir = real_directory(directory)

    assert_allowed_to_read(real_dir)

    # Get the document names
    base_names = [fn[0:-4] for fn in _listdir(real_dir) if fn.endswith('txt')]

    doclist = base_names[:]
    doclist_header = [("Document", "string")]

    # Then get the modification times
    doclist_with_time = []
    for file_name in doclist:
        file_path = path_join(DATA_DIR, real_dir,
                              file_name + "." + JOINED_ANN_FILE_SUFF)
        doclist_with_time.append([file_name, _getmtime(file_path)])
    doclist = doclist_with_time
    doclist_header.append(("Modified", "time"))

    try:
        stats_types, doc_stats = get_statistics(real_dir, base_names)
    except OSError:
        # something like missing access permissions?
        raise CollectionNotAccessibleError

    doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))]
    doclist_header += stats_types

    dirlist = [
        dir for dir in _listdir(real_dir) if isdir(path_join(real_dir, dir))
    ]
    # just in case, and for generality
    dirlist = [[dir] for dir in dirlist]

    # check whether at root, ignoring e.g. possible trailing slashes
    if normpath(real_dir) != normpath(DATA_DIR):
        parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:]
        # to get consistent processing client-side, add explicitly to list
        dirlist.append([".."])
    else:
        parent = None

    # combine document and directory lists, adding a column
    # differentiating files from directories and an unused column (can
    # point to a specific annotation) required by the protocol.  The
    # values filled here for the first are "c" for "collection"
    # (i.e. directory) and "d" for "document".
    combolist = []
    for i in dirlist:
        combolist.append(["c", None] + i)
    for i in doclist:
        combolist.append(["d", None] + i)

    # plug in the search config too
    search_config = get_search_config(real_dir)

    # ... and the disambiguator config ... this is getting a bit much
    disambiguator_config = get_disambiguator_config(real_dir)

    # ... and the normalization config (TODO: rethink)
    normalization_config = get_normalization_config(real_dir)

    # read in README (if any) to send as a description of the
    # collection
    try:
        with open_textfile(path_join(real_dir, "README")) as txt_file:
            readme_text = txt_file.read()
    except IOError:
        readme_text = None

    # fill in a flag for whether annotator logging is active so that
    # the client knows whether to invoke timing actions
    ann_logging = annotation_logging_active(real_dir)

    # fill in NER services, if any
    ner_taggers = get_annotator_config(real_dir)

    #send logging directory:
    logging = options_get_annlogfile(real_dir)

    return _inject_annotation_type_conf(real_dir,
                                        json_dic={
                                            'items': combolist,
                                            'header': doclist_header,
                                            'parent': parent,
                                            'messages': [],
                                            'description': readme_text,
                                            'search_config': search_config,
                                            'disambiguator_config':
                                            disambiguator_config,
                                            'normalization_config':
                                            normalization_config,
                                            'annotation_logging': ann_logging,
                                            'ner_taggers': ner_taggers,
                                            'logging': logging,
                                        })
Example #36
def get_directory_information(collection):
    directory = collection

    real_dir = real_directory(directory)
    
    assert_allowed_to_read(real_dir)
    
    # Get the document names
    base_names = [fn[0:-4] for fn in _listdir(real_dir)
            if fn.endswith('txt')]

    doclist = base_names[:]
    doclist_header = [("Document", "string")]

    # Then get the modification times
    from os.path import getmtime, join
    doclist_with_time = []
    for file in doclist:
        try:
            from annotation import JOINED_ANN_FILE_SUFF
            mtime = getmtime(join(DATA_DIR,
                join(real_dir, file + "." + JOINED_ANN_FILE_SUFF)))
        except:
            # The file did not exist (or similar problem)
            mtime = -1
        doclist_with_time.append([file, mtime])
    doclist = doclist_with_time
    doclist_header.append(("Modified", "time"))

    try:
        stats_types, doc_stats = get_statistics(real_dir, base_names)
    except OSError:
        # something like missing access permissions?
        raise CollectionNotAccessibleError
                
    doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))]
    doclist_header += stats_types

    dirlist = [dir for dir in _listdir(real_dir)
            if isdir(path_join(real_dir, dir))]
    # just in case, and for generality
    dirlist = [[dir] for dir in dirlist]

    if real_dir != DATA_DIR:
        parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:]
        # to get consistent processing client-side, add explicitly to list
        dirlist.append([".."])
    else:
        parent = None

    # combine document and directory lists, adding a column
    # differentiating files from directories and an unused column (can
    # point to a specific annotation) required by the protocol.  The
    # values filled here for the first are "c" for "collection"
    # (i.e. directory) and "d" for "document".
    combolist = []
    for i in dirlist:
        combolist.append(["c", None]+i)
    for i in doclist:
        combolist.append(["d", None]+i)

    event_types, entity_types, attribute_types, relation_types, unconf_types = get_span_types(real_dir)

    # read in README (if any) to send as a description of the
    # collection
    try:
        with open_textfile(path_join(real_dir, "README")) as txt_file:
            readme_text = txt_file.read()
    except IOError:
        readme_text = None

    json_dic = {
            'items': combolist,
            'header' : doclist_header,
            'parent': parent,
            'messages': [],
            'event_types': event_types,
            'entity_types': entity_types,
            'attribute_types': attribute_types,
            'relation_types': relation_types,
            'unconfigured_types': unconf_types,
            'description': readme_text,
            }
    return json_dic
Example #37
if __name__ == '__main__':
    from sys import argv

    from annotation import open_textfile

    def _text_by_offsets_gen(text, offsets):
        for start, end in offsets:
            yield text[start:end]

    if len(argv) > 1:
        try:
            for txt_file_path in argv[1:]:
                print
                print '### Splitting:', txt_file_path
                with open_textfile(txt_file_path, 'r') as txt_file:
                    text = txt_file.read()
                print '# Original text:'
                print text.replace('\n', '\\n')
                offsets = [o for o in en_sentence_boundary_gen(text)]
                print '# Offsets:'
                print offsets
                print '# Sentences:'
                for sentence in _text_by_offsets_gen(text, offsets):
                    # These should only be allowed when coming from original
                    #   explicit newlines.
                    #assert sentence, 'blank sentences disallowed'
                    #assert not sentence[0].isspace(), (
                    #        'sentence may not start with white-space "%s"' % sentence)
                    print '"%s"' % sentence.replace('\n', '\\n')
        except IOError:
            pass  # Most likely a broken pipe
Example #38
    # location = join_path(dir_path, 'input.json')
    # data = getFileData(location)

    try:
        json_resp = loads(data)
    except ValueError, e:
        raise FormatError(apiUrl, e)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    response = json_resp[1]
    text = response['doc']
    if text != "" and text[-1] != '\n':
        text = text + '\n'

    with open_textfile(txt_path, 'w') as txt_file:
        txt_file.write(text)

    annotations = ""
    index = 1
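    # The loop below assembles brat standoff entity lines of the (tab-separated) form
    #   T<index>\t<type> <start> <end>\t<text>
    # e.g. (hypothetical): "T1\tPerson 10 14\tJohn"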
    for sentence in response['annotatedSentences']:
        for annotation in sentence['spans']:
            if len(annotation['tokens']) > 0:
                token = annotation['tokens'][0]

                type = token['namedEntity']
                if len(annotation['annotations']) > 0:
                    type = annotation['annotations'].keys()[0].split('.')[-1]

                annotations += 'T' + str(index) + '\t' + str(type) + ' ' + str(
                    token['start']) + ' ' + str(token['end']) + '\t' + str(