Example #1
0
def word_positions(filename):
    """Return the token-boundary offsets for the text stored in *filename*.

    The file is read in full via ``codecs.open`` (encoding behaviour is left
    unchanged from the original) and passed to ``gtb_token_boundary_gen``,
    whose yielded offsets are collected into a list.

    Parameters:
        filename -- path of the text file to tokenise.

    Returns:
        list of the items yielded by ``gtb_token_boundary_gen``.
    """
    with codecs.open(filename) as infile:
        text = infile.read()
    # list(...) replaces the redundant [o for o in ...] comprehension
    return list(gtb_token_boundary_gen(text))
Example #2
0
def build_text_structure(ann,txt_file_path):
    '''
    Build a FoLiA document (``folia.Document``) from a brat annotation object
    and its plain-text source file.

    The text is split into paragraphs, sentences and words. For every word two
    things are checked:
    1) Is the word part of one or more entities? If so, the FoLiA word (or a
       morpheme of it, for partial overlaps) is appended to a list of lists of
       words — one inner list per currently-open entity.
    2) Does an entity end at this word? If so, the entity is created from the
       collected words and its list entry is removed.
    After every sentence/paragraph, all entities that both started and ended
    within that structure are added to its EntitiesLayer.

    Parameters:
        ann           -- annotation object exposing get_textbounds()
        txt_file_path -- path of the text file to read

    Returns:
        The populated folia.Document, or None if an IOError occurred
        (most likely a broken pipe).
    '''
    from annotation import open_textfile
    from tokenise import gtb_token_boundary_gen
    def add_list_entities(struct, folia_entities):
    # Append an EntitiesLayer to `struct` holding `folia_entities` (if any),
    # attaching each entity's brat attributes as FoLiA Features.
        if folia_entities:
            layer = struct.append(folia.EntitiesLayer)
            for folia_entity in folia_entities:
                
                layer.append(folia_entity)
                for attr in attributes[folia_entity.id]:
                    folia_entity.append(folia.Feature(doc,subset=attr.type, cls=str(attr.value)))
            
    try:
            #Sort entities on offset instead of id        
            entities = sorted(ann.get_textbounds(), key=lambda entity: (entity.start, -entity.end))
            index = 0
            doc = folia.Document(id='brat')
            
            attributes = build_entities_attr(ann)
                    
            folia_text = doc.append(folia.Text)
            paragraph = folia_text.append(folia.Paragraph)
            folia_sentence = 0
            par_start = 0
            # Fictive (placeholder) annotation sets: registered so the FoLiA
            # document accepts entity and morphology annotations.
            doc.annotationdefaults[folia.AnnotationType.ENTITY] = {"entiteit_set.xml": {} }
            doc.annotations.append( (folia.AnnotationType.ENTITY, "entiteit_set.xml" ) ) 
            doc.annotationdefaults[folia.AnnotationType.MORPHOLOGICAL] = {"morph_set.xml": {} }
            doc.annotations.append( (folia.AnnotationType.MORPHOLOGICAL, "morph_set.xml" ) ) 
    
            entity = entities[index]
            # entities_words[i] collects the FoLiA words/morphemes belonging to
            # the i-th currently-open entity (relative to `index`).
            entities_words=[]
            inner_index=0
            entities_words.append([])
            
            # Entities are buffered per enclosing structure and flushed by
            # add_list_entities when that structure is closed.
            folia_entitiesLayer_par=[]
            folia_entitiesLayer_sen=[]
            folia_entitiesLayer_txt=[]

            
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            offsets = [o for o in regex_sentence_boundary_gen(text)]
            for start, end, sentence in _text_by_offsets_gen(text, offsets):
                # An empty span right after a newline marks a paragraph break.
                if start == end and text[start-1] == '\n':
                    add_list_entities(paragraph, folia_entitiesLayer_par)
                    folia_entitiesLayer_par = []
                    paragraph = folia_text.append(folia.Paragraph)
                    par_start = start
                elif sentence != "" :
                    add_list_entities(folia_sentence, folia_entitiesLayer_sen)
                    folia_entitiesLayer_sen = []
                    folia_sentence = paragraph.append(folia.Sentence,sentence)
                offsetsw = [o for o in gtb_token_boundary_gen(sentence)]
                for tok in _text_by_offsets_gen(sentence, offsetsw):
                    entity = entities[index]
                    inner_index=0
                    folia_word = folia_sentence.append(folia.Word, tok[2])
                    # "" acts as a sentinel: a MorphologyLayer is created lazily
                    # on the first partial overlap of this word with an entity.
                    morph_layer= ""                
                    #check if word is part of the entity and if so remember folia word
                    # (loops over all entities that may overlap the current one;
                    # entities are sorted by start offset)
                    while entity.start <= entities[index].end :
                        while( len(entities_words) <= inner_index ):
                                entities_words.append([])
                        for span_start, span_end in entity.spans:                                    
                            # word fully inside the entity span
                            if ( span_start <= tok[0]+start and tok[1]+start <= span_end):
                                entities_words[inner_index].append(doc[folia_word.id])
                            #entity ends within the word
                            elif (tok[1]+start >= span_end and span_end > tok[0]+start) :
                                offset_start = span_start-(start+tok[0])
                                if offset_start <0 :# entity started before this word
                                    offset_start =0;
                                offset_end = span_end-(start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                            #entity starts within the word
                            elif (tok[1]+start > span_start and span_start >= tok[0]+start) :
                                offset_start = span_start-(start+tok[0])
                                offset_end = span_end-(start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                        inner_index = inner_index + 1
                        if len(entities) > index + inner_index :
                            entity = entities[index+inner_index]    
                        else:
                            break    
                    entity = entities[index]
                    inner_index = 0    
                    #check for end of an entity and append entity to either text, paragraph or sentence depending on start of the entity    
                    current_index = index
                    while entity.start <= entities[current_index].end :
                        # entity ends at (or before) this word and started at or
                        # before it: it is complete and can be materialised
                        if entity.end <= start + tok[1] and entity.start <= start + tok[0] :
                            # choose the narrowest structure that fully contains
                            # the entity: sentence, else paragraph, else text
                            if (entity.start >= start):
                                folia_entitiesLayer = folia_entitiesLayer_sen                
                            elif (entity.start >= par_start):
                                folia_entitiesLayer = folia_entitiesLayer_par
                            else:
                                folia_entitiesLayer = folia_entitiesLayer_txt                    
                            if entities_words[inner_index]:        
                                folia_entity = folia.Entity(doc, cls=entity.type, id=entity.id , contents=entities_words[inner_index])
                                folia_entitiesLayer.append(folia_entity)
                            elif not any(x.id == entity.id for x in folia_entitiesLayer):
                                #see if entity is already added; if it exists
                                #nowhere in the document, it was never matched
                                try:
                                    doc[entity.id]
                                except KeyError:
                                    raise EntityNotFoundError(entity)
                            if(inner_index == 0):
                                # the leading open entity is done: pop it and
                                # advance past any already-emptied successors
                                entities_words.pop(0)
                                if len(entities) > index+1 :
                                    index = index + 1
                                    for i in range(0, len(entities_words)):
                                        if(not entities_words[0]):
                                            entities_words.pop(0)
                                            index = index + 1
                                else:
                                    break
                                    
                            elif(inner_index > 0):
                                entities_words[inner_index]=[]
                                inner_index = inner_index + 1                            
                        else:
                            inner_index = inner_index + 1
                        if len(entities) > index + inner_index:
                            entity = entities[index+inner_index]
                        else:
                            break    
            # flush whatever is still buffered at end of text
            add_list_entities(paragraph, folia_entitiesLayer_par)    
            add_list_entities(folia_sentence, folia_entitiesLayer_sen)
            add_list_entities(folia_text, folia_entitiesLayer_txt)        
            return doc
    except IOError:
        pass # Most likely a broken pipe
Example #3
0
def build_text_structure(ann,txt_file_path):
    '''
    Build a FoLiA document (``folia.Document``) from a brat annotation object
    and its plain-text source file.

    The text is split into paragraphs, sentences and words. For every word two
    things are checked:
    1) Is the word part of one or more entities? If so, the FoLiA word (or a
       morpheme of it, for partial overlaps) is appended to a list of lists of
       words — one inner list per currently-open entity.
    2) Does an entity end at this word? If so, the entity is created from the
       collected words and its list entry is removed.
    After every sentence/paragraph, all entities that both started and ended
    within that structure are added to its EntitiesLayer.

    Parameters:
        ann           -- annotation object exposing get_textbounds()
        txt_file_path -- path of the text file to read

    Returns:
        The populated folia.Document, or None if an IOError occurred
        (most likely a broken pipe).
    '''
    from annotation import open_textfile
    from tokenise import gtb_token_boundary_gen
    def add_list_entities(struct, folia_entities):
    # Append an EntitiesLayer to `struct` holding `folia_entities` (if any),
    # attaching each entity's brat attributes as FoLiA Features.
        if folia_entities:
            layer = struct.append(folia.EntitiesLayer)
            for folia_entity in folia_entities:

                layer.append(folia_entity)
                for attr in attributes[folia_entity.id]:
                    folia_entity.append(folia.Feature(doc,subset=attr.type, cls=str(attr.value)))

    try:
            #Sort entities on offset instead of id
            entities = sorted(ann.get_textbounds(), key=lambda entity: (entity.start, -entity.end))
            index = 0
            doc = folia.Document(id='brat')

            attributes = build_entities_attr(ann)

            folia_text = doc.append(folia.Text)
            paragraph = folia_text.append(folia.Paragraph)
            folia_sentence = 0
            par_start = 0
            # Fictive (placeholder) annotation sets: registered so the FoLiA
            # document accepts entity and morphology annotations.
            doc.annotationdefaults[folia.AnnotationType.ENTITY] = {"entiteit_set.xml": {} }
            doc.annotations.append( (folia.AnnotationType.ENTITY, "entiteit_set.xml" ) )
            doc.annotationdefaults[folia.AnnotationType.MORPHOLOGICAL] = {"morph_set.xml": {} }
            doc.annotations.append( (folia.AnnotationType.MORPHOLOGICAL, "morph_set.xml" ) )

            entity = entities[index]
            # entities_words[i] collects the FoLiA words/morphemes belonging to
            # the i-th currently-open entity (relative to `index`).
            entities_words=[]
            inner_index=0
            entities_words.append([])

            # Entities are buffered per enclosing structure and flushed by
            # add_list_entities when that structure is closed.
            folia_entitiesLayer_par=[]
            folia_entitiesLayer_sen=[]
            folia_entitiesLayer_txt=[]


            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            offsets = [o for o in regex_sentence_boundary_gen(text)]
            for start, end, sentence in _text_by_offsets_gen(text, offsets):
                # An empty span right after a newline marks a paragraph break.
                if start == end and text[start-1] == '\n':
                    add_list_entities(paragraph, folia_entitiesLayer_par)
                    folia_entitiesLayer_par = []
                    paragraph = folia_text.append(folia.Paragraph)
                    par_start = start
                elif sentence != "" :
                    add_list_entities(folia_sentence, folia_entitiesLayer_sen)
                    folia_entitiesLayer_sen = []
                    folia_sentence = paragraph.append(folia.Sentence,sentence)
                offsetsw = [o for o in gtb_token_boundary_gen(sentence)]
                for tok in _text_by_offsets_gen(sentence, offsetsw):
                    entity = entities[index]
                    inner_index=0
                    folia_word = folia_sentence.append(folia.Word, tok[2])
                    # "" acts as a sentinel: a MorphologyLayer is created lazily
                    # on the first partial overlap of this word with an entity.
                    morph_layer= ""
                    #check if word is part of the entity and if so remember folia word
                    # (loops over all entities that may overlap the current one;
                    # entities are sorted by start offset)
                    while entity.start <= entities[index].end :
                        while( len(entities_words) <= inner_index ):
                                entities_words.append([])
                        for span_start, span_end in entity.spans:
                            # word fully inside the entity span
                            if ( span_start <= tok[0]+start and tok[1]+start <= span_end):
                                entities_words[inner_index].append(doc[folia_word.id])
                            #entity ends within the word
                            elif (tok[1]+start >= span_end and span_end > tok[0]+start) :
                                offset_start = span_start-(start+tok[0])
                                if offset_start <0 :# entity started before this word
                                    offset_start =0;
                                offset_end = span_end-(start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                            #entity starts within the word
                            elif (tok[1]+start > span_start and span_start >= tok[0]+start) :
                                offset_start = span_start-(start+tok[0])
                                offset_end = span_end-(start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                        inner_index = inner_index + 1
                        if len(entities) > index + inner_index :
                            entity = entities[index+inner_index]
                        else:
                            break
                    entity = entities[index]
                    inner_index = 0
                    #check for end of an entity and append entity to either text, paragraph or sentence depending on start of the entity
                    current_index = index
                    while entity.start <= entities[current_index].end :
                        # entity ends at (or before) this word and started at or
                        # before it: it is complete and can be materialised
                        if entity.end <= start + tok[1] and entity.start <= start + tok[0] :
                            # choose the narrowest structure that fully contains
                            # the entity: sentence, else paragraph, else text
                            if (entity.start >= start):
                                folia_entitiesLayer = folia_entitiesLayer_sen
                            elif (entity.start >= par_start):
                                folia_entitiesLayer = folia_entitiesLayer_par
                            else:
                                folia_entitiesLayer = folia_entitiesLayer_txt
                            if entities_words[inner_index]:
                                folia_entity = folia.Entity(doc, cls=entity.type, id=entity.id , contents=entities_words[inner_index])
                                folia_entitiesLayer.append(folia_entity)
                            elif not any(x.id == entity.id for x in folia_entitiesLayer):
                                #see if entity is already added; if it exists
                                #nowhere in the document, it was never matched
                                try:
                                    doc[entity.id]
                                except KeyError:
                                    raise EntityNotFoundError(entity)
                            if(inner_index == 0):
                                # the leading open entity is done: pop it and
                                # advance past any already-emptied successors
                                entities_words.pop(0)
                                if len(entities) > index+1 :
                                    index = index + 1
                                    for i in range(0, len(entities_words)):
                                        if(not entities_words[0]):
                                            entities_words.pop(0)
                                            index = index + 1
                                else:
                                    break

                            elif(inner_index > 0):
                                entities_words[inner_index]=[]
                                inner_index = inner_index + 1
                        else:
                            inner_index = inner_index + 1
                        if len(entities) > index + inner_index:
                            entity = entities[index+inner_index]
                        else:
                            break
            # flush whatever is still buffered at end of text
            add_list_entities(paragraph, folia_entitiesLayer_par)
            add_list_entities(folia_sentence, folia_entitiesLayer_sen)
            add_list_entities(folia_text, folia_entitiesLayer_txt)
            return doc
    except IOError:
        pass # Most likely a broken pipe