def word_positions(filename):
    """Return the token boundaries of the text in *filename* as a list.

    Reads the whole file via codecs.open (module-level import; default
    encoding behavior is unchanged) and materializes the output of
    gtb_token_boundary_gen — presumably (start, end) offset pairs per
    token; TODO confirm against the tokenise module.
    """
    with codecs.open(filename) as infile:
        text = infile.read()
    # list(gen) is the idiomatic (and faster) form of [o for o in gen]
    return list(gtb_token_boundary_gen(text))
def build_text_structure(ann,txt_file_path):
    '''
    Will split a text file in paragraphs, sentences and words and return the folia document
    For every word it will check 2 main things:
    1) is the word part of some entities? and if so it will add them to a list of lists of words
    2) is their an entity that ends with this word? if so it will create the entity with the
       right words out of the list and delete this element after it took the words out.
    After every sentence, paragraph all the entities that started and ended within that
    structure will be added into the EntityLayer

    :param ann: brat annotation object providing get_textbounds() (entities with
        .start, .end, .spans, .type, .id)
    :param txt_file_path: path of the plain-text document the annotations refer to
    :returns: the constructed folia.Document, or None if an IOError occurred
        (the except clause below swallows it and falls off the end)
    '''
    from annotation import open_textfile
    from tokenise import gtb_token_boundary_gen

    def add_list_entities(struct, folia_entities):
        # will check if any entities have to be added and add if needed;
        # also attaches every brat attribute of the entity as a folia Feature
        # (uses the enclosing scope's `doc` and `attributes` — closure, not params)
        if folia_entities:
            layer = struct.append(folia.EntitiesLayer)
            for folia_entity in folia_entities:
                layer.append(folia_entity)
                for attr in attributes[folia_entity.id]:
                    folia_entity.append(folia.Feature(doc, subset=attr.type, cls=str(attr.value)))

    try:
        # Sort entities on offset instead of id
        # (ties broken by -end so the longest entity comes first)
        entities = sorted(ann.get_textbounds(), key=lambda entity: (entity.start, -entity.end))
        index = 0
        doc = folia.Document(id='brat')
        attributes = build_entities_attr(ann)  # entity id -> list of brat attributes
        folia_text = doc.append(folia.Text)
        paragraph = folia_text.append(folia.Paragraph)
        folia_sentence = 0  # placeholder until the first real Sentence is created
        par_start = 0  # text offset where the current paragraph begins
        # fictive sets — declare dummy set URLs so folia accepts the annotations
        doc.annotationdefaults[folia.AnnotationType.ENTITY] = {"entiteit_set.xml": {} }
        doc.annotations.append( (folia.AnnotationType.ENTITY, "entiteit_set.xml" ) )
        doc.annotationdefaults[folia.AnnotationType.MORPHOLOGICAL] = {"morph_set.xml": {} }
        doc.annotations.append( (folia.AnnotationType.MORPHOLOGICAL, "morph_set.xml" ) )
        # NOTE(review): this indexes entities[0] unguarded — a document with no
        # textbounds would raise IndexError (not caught by the IOError handler below)
        entity = entities[index]
        # entities_words[i] collects the folia Words/Morphemes belonging to
        # entities[index + i] (entities currently "open" at this point in the text)
        entities_words=[]
        inner_index=0
        entities_words.append([])
        # entity layers pending per scope: paragraph / sentence / whole text
        folia_entitiesLayer_par=[]
        folia_entitiesLayer_sen=[]
        folia_entitiesLayer_txt=[]
        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()
        offsets = [o for o in regex_sentence_boundary_gen(text)]
        # each item is (start, end, sentence_text); an empty span preceded by a
        # newline marks a paragraph break
        for start, end, sentence in _text_by_offsets_gen(text, offsets):
            if start == end and text[start-1] == '\n':
                # paragraph boundary: flush entities that closed inside it,
                # then open a new Paragraph
                add_list_entities(paragraph, folia_entitiesLayer_par)
                folia_entitiesLayer_par = []
                paragraph = folia_text.append(folia.Paragraph)
                par_start = start
            elif sentence != "" :
                # new sentence: flush the previous sentence's entity layer first
                # (no-op on the first sentence — the list is still empty)
                add_list_entities(folia_sentence, folia_entitiesLayer_sen)
                folia_entitiesLayer_sen = []
                folia_sentence = paragraph.append(folia.Sentence,sentence)
                offsetsw = [o for o in gtb_token_boundary_gen(sentence)]
                # tok is (tok_start, tok_end, tok_text) relative to the sentence
                for tok in _text_by_offsets_gen(sentence, offsetsw):
                    entity = entities[index]
                    inner_index=0
                    folia_word = folia_sentence.append(folia.Word, tok[2])
                    morph_layer= ""  # created lazily on the first partial-word span
                    # check if word is part of the entity and if so remember folia word
                    # Scan forward over all entities that can still overlap this token:
                    # the sort order guarantees we may stop once entity.start passes
                    # the end of entities[index].
                    while entity.start <= entities[index].end :
                        # grow the per-entity word buckets on demand
                        while( len(entities_words) <= inner_index ):
                            entities_words.append([])
                        for span_start, span_end in entity.spans:
                            # token offsets are sentence-relative; +start makes them
                            # document-relative for comparison with entity spans
                            if ( span_start <= tok[0]+start and tok[1]+start <= span_end):
                                # whole token inside the span: remember the full Word
                                entities_words[inner_index].append(doc[folia_word.id])
                            # entity ends within the word
                            elif (tok[1]+start >= span_end and span_end > tok[0]+start) :
                                offset_start = span_start-(start+tok[0])
                                if offset_start <0 :  # entity started before this word
                                    offset_start =0;
                                offset_end = span_end-(start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                # represent the partial-word piece as a Morpheme
                                morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                            # entity starts within the word
                            elif (tok[1]+start > span_start and span_start >= tok[0]+start) :
                                offset_start = span_start-(start+tok[0])
                                offset_end = span_end-(start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                        inner_index = inner_index + 1
                        if len(entities) > index + inner_index :
                            entity = entities[index+inner_index]
                        else:
                            break
                    entity = entities[index]
                    inner_index = 0
                    # check for end of an entity and append entity to either text,
                    # paragraph or sentece depending on start of the entity
                    current_index = index
                    while entity.start <= entities[current_index].end :
                        # entity fully behind us (ended at/before this token's end)?
                        if entity.end <= start + tok[1] and entity.start <= start + tok[0] :
                            # choose the narrowest enclosing scope for its layer
                            if (entity.start >= start):
                                folia_entitiesLayer = folia_entitiesLayer_sen
                            elif (entity.start >= par_start):
                                folia_entitiesLayer = folia_entitiesLayer_par
                            else:
                                folia_entitiesLayer = folia_entitiesLayer_txt
                            if entities_words[inner_index]:
                                folia_entity = folia.Entity(doc, cls=entity.type, id=entity.id , contents=entities_words[inner_index])
                                folia_entitiesLayer.append(folia_entity)
                            elif not any(x.id == entity.id for x in folia_entitiesLayer):
                                # see if entity is already added
                                try:
                                    doc[entity.id]
                                except KeyError:
                                    raise EntityNotFoundError(entity)
                            if(inner_index == 0):
                                # the frontmost entity is closed: drop its bucket and
                                # advance past any already-emptied buckets that follow
                                entities_words.pop(0)
                                if len(entities) > index+1 :
                                    index = index + 1
                                    # NOTE(review): indentation reconstructed from a
                                    # whitespace-mangled source — this loop is read as
                                    # guarded by the len() check above; verify against
                                    # the original file.
                                    for i in range(0, len(entities_words)):
                                        if(not entities_words[0]):
                                            entities_words.pop(0)
                                            index = index + 1
                                        else:
                                            break
                            elif(inner_index > 0):
                                # a later (still-open-before-it) entity closed: just
                                # clear its bucket and keep scanning
                                entities_words[inner_index]=[]
                                inner_index = inner_index + 1
                        else:
                            inner_index = inner_index + 1
                        if len(entities) > index + inner_index:
                            entity = entities[index+inner_index]
                        else:
                            break
        # flush whatever is still pending at end of text
        add_list_entities(paragraph, folia_entitiesLayer_par)
        add_list_entities(folia_sentence, folia_entitiesLayer_sen)
        add_list_entities(folia_text, folia_entitiesLayer_txt)
        return doc
    except IOError:
        pass # Most likely a broken pipe