import codecs
import os


def load(filename):
    """Construct a new Story on the basis of some input file."""
    filepath, _ = os.path.splitext(filename)
    story_id = os.path.basename(filepath)

    characters, locations, entities = read_annotation_file(filepath + '.ann')
    characters = [Entity(i, character)
                  for i, character in enumerate(characters)]
    locations = [Entity(i, location)
                 for i, location in enumerate(locations)]

    # Split the raw text into scenes, one per detected sentence.
    scenes = []
    with codecs.open(filepath + '.txt', encoding='utf-8') as infile:
        for start, end in regex_sentence_boundary_gen(infile.read()):
            scenes.append(Scene(start, end))

    # Attach a character or location to every scene whose span fully
    # contains one of its mentions.
    for scene in scenes:
        for character in characters:
            for mention, _, _, _ in character.chain:
                if (entities[mention].start >= scene.start
                        and entities[mention].end <= scene.end):
                    scene.characters.add(character)
        for location in locations:
            for mention, _, _, _ in location.chain:
                if (entities[mention].start >= scene.start
                        and entities[mention].end <= scene.end):
                    scene.locations.add(location)

    return Story(story_id, filepath, scenes, set(characters), set(locations))
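# Minimal usage sketch for load(). The file name is hypothetical, and the
# attribute names on Story are assumed to mirror its constructor arguments;
# adjust to the real class definitions.
def print_story_summary(filename='example.txt'):
    story = load(filename)  # expects example.ann next to example.txt
    print('story %s: %d scenes, %d characters, %d locations' % (
        story.id, len(story.scenes), len(story.characters),
        len(story.locations)))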
def generate_sentence_boundaries(doc):
    offsets = []
    for start_offset, end_offset in ssplit.regex_sentence_boundary_gen(doc):
        # Skip empty lines
        if doc[start_offset:end_offset].strip():
            offsets.append((start_offset, end_offset))
    return offsets
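# Quick sanity check on a small document. Exact offsets depend on the
# regular expressions in ssplit, so treat the printed spans as illustrative
# rather than guaranteed.
def _demo_boundaries():
    doc = "First sentence. Second sentence.\n\nThird one."
    for start, end in generate_sentence_boundaries(doc):
        print(start, end, repr(doc[start:end]))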
def sentencebreaks_to_newlines(text):
    junk_mark = False
    offsets = [o for o in regex_sentence_boundary_gen(text)]

    # break into sentences
    sentences = [s for s in _text_by_offsets_gen(text, offsets)]

    # join up, adding a newline for space where possible
    orig_parts = []
    new_parts = []

    sentnum = len(sentences)
    for i in range(sentnum):
        sent = sentences[i]
        orig_parts.append(sent)
        new_parts.append(sent)

        if i < sentnum - 1:
            orig_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

            if (offsets[i][1] < offsets[i + 1][0]
                    and text[offsets[i][1]].isspace()):
                # intervening space; can add newline
                new_parts.append(
                    '\n' + text[offsets[i][1] + 1:offsets[i + 1][0]])
            else:
                new_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

    if len(offsets) and offsets[-1][1] < len(text):
        orig_parts.append(text[offsets[-1][1]:])
        new_parts.append(text[offsets[-1][1]:])

    # sanity check
    if text != ''.join(orig_parts):
        print("INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" %
              (text, ''.join(orig_parts)))
        junk_mark = True

    splittext = ''.join(new_parts)

    # sanity
    if len(text) != len(splittext):
        print("INTERNAL ERROR")
        junk_mark = True
    if _normspace(text) != _normspace(splittext):
        print("INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" %
              (_normspace(text), _normspace(splittext)))
        junk_mark = True

    return splittext, junk_mark
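# This variant signals corruption through a flag instead of raising, so
# callers must unpack a (text, junk) pair:
def _demo_junk_mark():
    splittext, junk = sentencebreaks_to_newlines("One sentence. Another.")
    if not junk:
        print(splittext)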
def sentencebreaks_to_newlines(text):
    offsets = [o for o in regex_sentence_boundary_gen(text)]

    # break into sentences
    sentences = [s for s in _text_by_offsets_gen(text, offsets)]

    # join up, adding a newline for space where possible
    orig_parts = []
    new_parts = []

    sentnum = len(sentences)
    for i in range(sentnum):
        sent = sentences[i]
        orig_parts.append(sent)
        new_parts.append(sent)

        if i < sentnum - 1:
            gap = text[offsets[i][1]:offsets[i + 1][0]]
            orig_parts.append(gap)

            if gap.startswith('\r\n'):
                # DOS-style line break: replace both characters with
                # newlines so the overall text length is preserved
                new_parts.append('\n\n' + gap[2:])
            elif gap and gap[0].isspace():
                # intervening space; can add newline
                new_parts.append('\n' + gap[1:])
            else:
                new_parts.append(gap)

    if len(offsets) and offsets[-1][1] < len(text):
        orig_parts.append(text[offsets[-1][1]:])
        new_parts.append(text[offsets[-1][1]:])

    # sanity check
    assert text == ''.join(orig_parts), \
        "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (text, ''.join(orig_parts))

    splittext = ''.join(new_parts)

    # sanity
    assert len(text) == len(splittext), "INTERNAL ERROR"
    assert _normspace(text) == _normspace(splittext), \
        "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (
            _normspace(text), _normspace(splittext))

    return splittext
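# The CRLF-aware variant keeps the output exactly as long as the input,
# even for DOS-style line endings (sketch; the actual split points depend
# on regex_sentence_boundary_gen).
def _demo_crlf():
    text = "One sentence.\r\nAnother sentence."
    out = sentencebreaks_to_newlines(text)
    assert len(out) == len(text)
    print(repr(out))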
def generate_sentence_boundaries(doc):
    offsets = []
    for start_offset, end_offset in ssplit.regex_sentence_boundary_gen(doc):
        # Skip empty lines
        if doc[start_offset:end_offset].strip():
            # Narrow the span so it excludes leading and trailing spaces.
            while doc[start_offset] == " ":
                start_offset += 1
            while doc[end_offset - 1] == " ":
                end_offset -= 1
            assert start_offset < end_offset
            offsets.append((start_offset, end_offset))
    return offsets
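# Unlike the plain variant above, this one narrows each span so the
# returned offsets never point at surrounding spaces:
def _demo_trimmed_boundaries():
    doc = "  Padded sentence.   Next one. "
    for start, end in generate_sentence_boundaries(doc):
        print(repr(doc[start:end]))  # no leading or trailing spaces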
def sentencebreaks_to_newlines(text):
    offsets = [o for o in regex_sentence_boundary_gen(text)]

    # break into sentences
    sentences = [s for s in _text_by_offsets_gen(text, offsets)]

    # join up, adding a newline for space where possible
    orig_parts = []
    new_parts = []

    sentnum = len(sentences)
    for i in range(sentnum):
        sent = sentences[i]
        orig_parts.append(sent)
        new_parts.append(sent)

        if i < sentnum - 1:
            orig_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

            if (offsets[i][1] < offsets[i + 1][0]
                    and text[offsets[i][1]].isspace()):
                # intervening space; can add newline
                new_parts.append(
                    '\n' + text[offsets[i][1] + 1:offsets[i + 1][0]])
            else:
                new_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

    if len(offsets) and offsets[-1][1] < len(text):
        orig_parts.append(text[offsets[-1][1]:])
        new_parts.append(text[offsets[-1][1]:])

    # sanity check
    assert text == ''.join(orig_parts), \
        "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (text, ''.join(orig_parts))

    splittext = ''.join(new_parts)

    # sanity
    assert len(text) == len(splittext), "INTERNAL ERROR"
    assert _normspace(text) == _normspace(splittext), \
        "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (
            _normspace(text), _normspace(splittext))

    return splittext
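# Typical use: align sentence boundaries with newlines before feeding the
# text to line-oriented tools. Because the length is asserted to be
# unchanged, standoff annotation offsets stay valid:
def _demo_newlines():
    print(sentencebreaks_to_newlines("Hello world. How are you?"))
    # expected: "Hello world.\nHow are you?"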
def build_text_structure(ann, txt_file_path):
    '''
    Split a text file into paragraphs, sentences and words and return the
    folia document. For every word it checks two main things:
    1) Is the word part of some entities? If so, the word is added to a
       list of lists of words.
    2) Is there an entity that ends with this word? If so, the entity is
       created with the right words out of the list, and that element is
       deleted once the words are taken out.
    After every sentence and paragraph, all entities that started and ended
    within that structure are added to its EntitiesLayer.
    '''
    from annotation import open_textfile
    from tokenise import gtb_token_boundary_gen

    def add_list_entities(struct, folia_entities):
        # will check if any entities have to be added and add if needed
        if folia_entities:
            layer = struct.append(folia.EntitiesLayer)
            for folia_entity in folia_entities:
                layer.append(folia_entity)
                for attr in attributes[folia_entity.id]:
                    folia_entity.append(folia.Feature(
                        doc, subset=attr.type, cls=str(attr.value)))

    try:
        # Sort entities on offset instead of id
        entities = sorted(ann.get_textbounds(),
                          key=lambda entity: (entity.start, -entity.end))
        index = 0
        doc = folia.Document(id='brat')
        attributes = build_entities_attr(ann)
        folia_text = doc.append(folia.Text)
        paragraph = folia_text.append(folia.Paragraph)
        folia_sentence = 0
        par_start = 0

        # fictive sets
        doc.annotationdefaults[folia.AnnotationType.ENTITY] = {
            "entiteit_set.xml": {}}
        doc.annotations.append(
            (folia.AnnotationType.ENTITY, "entiteit_set.xml"))
        doc.annotationdefaults[folia.AnnotationType.MORPHOLOGICAL] = {
            "morph_set.xml": {}}
        doc.annotations.append(
            (folia.AnnotationType.MORPHOLOGICAL, "morph_set.xml"))

        entity = entities[index]
        entities_words = []
        inner_index = 0
        entities_words.append([])
        folia_entitiesLayer_par = []
        folia_entitiesLayer_sen = []
        folia_entitiesLayer_txt = []

        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()

        offsets = [o for o in regex_sentence_boundary_gen(text)]
        # NOTE: this variant of _text_by_offsets_gen is expected to yield
        # (start, end, text) triples rather than bare sentence strings.
        for start, end, sentence in _text_by_offsets_gen(text, offsets):
            if start == end and text[start - 1] == '\n':
                # blank line: close the current paragraph
                add_list_entities(paragraph, folia_entitiesLayer_par)
                folia_entitiesLayer_par = []
                paragraph = folia_text.append(folia.Paragraph)
                par_start = start
            elif sentence != "":
                add_list_entities(folia_sentence, folia_entitiesLayer_sen)
                folia_entitiesLayer_sen = []
                folia_sentence = paragraph.append(folia.Sentence, sentence)
                offsetsw = [o for o in gtb_token_boundary_gen(sentence)]
                for tok in _text_by_offsets_gen(sentence, offsetsw):
                    entity = entities[index]
                    inner_index = 0
                    folia_word = folia_sentence.append(folia.Word, tok[2])
                    morph_layer = None
                    # check if the word is part of the entity and if so
                    # remember the folia word
                    while entity.start <= entities[index].end:
                        while len(entities_words) <= inner_index:
                            entities_words.append([])
                        for span_start, span_end in entity.spans:
                            if (span_start <= tok[0] + start
                                    and tok[1] + start <= span_end):
                                # word lies entirely within the entity span
                                entities_words[inner_index].append(
                                    doc[folia_word.id])
                            elif (tok[1] + start >= span_end
                                    and span_end > tok[0] + start):
                                # entity ends within the word
                                offset_start = span_start - (start + tok[0])
                                if offset_start < 0:
                                    # entity started before this word
                                    offset_start = 0
                                offset_end = span_end - (start + tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(
                                        folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(
                                    doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(
                                    doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(
                                    doc[morph.id])
                            elif (tok[1] + start > span_start
                                    and span_start >= tok[0] + start):
                                # entity starts within the word
                                offset_start = span_start - (start + tok[0])
                                offset_end = span_end - (start + tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(
                                        folia.MorphologyLayer)
                                morph = morph_layer.append(folia.Morpheme(
                                    doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(
                                    doc, value=string, offset=offset_start))
                                entities_words[inner_index].append(
                                    doc[morph.id])
                        inner_index = inner_index + 1
                        if len(entities) > index + inner_index:
                            entity = entities[index + inner_index]
                        else:
                            break

                    entity = entities[index]
                    inner_index = 0
                    # check for the end of an entity and append the entity
                    # to either text, paragraph or sentence depending on
                    # where the entity started
                    current_index = index
                    while entity.start <= entities[current_index].end:
                        if (entity.end <= start + tok[1]
                                and entity.start <= start + tok[0]):
                            if entity.start >= start:
                                folia_entitiesLayer = folia_entitiesLayer_sen
                            elif entity.start >= par_start:
                                folia_entitiesLayer = folia_entitiesLayer_par
                            else:
                                folia_entitiesLayer = folia_entitiesLayer_txt
                            if entities_words[inner_index]:
                                folia_entity = folia.Entity(
                                    doc, cls=entity.type, id=entity.id,
                                    contents=entities_words[inner_index])
                                folia_entitiesLayer.append(folia_entity)
                            elif not any(x.id == entity.id
                                         for x in folia_entitiesLayer):
                                # see if entity is already added
                                try:
                                    doc[entity.id]
                                except KeyError:
                                    raise EntityNotFoundError(entity)
                            if inner_index == 0:
                                entities_words.pop(0)
                                if len(entities) > index + 1:
                                    index = index + 1
                                for i in range(0, len(entities_words)):
                                    if not entities_words[0]:
                                        entities_words.pop(0)
                                        index = index + 1
                                    else:
                                        break
                            elif inner_index > 0:
                                entities_words[inner_index] = []
                                inner_index = inner_index + 1
                        else:
                            inner_index = inner_index + 1
                        if len(entities) > index + inner_index:
                            entity = entities[index + inner_index]
                        else:
                            break

        add_list_entities(paragraph, folia_entitiesLayer_par)
        add_list_entities(folia_sentence, folia_entitiesLayer_sen)
        add_list_entities(folia_text, folia_entitiesLayer_txt)
        return doc
    except IOError:
        pass  # Most likely a broken pipe
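# Hypothetical driver for build_text_structure(). TextAnnotations is brat's
# annotation store (an assumption based on the imports above) and
# Document.save() is pynlpl's folia serializer; adapt paths and names to
# the actual project layout.
def export_to_folia(doc_base):
    from annotation import TextAnnotations
    ann = TextAnnotations(doc_base)  # reads doc_base + '.ann'
    doc = build_text_structure(ann, doc_base + '.txt')
    if doc is not None:
        doc.save(doc_base + '.folia.xml')
    return doc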