# NOTE: this excerpt keeps its module-level imports implicit; it assumes the
# standard library (os, re, time, codecs, datetime.timedelta) and the SEM
# storage classes (Document, Segmentation, Span, Annotation, Tag) are in scope.


def compile_chunks(sentence, column=-1, shift=0):
    """Compile the BIO labels of a sentence into Tag chunks."""
    entity_chunks = []
    label = u""
    start = -1
    for index, token in enumerate(sentence):
        ne = token[column]
        if ne == "O":
            if label:  # close the running chunk, upper bound is exclusive
                entity_chunks.append(Tag(start + shift, index + shift, label))
                label = u""
                start = -1
        elif ne[0] == "B":
            if label:  # the previous chunk ends right before this one
                entity_chunks.append(Tag(start + shift, index + shift, label))
            start = index
            label = ne[2:]
        elif ne[0] == "I":
            pass  # inside the running chunk: nothing to do
        else:
            raise ValueError(ne)
    if label:  # flush a chunk that runs to the end of the sentence
        entity_chunks.append(Tag(start + shift, len(sentence) + shift, label))
    return entity_chunks
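
# --- usage sketch (illustration only, not part of the original module) ---
# Assuming each token is indexable and `column` points at its BIO label:
#
#     sentence = [(u"Noam", u"B-PER"), (u"Chomsky", u"I-PER"), (u"writes", u"O")]
#     compile_chunks(sentence, column=-1)  # -> [Tag(0, 2, u"PER")]
#
# Bounds are token indices, lower inclusive and upper exclusive, both offset
# by `shift` when sentences are concatenated into a larger corpus.
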
def json_data(data):
    document = Document(data.get(u"name", u"_DOCUMENT_"),
                        content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)
    # First pass: build segmentations, keeping their reference as a name.
    for segmentation_name in data.get(u"segmentations", {}):
        d = data[u"segmentations"][segmentation_name]
        spans = [Span(lb=span[u"s"], ub=0, length=span[u"l"]) for span in d[u"spans"]]
        segmentation = Segmentation(segmentation_name, spans=spans,
                                    reference=d.get(u"reference", None))
        document.add_segmentation(segmentation)
    # Second pass: resolve reference names to the actual segmentation objects.
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(segmentation.reference)
    for annotation_name in data.get(u"annotations", {}):
        d = data[u"annotations"][annotation_name]
        annotations = [
            Tag(lb=annotation[u"s"], ub=0, length=annotation[u"l"], value=annotation[u"v"])
            for annotation in d[u"annotations"]
        ]
        annotation = Annotation(annotation_name,
                                reference=document.segmentation(d[u"reference"]),
                                annotations=annotations)
        document.add_annotation(annotation)
    return document
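
# --- data shape sketch (inferred from the accesses above, illustration only) ---
# `json_data` expects a dict of the following form, where "s" is a span start
# and "l" its length (the upper bound is recomputed from them):
#
#     data = {
#         u"name": u"doc1",
#         u"content": u"Some text...",
#         u"metadatas": {u"lang": u"fr"},
#         u"segmentations": {
#             u"tokens": {u"spans": [{u"s": 0, u"l": 4}]},
#             u"sentences": {u"reference": u"tokens",
#                            u"spans": [{u"s": 0, u"l": 1}]},
#         },
#         u"annotations": {
#             u"NER": {u"reference": u"tokens",
#                      u"annotations": [{u"s": 0, u"l": 1, u"v": u"PER"}]},
#         },
#     }
#     document = json_data(data)
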
def brat_file(filename, encoding="utf-8"):
    no_ext, ext = os.path.splitext(filename)
    txt_file = no_ext + ".txt"
    ann_file = no_ext + ".ann"
    if not (os.path.exists(txt_file) and os.path.exists(ann_file)):
        raise ValueError("missing either .ann or .txt file")

    document = Document(os.path.basename(txt_file), encoding=encoding,
                        mime_type="text/plain")
    document.content = codecs.open(txt_file, "rU", encoding).read().replace(u"\r", u"")
    annotations = Annotation("NER")
    for line in codecs.open(ann_file, "rU", encoding):
        line = line.strip()
        # Only text-bound annotations ("T" lines) carry character offsets.
        if line != u"" and line.startswith(u"T"):
            parts = line.split(u"\t")
            value, bounds = parts[1].split(" ", 1)
            # Discontinuous annotations separate their fragments with ";".
            for bound in bounds.split(";"):
                lb, ub = bound.split()
                annotations.append(Tag(lb=int(lb), ub=int(ub), value=value))
    annotations.sort()
    document.add_annotation(annotations)
    return document
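
# --- format sketch (illustration only) ---
# brat text-bound annotation lines, the only ones `brat_file` keeps, look like:
#
#     T1	Person 0 12	Noam Chomsky
#     T2	Location 20 26;31 35	a discontinuous span
#
# i.e. an id, then "<type> <start> <end>[;<start> <end>...]", then the surface
# form. Each "start end" fragment becomes one Tag in the "NER" annotation.
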
def make_data(indirname, default_shift=0, top_level=True):
    files = [f for f in sorted(os.listdir(indirname)) if f.endswith(".ann")]
    annotations = []
    contents = []
    shift = default_shift
    for filename in files:
        full_path = os.path.join(indirname, filename)
        document = brat_file(full_path)
        if top_level:
            # Keep only the outermost annotations, dropping nested ones.
            annots = get_top_level(document.annotation("NER"))
        else:
            annots = document.annotation("NER")
        annotations.extend(Tag(a.value, a.lb + shift, a.ub + shift) for a in annots)
        contents.append(document.content)
        # The next document starts after this content plus separator newlines.
        shift += NUM_NEWLINES + len(document.content)
    return contents, annotations, shift
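
# --- usage sketch (illustration only) ---
# Given a directory of paired brat files (doc1.ann/doc1.txt, ...), `make_data`
# returns the document texts, the annotations re-offset as if the texts were
# concatenated, and the offset at which a following batch would start:
#
#     contents, annotations, shift = make_data("corpus/train")
#     full_text = (u"\n" * NUM_NEWLINES).join(contents)
#
# The join above is an assumption: the shift arithmetic implies documents are
# separated by exactly NUM_NEWLINES newline characters.
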
def process_document(self, document, **kwargs):
    """
    Maps the values of an annotation in a document: every value found in
    self._mapping is replaced by its image, values mapped to the empty
    string are dropped, and unmapped values are kept as-is.

    Parameters
    ----------
    document : sem.storage.Document
        the input data. It is a document with an annotation named
        self._annotation_name
    log_level : str or int
        the logging level
    log_file : str
        if not None, the file to log to (does not remove command-line
        logging).
    """
    start = time.time()
    if self._log_file is not None:
        map_annotations_logger.addHandler(file_handler(self._log_file))
    map_annotations_logger.setLevel(self._log_level)

    ref_annotation = document.annotation(self._annotation_name)
    ref_annotations = ref_annotation.annotations
    new_annotations = [
        Tag(annotation.lb, annotation.ub,
            self._mapping.get(annotation.value, annotation.value))
        for annotation in ref_annotations
        if self._mapping.get(annotation.value, None) != u""
    ]
    document.add_annotation(
        Annotation(self._annotation_name,
                   reference=ref_annotation.reference,
                   annotations=new_annotations)
    )

    laps = time.time() - start
    map_annotations_logger.info("in %s", timedelta(seconds=laps))
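
# --- behavior sketch (illustration only) ---
# With self._mapping = {u"Person": u"PER", u"Junk": u""}, an input annotation
# [Tag(0, 5, u"Person"), Tag(8, 12, u"Junk"), Tag(20, 24, u"LOC")] becomes
# [Tag(0, 5, u"PER"), Tag(20, 24, u"LOC")]: "Person" is renamed, "Junk" is
# dropped (mapped to the empty string), and "LOC" passes through unchanged.
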
def gate_data(data, name=None):
    document = Document(name or "__DOCUMENT__", mime_type="text/plain")

    textwithnodes = data.findall("TextWithNodes")[0]
    annotation_sets = data.findall("AnnotationSet")

    # GATE anchors annotations on <Node id="..."/> elements interleaved with
    # the text; record the character offset at which each node occurs.
    text_parts = [textwithnodes.text or u""]
    nodes = {}
    for node in list(textwithnodes):
        nodes[int(node.attrib["id"])] = sum(len(part) for part in text_parts)
        text_parts.append(node.tail or u"")
    document.content = u"".join(text_parts)

    for annotation_set in annotation_sets:
        annotation_name = annotation_set.attrib["Name"]
        sem_annotation = Annotation(annotation_name)
        for annotation in annotation_set:
            lb = nodes[int(annotation.attrib["StartNode"])]
            ub = nodes[int(annotation.attrib["EndNode"])]
            sem_annotation.append(Tag(lb, ub, annotation.attrib["Type"]))
        document.add_annotation(sem_annotation)

    return document
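
# --- input sketch (GATE standoff XML, illustration only) ---
# `gate_data` expects an ElementTree element shaped like:
#
#     <GateDocument>
#       <TextWithNodes>Hello <Node id="6"/>world<Node id="11"/>!</TextWithNodes>
#       <AnnotationSet Name="NER">
#         <Annotation Type="Greeting" StartNode="6" EndNode="11"/>
#       </AnnotationSet>
#     </GateDocument>
#
# e.g. data = xml.etree.ElementTree.parse("doc.xml").getroot(). Node ids are
# mapped to character offsets, so the annotation above covers "world".
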
def detect_abbreviations(document, field):
    content = document.content
    word_spans = document.segmentation("tokens")
    if document.segmentation("sentences") is not None:
        sentence_spans = document.segmentation("sentences").spans
        sentence_spans_ref = document.segmentation("sentences").get_reference_spans()
    else:
        # Fallback: treat the whole document as a single sentence. The span
        # over token indices is an assumption added here so that the
        # position2sentence lookup below cannot fail with a NameError.
        sentence_spans = [Span(0, len(word_spans))]
        sentence_spans_ref = [Span(0, len(document.content))]
    tokens = [content[span.lb:span.ub] for span in word_spans]
    annotations = document.annotation(field).get_reference_annotations()

    # Count abbreviation candidates that are not adjacent to other candidates
    # (sequences of candidates are typically initials, not abbreviations).
    counts = {}
    positions = {}
    for i, token in enumerate(tokens):
        if abbrev_candidate(token) and len(token) > 1 and not (
            (i > 0 and abbrev_candidate(tokens[i - 1]))
            or (i < len(tokens) - 1 and abbrev_candidate(tokens[i + 1]))
        ):
            if token not in counts:
                counts[token] = 0
                positions[token] = []
            counts[token] += 1
            positions[token].append(i)

    # Map each candidate position to the character span of its sentence.
    position2sentence = {}
    for token, indices in positions.items():
        for index in indices:
            for i, span in enumerate(sentence_spans):
                if span.lb <= index and span.ub >= index:
                    position2sentence[index] = sentence_spans_ref[i]

    reg2type = {}
    for key in counts:
        # Look in the left context, within the sentence, for a character
        # sequence matching the abbreviation: its probable expanded form.
        all_solutions = []
        for position in positions[key]:
            span = position2sentence[position]
            word_span = word_spans[position]
            lb = span.lb
            ub = word_span.lb
            solutions = longest_common_substring(content[lb:ub], tokens[position],
                                                 casesensitive=False)
            if solutions == []:
                solutions = longest_common_substring(normalize(content[lb:ub]),
                                                     tokens[position],
                                                     casesensitive=False)
            solutions = [
                solution for solution in solutions
                if len(solution) == len(tokens[position])
            ]
            if len(solutions) > 0:
                all_solutions.extend(
                    [[(x + lb, y + lb) for (x, y) in solution] for solution in solutions]
                )
        if len(all_solutions) > 0:
            # The best solution is the most compact one.
            all_solutions.sort(key=lambda x: x[-1][0] - x[0][0])
            best_solution = all_solutions[0]
            lo = best_solution[0][0]
            hi = best_solution[-1][0]
            lo_tokens = [tok for tok in word_spans if tok.lb <= lo < tok.ub]
            hi_tokens = [tok for tok in word_spans if tok.lb <= hi < tok.ub]
            abbrev_annots = []
            for position in positions[key]:
                span = word_spans[position]
                abbrev_annots.extend([
                    annotation for annotation in annotations
                    if annotation.lb == span.lb and annotation.ub == span.ub
                ])
            try:
                toks = tokens_from_bounds(document, lo_tokens[0].lb, hi_tokens[0].ub)
                reg = tokens2regex(toks, re.U + re.I)
                for match in reg.finditer(content):
                    annots = [
                        annotation for annotation in annotations
                        if (annotation.lb <= match.start() <= annotation.ub)
                        or (annotation.lb <= match.end() <= annotation.ub)
                    ]
                    if len(annots) > 0:
                        annot = annots[0]
                        new_toks = tokens_from_bounds(
                            document,
                            min(annot.lb, match.start()),
                            max(annot.ub, match.end()),
                        )
                        new_reg = tokens2regex(new_toks, re.U + re.I)
                        if new_reg.pattern not in reg2type:
                            reg2type[new_reg.pattern] = []
                        reg2type[new_reg.pattern].append(annot.value)
                        if abbrev_annots == []:
                            # The abbreviation itself is not annotated yet:
                            # make it inherit the type of its expanded form.
                            abbrev_reg = tokens2regex([key], re.U)
                            if abbrev_reg.pattern not in reg2type:
                                reg2type[abbrev_reg.pattern] = []
                            reg2type[abbrev_reg.pattern].append(annot.value)
                if len(abbrev_annots) > 0:
                    # The abbreviation is annotated: propagate its type to
                    # the expanded form.
                    tag = abbrev_annots[0]
                    new_reg = tokens2regex(toks, re.U + re.I)
                    if new_reg.pattern not in reg2type:
                        reg2type[new_reg.pattern] = []
                    reg2type[new_reg.pattern].append(tag.value)
            except IndexError:
                pass

    # For each regex, keep its most frequent type and tag every match with
    # token-level bounds.
    new_tags = []
    for pattern in reg2type.keys():
        type_counts = sorted(
            [(the_type, reg2type[pattern].count(the_type))
             for the_type in set(reg2type[pattern])],
            key=lambda x: (-x[-1], x[0]),
        )
        fav_type = type_counts[0][0]
        regexp = re.compile(pattern, re.U + (re.I if u" " in pattern else 0))
        for match in regexp.finditer(content):
            lo_tok = word_spans.spans.index(
                [t for t in word_spans if t.lb == match.start()][0]
            )
            hi_tok = word_spans.spans.index(
                [t for t in word_spans if t.ub == match.end()][0]
            ) + 1
            new_tags.append(Tag(lo_tok, hi_tok, fav_type))

    # Remove annotations that the new tags supersede.
    to_remove_tags = []
    for new_tag in new_tags:
        to_remove_tags.extend([
            ann for ann in document.annotation(field)
            if new_tag.lb <= ann.lb and ann.ub <= new_tag.ub
            and ann.value == new_tag.value
        ])
    for to_remove_tag in to_remove_tags:
        try:
            document.annotation(field)._annotations.remove(to_remove_tag)
        except ValueError:
            pass

    # Rewrite the BIO labels of the corpus sentences and rebuild the
    # document-level annotation from them.
    all_tags = [[token[field] for token in sent] for sent in document.corpus.sentences]
    new_tags.sort(key=lambda x: (x.lb, -x.ub))
    for new_tag in new_tags:
        nth_word = 0
        nth_sent = 0
        sents = document.corpus.sentences
        while nth_word + len(sents[nth_sent]) - 1 < new_tag.lb:
            nth_word += len(sents[nth_sent])
            nth_sent += 1
        start = new_tag.lb - nth_word
        end = new_tag.ub - nth_word
        document.corpus.sentences[nth_sent][start][field] = u"B-%s" % new_tag.value
        all_tags[nth_sent][start] = u"B-%s" % new_tag.value
        for index in range(start + 1, end):
            document.corpus.sentences[nth_sent][index][field] = u"I-%s" % new_tag.value
            all_tags[nth_sent][index] = u"I-%s" % new_tag.value
    document.add_annotation_from_tags(all_tags, field, field)
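
# --- usage sketch (illustration only) ---
# `detect_abbreviations` post-processes an annotated document: it finds
# abbreviation tokens (per `abbrev_candidate`), matches them against their
# probable expanded forms in the left context of the same sentence, then
# propagates entity types between abbreviation and expansion in both
# directions before rewriting the BIO column `field`:
#
#     detect_abbreviations(document, u"NER")
#
# The document is modified in place; no value is returned.
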
def __call__(self, list2dict, token_entry=None, annot_entry=None, *args, **kwargs):
    tags = [u"O" for _ in range(len(list2dict))]
    form2entity = self._form2entity
    tmp = self._value._data
    length = len(list2dict)
    fst = 0
    lst = -1  # last match found
    cur = 0
    entry = token_entry if token_entry is not None else self._entry
    ckey = None  # current key
    entities = []
    # Greedy longest-match lookup of multiword entries in the token trie:
    # advance `cur` while the tokens from `fst` onwards form a prefix of an
    # entry; NUL marks that the path walked so far is a complete entry.
    while fst < length - 1:
        cont = True
        while cont and (cur < length):
            ckey = list2dict[cur][entry]
            if tags[cur] == "O":
                if NUL in tmp:
                    lst = cur
                tmp = tmp.get(ckey, {})
                cont = len(tmp) != 0
                cur += int(cont)
            else:
                cont = False
        if NUL in tmp:
            lst = cur
        if lst != -1:
            form = u" ".join([list2dict[i][entry] for i in range(fst, lst)])
            entities.append(Tag(form2entity[form], fst, lst))
            fst = lst
            cur = fst
        else:
            fst += 1
            cur = fst
        tmp = self._value._data
        lst = -1
    # The main loop stops before the last token: check it separately for a
    # single-token entry.
    if NUL in self._value._data.get(list2dict[-1][entry], []):
        entities.append(Tag(form2entity[list2dict[-1][entry]],
                            len(list2dict) - 1, len(list2dict)))

    ne_entry = annot_entry if annot_entry is not None else self._ne_entry
    gold = chunk_annotation_from_sentence(list2dict, ne_entry).annotations
    # Drop dictionary entities that exactly match a gold annotation...
    for i in reversed(range(len(entities))):
        e = entities[i]
        for r in gold:
            if r.lb == e.lb and r.ub == e.ub:
                del entities[i]
                break
    # ... and gold annotations strictly contained in a dictionary entity.
    for i in reversed(range(len(gold))):
        r = gold[i]
        for e in entities:
            if r.lb >= e.lb and r.ub <= e.ub:
                del gold[i]
                break
    for r in gold + entities:
        appendice = u"-" + r.value
        tags[r.lb] = u"B" + appendice
        for i in range(r.lb + 1, r.ub):
            tags[i] = u"I" + appendice
    return tags
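
# --- data-structure sketch (illustration only) ---
# self._value._data is assumed to be a token trie whose NUL key flags the end
# of a dictionary entry, and self._form2entity maps a matched surface form
# (tokens joined by single spaces) to its entity type, e.g.:
#
#     _data = {u"New": {u"York": {NUL: {}, u"City": {NUL: {}}}}}
#     _form2entity = {u"New York": u"LOC", u"New York City": u"LOC"}
#
# On the token sequence [New, York, City, ...] the longest match wins and the
# returned list starts with [u"B-LOC", u"I-LOC", u"I-LOC", ...].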