Example #1
def compile_chunks(sentence, column=-1, shift=0):
    # turn the BIO labels found in `column` into entity chunks
    entity_chunks = []
    label = u""
    start = -1
    for index, token in enumerate(sentence):
        ne = token[column]
        
        if ne == "O":
            if label:
                entity_chunks.append(Tag(start+shift, index+shift, label))
                label = u""
                start = -1
        elif ne[0] == "B":
            if label:
                entity_chunks.append(Tag(start+shift, index+shift, label))
            start = index
            label = ne[2:]
        elif ne[0] == "I":
            pass  # inside the current entity: nothing to close
        else:
            raise ValueError(ne)
    # flush an entity that runs to the end of the sentence
    if label:
        entity_chunks.append(Tag(start+shift, len(sentence)+shift, label))
    
    return entity_chunks
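A minimal usage sketch (hypothetical input; assumes the Tag(lb, ub, value) constructor used above and CoNLL-style token rows whose last column is the BIO label):

sentence = [
    [u"John", u"B-PER"],
    [u"Smith", u"I-PER"],
    [u"visited", u"O"],
    [u"Paris", u"B-LOC"],
]
chunks = compile_chunks(sentence, column=-1)
# -> [Tag(0, 2, u"PER"), Tag(3, 4, u"LOC")]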
Example #2
def json_data(data):
    document = Document(data.get(u"name", u"_DOCUMENT_"),
                        content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)

    for segmentation_name in data.get(u"segmentations", {}):
        d = data[u"segmentations"][segmentation_name]
        spans = [
            # ub is left at 0: the span's extent comes from its length
            Span(lb=span[u"s"], ub=0, length=span[u"l"])
            for span in d[u"spans"]
        ]
        segmentation = Segmentation(segmentation_name,
                                    spans=spans,
                                    reference=d.get(u"reference", None))
        document.add_segmentation(segmentation)
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(
                segmentation.reference)

    for annotation_name in data.get(u"annotations", {}):
        d = data[u"annotations"][annotation_name]
        annotations = [
            Tag(lb=annotation[u"s"],
                ub=0,
                length=annotation[u"l"],
                value=annotation[u"v"]) for annotation in d[u"annotations"]
        ]
        annotation = Annotation(annotation_name,
                                reference=document.segmentation(
                                    d[u"reference"]),
                                annotations=annotations)
        document.add_annotation(annotation)

    return document
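A usage sketch with hypothetical data; the key names follow the format read above ("s" = start offset, "l" = length, "v" = value):

data = {
    u"name": u"example",
    u"content": u"John Smith",
    u"metadatas": {u"lang": u"en"},
    u"segmentations": {
        u"tokens": {u"spans": [{u"s": 0, u"l": 4}, {u"s": 5, u"l": 5}]},
    },
    u"annotations": {
        u"NER": {
            u"reference": u"tokens",
            u"annotations": [{u"s": 0, u"l": 2, u"v": u"PER"}],
        },
    },
}
document = json_data(data)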
Example #3
def brat_file(filename, encoding="utf-8"):
    no_ext, ext = os.path.splitext(filename)
    txt_file = no_ext + ".txt"
    ann_file = no_ext + ".ann"
    if not (os.path.exists(txt_file) and os.path.exists(ann_file)):
        raise ValueError("missing either .ann or .txt file")

    document = Document(os.path.basename(txt_file),
                        encoding=encoding,
                        mime_type="text/plain")
    document.content = codecs.open(txt_file, "rU",
                                   encoding).read().replace(u"\r", u"")
    annotations = Annotation("NER")
    for line in codecs.open(ann_file, "rU", encoding):
        line = line.strip()
        if line != u"" and line.startswith(u'T'):
            parts = line.split(u"\t")
            value, bounds = parts[1].split(" ", 1)
            for bound in bounds.split(";"):
                lb, ub = bound.split()
                lb = int(lb)
                ub = int(ub)
                annotations.append(Tag(lb=lb, ub=ub, value=value))
    annotations.sort()
    document.add_annotation(annotations)

    return document
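A usage sketch (hypothetical paths; passing either the .ann or the .txt file works, since only the extension is stripped):

document = brat_file("corpus/doc.ann")  # expects corpus/doc.txt next to it
for tag in document.annotation("NER"):
    print(tag.lb, tag.ub, tag.value)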
Example #4
def make_data(indirname, default_shift=0, top_level=True):
    files = [f for f in sorted(os.listdir(indirname)) if f.endswith(".ann")]
    annotations = []
    contents = []
    shift = default_shift
    for filename in files:
        full_path = os.path.join(indirname, filename)
        document = brat_file(full_path)
        if top_level:
            annotations.extend([
                Tag(a.value, a.lb + shift, a.ub + shift)
                for a in get_top_level(document.annotation("NER"))
            ])
        else:
            annotations.extend([
                Tag(a.value, a.lb + shift, a.ub + shift)
                for a in document.annotation("NER")
            ])
        contents.append(document.content)
        # keep offsets global: each document is followed by NUM_NEWLINES
        # newline characters when contents are later joined
        shift += NUM_NEWLINES + len(document.content)

    return contents, annotations, shift
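A usage sketch (hypothetical directory; NUM_NEWLINES is the module-level constant used above as glue between documents):

contents, annotations, shift = make_data("corpus/")
text = (u"\n" * NUM_NEWLINES).join(contents)
# `annotations` bounds are global offsets into `text`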
Example #5
    def process_document(self, document, **kwargs):
        """
        Updates a document with various segmentations and creates
        an sem.corpus (CoNLL-formatted data) using field argument as index.
        
        Parameters
        ----------
        document : sem.storage.Document
            the input data. It is a document with only a content
        log_level : str or int
            the logging level
        log_file : str
            if not None, the file to log to (does not remove command-line
            logging).
        """

        start = time.time()

        if self._log_file is not None:
            map_annotations_logger.addHandler(file_handler(self._log_file))
        map_annotations_logger.setLevel(self._log_level)

        ref_annotation = document.annotation(self._annotation_name)
        ref_annotations = ref_annotation.annotations
        new_annotations = [
            Tag(annotation.lb, annotation.ub,
                self._mapping.get(annotation.value, annotation.value))
            for annotation in ref_annotations
            if self._mapping.get(annotation.value, None) != u""
        ]

        document.add_annotation(
            Annotation(self._annotation_name,
                       reference=ref_annotation.reference,
                       annotations=new_annotations))

        laps = time.time() - start
        map_annotations_logger.info('in %s' % (timedelta(seconds=laps)))
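A usage sketch with a hypothetical module instance, built elsewhere with mapping={u"PERS": u"PER", u"MISC": u""}: "PERS" tags are renamed, "MISC" tags are dropped, everything else passes through:

# `module` and `document` are hypothetical, prepared elsewhere
module.process_document(document)
renamed = document.annotation(module._annotation_name)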
Example #6
def gate_data(data, name=None):
    document = Document(name or "__DOCUMENT__", mime_type="text/plain")

    textwithnodes = data.findall("TextWithNodes")[0]
    annotation_sets = data.findall("AnnotationSet")

    # GATE anchors annotations on numbered "Node" elements interleaved
    # with the text: record each node's character offset while
    # rebuilding the raw content
    text_parts = [textwithnodes.text or u""]
    nodes = {}
    for node in list(textwithnodes):
        nodes[int(node.attrib["id"])] = sum([len(part) for part in text_parts])
        text_parts.append(node.tail or u"")
    document.content = u"".join(text_parts)

    annotations = []
    for annotation_set in annotation_sets:
        annotation_name = annotation_set.attrib["Name"]
        sem_annotation = Annotation(annotation_name)
        for annotation in annotation_set:
            lb = nodes[int(annotation.attrib["StartNode"])]
            ub = nodes[int(annotation.attrib["EndNode"])]
            sem_annotation.append(Tag(lb, ub, annotation.attrib["Type"]))
        document.add_annotation(sem_annotation)

    return document
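A usage sketch (hypothetical file name; assumes `data` is the root element of a GATE XML export, parsed here with the standard library):

import xml.etree.ElementTree as ET

root = ET.parse("document.gate.xml").getroot()
document = gate_data(root, name="document")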
Example #7
def detect_abbreviations(document, field):
    content = document.content
    word_spans = document.segmentation("tokens")
    if document.segmentation("sentences") is not None:
        sentence_spans = document.segmentation("sentences").spans
        sentence_spans_ref = document.segmentation(
            "sentences").get_reference_spans()
    else:
        # fall back to one sentence covering everything; sentence spans
        # are over token indices, reference spans over characters
        sentence_spans = [Span(0, len(word_spans))]
        sentence_spans_ref = [Span(0, len(document.content))]
    tokens = [content[span.lb:span.ub] for span in word_spans]
    annotations = document.annotation(field).get_reference_annotations()

    # count isolated abbreviation candidates and remember where they occur
    counts = {}
    positions = {}
    for i, token in enumerate(tokens):
        if abbrev_candidate(token) and len(token) > 1 and not (
            (i > 1 and abbrev_candidate(tokens[i - 1])) or
            (i < len(tokens) - 1 and abbrev_candidate(tokens[i + 1]))):
            if token not in counts:
                counts[token] = 0
                positions[token] = []
            counts[token] += 1
            positions[token].append(i)
    position2sentence = {}
    for token, indices in positions.items():
        for index in indices:
            for i, span in enumerate(sentence_spans):
                if span.lb <= index and span.ub >= index:
                    position2sentence[index] = sentence_spans_ref[i]

    # map each expansion regex to the annotation types it co-occurs with
    reg2type = {}
    for key, val in counts.items():
        all_solutions = []
        for position in positions[key]:
            span = position2sentence[position]
            word_span = word_spans[position]
            lb = span.lb
            ub = word_span.lb
            solutions = longest_common_substring(content[lb:ub],
                                                 tokens[position],
                                                 casesensitive=False)
            if solutions == []:
                solutions = longest_common_substring(normalize(content[lb:ub]),
                                                     tokens[position],
                                                     casesensitive=False)
            solutions = [
                solution for solution in solutions
                if len(solution) == len(tokens[position])
            ]
            if len(solutions) > 0:
                all_solutions.extend([[(x + lb, y + lb) for (x, y) in solution]
                                      for solution in solutions])
        if len(all_solutions) > 0:
            all_solutions.sort(key=lambda x: x[-1][0] - x[0][0])
            best_solution = all_solutions[0]
            lo = best_solution[0][0]
            hi = best_solution[-1][0]
            lo_tokens = [
                tok for tok in word_spans if tok.lb <= lo and tok.ub > lo
            ]
            hi_tokens = [
                tok for tok in word_spans if tok.lb <= hi and tok.ub > hi
            ]
            abbrev_annots = []
            for position in positions[key]:
                span = word_spans[position]
                abbrev_annots.extend([
                    annotation for annotation in annotations
                    if annotation.lb == span.lb and annotation.ub == span.ub
                ])
            try:
                toks = tokens_from_bounds(document, lo_tokens[0].lb,
                                          hi_tokens[0].ub)
                reg = tokens2regex(toks, re.U + re.I)
                for match in reg.finditer(content):
                    annots = [
                        annotation for annotation in annotations
                        if ((annotation.lb <= match.start()
                             and match.start() <= annotation.ub) or (
                                 annotation.lb <= match.end()
                                 and match.end() <= annotation.ub))
                    ]
                    if len(annots) > 0:
                        annot = annots[0]
                        new_toks = tokens_from_bounds(
                            document, min(annot.lb, match.start()),
                            max(annot.ub, match.end()))
                        new_reg = tokens2regex(new_toks, re.U + re.I)
                        if new_reg.pattern not in reg2type:
                            reg2type[new_reg.pattern] = []
                        reg2type[new_reg.pattern].append(annots[0].value)
                        if abbrev_annots == []:
                            abbrev_reg = tokens2regex([key], re.U)
                            if abbrev_reg.pattern not in reg2type:
                                reg2type[abbrev_reg.pattern] = []
                            reg2type[abbrev_reg.pattern].append(
                                annots[0].value)
                if len(abbrev_annots) > 0:
                    tag = abbrev_annots[0]
                    new_reg = tokens2regex(toks, re.U + re.I)
                    if new_reg.pattern not in reg2type:
                        reg2type[new_reg.pattern] = []
                    reg2type[new_reg.pattern].append(tag.value)
            except IndexError:
                pass

    # tag every match of each regex with its most frequent type
    new_tags = []
    for v in reg2type.keys():
        type_counts = sorted([(the_type, reg2type[v].count(the_type))
                              for the_type in set(reg2type[v])],
                             key=lambda x: (-x[-1], x[0]))
        fav_type = type_counts[0][0]
        regexp = re.compile(v, re.U + re.I * (u" " in v))
        for match in regexp.finditer(content):
            lo_tok = word_spans.spans.index(
                [t for t in word_spans if t.lb == match.start()][0])
            hi_tok = word_spans.spans.index(
                [t for t in word_spans if t.ub == match.end()][0]) + 1
            new_tags.append(Tag(lo_tok, hi_tok, fav_type))

    to_remove_tags = []
    for new_tag in new_tags:
        to_remove_tags.extend([
            ann for ann in document.annotation(field) if new_tag.lb <= ann.lb
            and ann.ub <= new_tag.ub and ann.value == new_tag.value
        ])
    for to_remove_tag in to_remove_tags:
        try:
            document.annotation(field)._annotations.remove(to_remove_tag)
        except ValueError:
            pass

    # project the new tags back onto the corpus as BIO labels
    all_tags = [[token[field] for token in sent]
                for sent in document.corpus.sentences]
    new_tags.sort(key=lambda x: (x.lb, -x.ub))
    for new_tag in new_tags:
        nth_word = 0
        nth_sent = 0
        sents = document.corpus.sentences
        while nth_word + len(sents[nth_sent]) - 1 < new_tag.lb:
            nth_word += len(sents[nth_sent])
            nth_sent += 1
        start = new_tag.lb - nth_word
        end = new_tag.ub - nth_word
        document.corpus.sentences[nth_sent][start][
            field] = u"B-%s" % new_tag.value
        all_tags[nth_sent][start] = u"B-%s" % new_tag.value
        for index in range(start + 1, end):
            document.corpus.sentences[nth_sent][index][
                field] = u"I-%s" % new_tag.value
            all_tags[nth_sent][index] = u"I-%s" % new_tag.value

    document.add_annotation_from_tags(all_tags, field, field)
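A usage sketch (hypothetical document; assumes it carries "tokens" and "sentences" segmentations, a CoNLL-style corpus and a BIO-tagged field named "NER"):

detect_abbreviations(document, "NER")
# the document's corpus and annotation set are updated in place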
Example #8
    def __call__(self, list2dict, token_entry=None, annot_entry=None, *args, **kwargs):
        l = [u"O" for _ in range(len(list2dict))]  # BIO label per token
        form2entity = self._form2entity
        tmp = self._value._data  # trie of multiword dictionary entries
        length = len(list2dict)
        fst = 0
        lst = -1  # last match found
        cur = 0
        entry = (token_entry if token_entry is not None else self._entry)
        ckey = None  # current key
        entities = []
        while fst < length - 1:
            cont = True
            while cont and (cur < length):
                ckey = list2dict[cur][entry]
                if l[cur] == "O":
                    if NUL in tmp:
                        lst = cur
                    tmp = tmp.get(ckey, {})
                    cont = len(tmp) != 0
                    cur += int(cont)
                else:
                    cont = False

            if NUL in tmp:
                lst = cur

            if lst != -1:
                # longest dictionary match covers tokens [fst, lst)
                form = u" ".join([list2dict[i][entry] for i in range(fst, lst)])
                entities.append(Tag(form2entity[form], fst, lst))
                fst = lst
                cur = fst
            else:
                fst += 1
                cur = fst

            tmp = self._value._data
            lst = -1

        # the last token may be a single-token entry on its own
        if NUL in self._value._data.get(list2dict[-1][entry], []):
            entities.append(
                Tag(form2entity[list2dict[-1][entry]],
                    len(list2dict) - 1, len(list2dict)))

        ne_entry = (annot_entry if annot_entry is not None else self._ne_entry)
        gold = chunk_annotation_from_sentence(list2dict, ne_entry).annotations

        # keep the gold annotation when a match has exactly its bounds
        for i in reversed(range(len(entities))):
            e = entities[i]
            for r in gold:
                if r.lb == e.lb and r.ub == e.ub:
                    del entities[i]
                    break

        # prefer longer dictionary matches over gold annotations they contain
        for i in reversed(range(len(gold))):
            r = gold[i]
            for e in entities:
                if r.lb >= e.lb and r.ub <= e.ub:
                    del gold[i]
                    break

        # write the surviving annotations back as BIO labels
        for r in gold + entities:
            appendice = u"-" + r.value
            l[r.lb] = u"B" + appendice
            for i in range(r.lb + 1, r.ub):
                l[i] = u"I" + appendice

        return l
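A usage sketch with a hypothetical `feature` instance of this class, whose dictionary holds the single multiword form u"New York" mapped to u"LOC", and whose gold field already tags "Paris":

sentence = [
    {u"word": u"New", u"NER": u"O"},
    {u"word": u"York", u"NER": u"O"},
    {u"word": u"and", u"NER": u"O"},
    {u"word": u"Paris", u"NER": u"B-LOC"},
]
labels = feature(sentence, token_entry=u"word", annot_entry=u"NER")
# -> [u"B-LOC", u"I-LOC", u"O", u"B-LOC"]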