コード例 #1
0
def _add_annotation(annotations, text_value, annotation_type, text, offset):
    if text is None:
        return offset
    prefix = None
    if annotation_type in ('Title', 'Abstract'):
        prefix = annotation_type.upper()
    if prefix is not None:
        anno = {
            "id": IdentifierFactory.next_id('Header'),
            "@type": vocab('Header'),
            "start": offset,
            "end": offset + len(prefix)
        }
        annotations.append(Annotation(anno))
        text_value.write(prefix + u"\n\n")
        offset += len(prefix) + 2
    anno = {
        "id": IdentifierFactory.next_id(annotation_type),
        "@type": vocab(annotation_type),
        "start": offset,
        "end": offset + len(text)
    }
    annotations.append(Annotation(anno))
    text_value.write(text + u"\n\n")
    return offset + len(text) + 2
コード例 #2
0
 def _add_technologies(self):
     """Takes the technology ontology and tries to add each element to the
     technologies index of this document. Add only if the technology term
     occurs in the text. This is done rather inefficiently by searching the
     entire text # for each technology, but on a 30K LIF document this takes
     less than # 0.01 seconds for 100 technology terms, so we can live with
     this."""
     technologies = self.annotations.technologies
     if technologies:
         next_id = max([int(a.id[1:])
                        for a in technologies.annotations]) + 1
         # print len(technologies.texts), len(technologies.annotations)
         for term in self.ontology.technologies:
             searchterm = r'\b%s\b' % term
             matches = list(
                 re.finditer(searchterm, self.annotations.text, flags=re.I))
             for match in matches:
                 json_obj = {
                     "id": "t%d" % next_id,
                     "@type": 'http://vocab.lappsgrid.org/Technology',
                     "start": match.start(),
                     "end": match.end()
                 }
                 next_id += 1
                 anno = Annotation(json_obj)
                 anno.text = term
                 technologies.add(anno)
コード例 #3
0
def markable_annotation(lif_obj):
    """Return a Markable annotation with id "m1" spanning the whole text.

    The span runs from offset 0 to the length of ``lif_obj.text.value``.
    """
    text_length = len(lif_obj.text.value)
    properties = {
        "id": "m1",
        "@type": 'http://vocab.lappsgrid.org/Markable',
        "start": 0,
        "end": text_length
    }
    return Annotation(properties)
コード例 #4
0
def topic_annotation(topic, topic_id, lemmas):
    """Return a SemanticTag annotation describing one topic.

    :param topic: indexable pair; item 0 is stored as the topic id and item 1
        is stored as the score, formatted with four decimals
    :param topic_id: integer used to build the annotation id ("t<topic_id>")
    :param lemmas: value stored as-is under the "topic_name" feature
    :return: an Annotation whose target is the annotation with id "m1"
    """
    identifier = "t{:d}".format(topic_id)
    features = {
        "type": "gensim-topic",
        "topic_id": topic[0],
        "topic_score": "{:.04f}".format(topic[1]),
        "topic_name": lemmas
    }
    properties = {
        "id": identifier,
        "@type": 'http://vocab.lappsgrid.org/SemanticTag',
        "target": "m1",
        "features": features
    }
    return Annotation(properties)
コード例 #5
0
 def _add_docelement_anno(self, docelement_type, p1, p2):
     """Add a 'Section' annotation for the span p1..p2 to ``self.view``.

     The annotation gets a fresh id from ``Identifiers.new_id('de')`` and
     carries ``docelement_type`` as its 'section_type' feature.
     """
     add_to_view = self.view.add
     properties = {
         'id': Identifiers.new_id('de'),
         '@type': 'Section',
         'start': p1,
         'end': p2,
         'features': {'section_type': docelement_type}
     }
     add_to_view(Annotation(properties))
コード例 #6
0
def _create_annotation(lif, tokens, w, term, i, length, ttype):
    """Return a Technology annotation for a match found in the token list.

    The start and end offsets come from ``_get_match_information``; its third
    return value is not used here. When DEBUG is set, the offsets and ``w``
    are written to OUT as one tab-separated line.

    :param w: stored as the "term" feature
    :param term: stored as the "term_normalized" feature, only if it differs
        from ``w``
    :param ttype: stored as the "type" feature
    """
    start, end, _ = _get_match_information(lif, tokens, i, length)
    if DEBUG:
        OUT.write("%s\t%s\t%s\n" % (start, end, w))
    identifier = "t%d" % TECHNOLOGIES.get_next_id()
    features = {"term": w, "type": ttype}
    if w != term:
        features['term_normalized'] = term
    return Annotation({
        "id": identifier,
        "@type": 'http://vocab.lappsgrid.org/Technology',
        "start": start,
        "end": end,
        "features": features
    })
コード例 #7
0
 def as_annotation(self):
     """Return this object as a 'Page' annotation.

     The id is "p" followed by ``self.number`` and the span is
     ``self.start``..``self.end``. The header and footer are added as
     features only when they are not None.
     """
     features = {}
     properties = {
         "id": "p%s" % self.number,
         "@type": vocab('Page'),
         "start": self.start,
         "end": self.end,
         "features": features
     }
     for name in ('header', 'footer'):
         value = getattr(self, name)
         if value is not None:
             features[name] = value
     return Annotation(properties)
コード例 #8
0
def generate_lif(txt, vnc):
    """
    Generate LIF output from a text file and its clearwsd annotations.

    * txt is the path of a plain text file only with the original text value.
    * vnc (verbnetclass) is the path of an output file from clearwsd (mostly
      in conll format)
    This function builds a LIF object using the disambiguation annotation
    encoded in the vnc file, using txt as top-level `text` field, wraps it in
    a Container and calls the container's write() method.

    Only lines of the vnc file that start with '#' are used. Lines with too
    few fields, or whose token field does not have the form
    `<number>[<start>,<end>]`, are skipped.
    """
    # Context managers make sure both files are closed even if reading fails.
    with open(txt, encoding="utf-8") as t, open(vnc, encoding="utf-8") as v:
        raw_text = t.read()
        annotations = [line for line in v if line.startswith('#')]

    lif_obj = LIF()
    cont_obj = Container()
    cont_obj.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
    cont_obj.payload = lif_obj
    lif_obj.text.value = raw_text

    vnc_view = View()
    lif_obj.views.append(vnc_view)
    vnc_view.id = "verbnettag"
    vnc_view.metadata['contains'] = {vocab('SemanticTag'): {}}

    token_span = re.compile(r'\d+\[(\d+),(\d+)\]')
    for annotation in annotations:
        # Only the part before the first tab holds the fields read here:
        # marker, id, sentence, token, lemma word(s)..., label
        splitted = annotation.split('\t')[0].split()
        if len(splitted) < 4:
            # Not enough fields for an id and a token span; skip the line
            # like any other line that is not properly annotated.
            continue

        oid = splitted[1]
        otoken = splitted[3]
        olemma = " ".join(splitted[4:-1])  # some lemmas have space inside
        olabel = splitted[-1]
        properly_annotated = token_span.match(otoken)
        if properly_annotated is None:
            continue
        s, e = map(int, properly_annotated.groups())
        ann = {
            "id": "vnc_" + oid,
            "start": s,
            "end": e,
            "@type": vocab("SemanticTag"),
            "features": {
                "tags": [olabel],
                "type": "VerbNetClass",
                "lemma": olemma,
                "text": raw_text[s:e]
            }
        }
        vnc_view.annotations.append(Annotation(ann))
    cont_obj.write()
コード例 #9
0
def fix_view(identifier, view):
    """Repair a view whose ``id`` attribute holds a mapping instead of an id.

    The 'annotations' entry is taken out of ``view.id``, ``view.id`` is
    replaced by ``identifier``, and ``view.annotations`` is rebuilt as a list
    with one Annotation per entry.
    """
    raw_annotations = view.id['annotations']
    view.id = identifier
    view.annotations = []
    view.annotations.extend(Annotation(raw) for raw in raw_annotations)