def _add_view(lif_obj, json_obj):
    """Append an empty "structure" view to the views of lif_obj.

    The view's metadata declares that it contains the Title, Abstract,
    Section and Header types, each with an empty property dictionary.
    json_obj is accepted but not used by this function.
    """
    structure_view = View()
    lif_obj.views.append(structure_view)
    structure_view.id = "structure"
    contained_types = ("Title", "Abstract", "Section", "Header")
    structure_view.metadata['contains'] = {
        vocab(type_name): {} for type_name in contained_types
    }
Example #2
0
def generate_lif(txt, vnc):
    """
    Generate a LIF document from a text file and a clearwsd output file.

    * txt is the path of a plain text file holding only the original text.
    * vnc (verbnetclass) is the path of an output file from clearwsd (mostly
      in conll format).

    The content of txt becomes the top-level `text` field. Every line of vnc
    that starts with '#' and carries a token of the form `N[start,end]` is
    turned into a SemanticTag annotation in a view with id "verbnettag".
    Lines whose token does not have that form, or that have fewer than four
    whitespace-separated fields, are skipped. The resulting container is
    emitted through its `write()` method; nothing is returned.
    """
    # Context managers make sure both files are closed even if reading fails.
    with open(txt, encoding="utf-8") as text_file:
        raw_text = text_file.read()
    with open(vnc, encoding="utf-8") as vnc_file:
        annotations = [line for line in vnc_file if line.startswith('#')]

    lif_obj = LIF()
    cont_obj = Container()
    cont_obj.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
    cont_obj.payload = lif_obj
    lif_obj.text.value = raw_text

    vnc_view = View()
    lif_obj.views.append(vnc_view)
    vnc_view.id = "verbnettag"
    vnc_view.metadata['contains'] = {vocab('SemanticTag'): {}}

    # Token column looks like "<index>[<start>,<end>]"; compiled once for the loop.
    span_pattern = re.compile(r'\d+\[(\d+),(\d+)\]')
    for annotation in annotations:
        # Only the part before the first tab holds the annotation fields.
        fields = annotation.split('\t')[0].split()
        if len(fields) < 4:
            # Not enough columns for id, sentence, token and label; skip the
            # line instead of failing with an IndexError.
            continue

        oid = fields[1]
        otoken = fields[3]
        olemma = " ".join(fields[4:-1])  # some lemmas have space inside
        olabel = fields[-1]
        properly_annotated = span_pattern.match(otoken)
        if properly_annotated is None:
            continue
        start, end = map(int, properly_annotated.groups())
        ann_obj = Annotation({
            "id": "vnc_" + oid,
            "start": start,
            "end": end,
            "@type": vocab("SemanticTag"),
            "features": {
                "tags": [olabel],
                "type": "VerbNetClass",
                "lemma": olemma,
                "text": raw_text[start:end],
            },
        })
        vnc_view.annotations.append(ann_obj)
    cont_obj.write()
def _create_view():
    """Return a new, empty view with id "sentences".

    The view metadata declares Sentence annotations produced by
    generate_sentence_type.py; the annotation list starts out empty.
    """
    sentence_info = {'producer': 'generate_sentence_type.py'}
    contains = {'http://vocab.lappsgrid.org/Sentence': sentence_info}
    spec = {
        'id': "sentences",
        'metadata': {'contains': contains},
        'annotations': [],
    }
    return View(json_obj=spec)
def _create_view():
    """Return a new, empty view with id "topics".

    The view metadata declares Markable and SemanticTag annotations, both
    produced by generate_topics.py; the annotation list starts out empty.
    """
    contains = {
        'http://vocab.lappsgrid.org/Markable': {'producer': 'generate_topics.py'},
        'http://vocab.lappsgrid.org/SemanticTag': {'producer': 'generate_topics.py'},
    }
    spec = {
        'id': "topics",
        'metadata': {'contains': contains},
        'annotations': [],
    }
    return View(json_obj=spec)
Example #5
0
def create_view(identifier, tag, producer):
    """Return a new, empty view that declares one annotation type.

    identifier becomes the view id, tag is appended to the LAPPS vocabulary
    base URL to form the annotation type, and producer is recorded as the
    producer of that type. The annotation list starts out empty.
    """
    contains = {
        'http://vocab.lappsgrid.org/%s' % tag: {'producer': producer},
    }
    spec = dict(
        id=identifier,
        metadata={'contains': contains},
        annotations=[],
    )
    return View(json_obj=spec)
Example #6
0
def create_page_view():
    """Return a new view with id "pages" that declares the Page type."""
    page_view = View()
    page_view.id = "pages"
    contained = {vocab("Page"): {}}
    page_view.metadata['contains'] = contained
    return page_view
Example #7
0
 def _setup(self):
     """Reset the identifiers and start with fresh, empty conversion state."""
     Identifiers.reset()
     # Fresh containers: the LIF object, the text buffer and the 'docstruct' view.
     self.lif = LIF()
     self.text = StringIO()
     self.view = View('docstruct')
     # Position counter starts at zero (presumably the offset into the text
     # buffer; confirm against the methods that advance it).
     self.p = 0
Example #8
0
class Converter(object):
    """Converts the JSON from a COVID file into a LIF document."""

    # TODO: add the directory of the sourcefile to the metadata
    # TODO: (this is to distinguish between the licenses)

    def __init__(self, infile, outfile, metadata):
        """Store the input and output paths and load the source document.

        infile and metadata are handed to CovidDoc; outfile is the path the
        LIF document is written to when convert() finishes.
        """
        self.infile = infile
        self.outfile = outfile
        self.doc = CovidDoc(self.infile, metadata)

    def convert(self):
        """Convert the source document and write the result to self.outfile.

        Prints a progress line for the input file. When the document reports
        that it is not complete, 'skipping' is printed and nothing is written.
        """
        print('Converting', os.path.basename(self.infile))
        if not self.doc.is_complete():
            print('skipping')
            return
        # No file handle is opened here: _finish() hands the output path to
        # self.lif.write(), so opening it as well would only truncate the
        # file and keep an unused handle around.
        self._setup()
        self._collect_metadata()
        self._add_abstract()
        self._add_sections()
        self._finish()

    def _setup(self):
        """Reset the identifiers and create fresh conversion state."""
        Identifiers.reset()
        self.p = 0  # running character offset into the collected text
        self.lif = LIF()
        self.text = StringIO()
        self.view = View('docstruct')

    def _collect_metadata(self):
        """Copy title, sha, pmid, year and author names into the LIF metadata.

        An author is only included when both the first and the last name are
        non-blank, which leaves out single names (including a lone deceased
        sign). Names are stored as "<first> <last>".
        """
        self.lif.metadata['title'] = self.doc.title
        self.lif.metadata['sha'] = self.doc.id
        self.lif.metadata['pmid'] = self.doc.pmid
        self.lif.metadata['year'] = self.doc.year
        self.lif.metadata['authors'] = []
        for author in self.doc.authors:
            first, last = author['first'], author['last']
            # Testing for a space in the formatted name always succeeds since
            # the format itself inserts one, so check the two parts instead.
            if all(part and str(part).strip() for part in (first, last)):
                self.lif.metadata['authors'].append("%s %s" % (first, last))

    def _add_docelement_anno(self, docelement_type, p1, p2):
        """Add a Section annotation spanning p1 to p2 to the view.

        docelement_type is stored in the 'section_type' feature.
        """
        self.view.add(
            Annotation({
                'id': Identifiers.new_id('de'),
                '@type': 'Section',
                'start': p1,
                'end': p2,
                'features': {
                    'section_type': docelement_type
                }
            }))

    def _add_abstract(self):
        """Append the abstract paragraphs to the text and annotate the span.

        Each paragraph is followed by two newlines. One 'Abstract' element is
        added that covers everything written here, trailing newlines included.
        """
        # TODO: would like to add the section header
        # TODO: should make sure that the docelement ends not after the newlines
        abstract_p0 = self.p
        for text_str in self.doc.abstract:
            text_str += u"\n\n"
            self.p += len(text_str)
            self.text.write(text_str)
        self._add_docelement_anno('Abstract', abstract_p0, self.p)

    def _add_sections(self):
        """Append the body paragraphs to the text, one annotation for each.

        Each paragraph is followed by two newlines and gets its own
        'Paragraph' element. Section headers are not written to the text.
        """
        # TODO: add section header
        section_p0 = self.p
        for _header_str, text_str in self.doc.body_text:
            text_str += u"\n\n"
            self.p += len(text_str)
            self.text.write(text_str)
            self._add_docelement_anno('Paragraph', section_p0, self.p)
            section_p0 = self.p

    def _finish(self):
        """Gather it all up and write output."""
        self.lif.text = Text(json_obj={
            'language': 'en',
            '@value': self.text.getvalue()
        })
        self.lif.views.append(self.view)
        self.lif.write(self.outfile, pretty=True)