def _add_view(lif_obj, json_obj):
    """Append a document-structure view to *lif_obj*.

    The view gets the id "structure" and declares that it contains Title,
    Abstract, Section and Header annotations.

    Note: *json_obj* is accepted for signature compatibility but is not used.
    """
    structure_view = View()
    lif_obj.views.append(structure_view)
    structure_view.id = "structure"
    annotation_types = ("Title", "Abstract", "Section", "Header")
    structure_view.metadata['contains'] = {vocab(t): {} for t in annotation_types}
def generate_lif(txt, vnc):
    """Generate a LIF json file from a text file and a clearwsd output file.

    * txt is a plain text file only with the original text value.
    * vnc (verbnetclass) is a output from clearwsd file (mostly in conll format)

    This function will generate a LIF json file using disambiguation
    annotation encoded in the vnc file, using txt as top-level `text` field.
    """
    # Context managers guarantee the files are closed even on error
    # (the original closed them manually).
    with open(txt, encoding="utf-8") as t:
        raw_text = t.read()
    with open(vnc, encoding="utf-8") as v:
        # Only comment lines ('#'-prefixed) carry annotations.
        annotations = [line for line in v if line.startswith('#')]

    lif_obj = LIF()
    cont_obj = Container()
    cont_obj.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
    cont_obj.payload = lif_obj
    lif_obj.text.value = raw_text

    vnc_view = View()
    lif_obj.views.append(vnc_view)
    vnc_view.id = "verbnettag"
    vnc_view.metadata['contains'] = {vocab('SemanticTag'): {}}

    # Hoisted out of the loop: a properly annotated token field looks
    # like "12[345,678]" where the bracketed pair is a char offset span.
    token_pattern = re.compile(r'\d+\[(\d+),(\d+)\]')
    for annotation in annotations:
        splitted = annotation.split('\t')[0].split()
        oid = splitted[1]
        otoken = splitted[3]
        olemma = " ".join(splitted[4:-1])  # some lemmas have space inside
        olabel = splitted[-1]
        properly_annotated = token_pattern.match(otoken)
        if properly_annotated is None:
            continue
        s, e = map(int, properly_annotated.groups())
        ann = {
            "id": "vnc_" + oid,
            "start": s,
            "end": e,
            "@type": vocab("SemanticTag"),
            "features": {
                "tags": [olabel],
                "type": "VerbNetClass",
                "lemma": olemma,
                "text": raw_text[s:e],
            },
        }
        vnc_view.annotations.append(Annotation(ann))
    cont_obj.write()
def _create_view():
    """Return the "sentences" view produced by generate_sentence_type.py."""
    contains = {
        'http://vocab.lappsgrid.org/Sentence': {
            'producer': 'generate_sentence_type.py'}}
    spec = {
        'id': "sentences",
        'metadata': {'contains': contains},
        'annotations': []}
    return View(json_obj=spec)
def _create_view():
    """Return the "topics" view produced by generate_topics.py.

    The view declares both Markable and SemanticTag annotation types.
    """
    contains = {
        'http://vocab.lappsgrid.org/Markable': {
            'producer': 'generate_topics.py'},
        'http://vocab.lappsgrid.org/SemanticTag': {
            'producer': 'generate_topics.py'}}
    spec = {
        'id': "topics",
        'metadata': {'contains': contains},
        'annotations': []}
    return View(json_obj=spec)
def create_view(identifier, tag, producer):
    """Return an empty View with a single 'contains' declaration.

    identifier -- the id of the new view
    tag -- vocabulary tag name, appended to the lappsgrid vocabulary URL
    producer -- recorded as the producer of annotations of this type
    """
    contains = {'http://vocab.lappsgrid.org/%s' % tag: {'producer': producer}}
    spec = {
        'id': identifier,
        'metadata': {'contains': contains},
        'annotations': []}
    return View(json_obj=spec)
def create_page_view():
    """Return a new View with id "pages" that declares Page annotations."""
    page_view = View()
    page_view.id = "pages"
    page_view.metadata['contains'] = {vocab("Page"): {}}
    return page_view
def _setup(self):
    """Reset identifier state and initialize per-document bookkeeping."""
    Identifiers.reset()
    self.lif = LIF()
    self.view = View('docstruct')
    self.text = StringIO()
    # current character offset into the accumulated text
    self.p = 0
class Converter(object):

    """Converts the JSON from a COVID file into a LIF document."""

    # TODO: add the directory of the sourcefile to the metadata
    # TODO: (this is to destinguish between the licenses)

    def __init__(self, infile, outfile, metadata):
        self.infile = infile
        self.outfile = outfile
        self.doc = CovidDoc(self.infile, metadata)

    def convert(self):
        """Run the full conversion pipeline; skip incomplete documents.

        Output is written to self.outfile by _finish().
        """
        print('Converting', os.path.basename(self.infile))
        if not self.doc.is_complete():
            print('skipping')
            return
        # NOTE: the original wrapped these calls in `with open(self.outfile,
        # 'w') as fh:` but never used fh (the header write that used it is
        # commented out below) -- _finish() writes the file itself, so the
        # redundant open/truncate is removed.
        self._setup()
        self._collect_metadata()
        self._add_abstract()
        self._add_sections()
        self._finish()

    def _setup(self):
        """Reset identifier state and initialize per-document bookkeeping."""
        Identifiers.reset()
        self.p = 0  # current character offset into self.text
        self.lif = LIF()
        self.text = StringIO()
        self.view = View('docstruct')

    def _collect_metadata(self):
        """Copy document metadata (title, ids, year, authors) into the LIF."""
        self.lif.metadata['title'] = self.doc.title
        self.lif.metadata['sha'] = self.doc.id
        self.lif.metadata['pmid'] = self.doc.pmid
        self.lif.metadata['year'] = self.doc.year
        self.lif.metadata['authors'] = []
        for author in self.doc.authors:
            fullname = "%s %s" % (author['first'], author['last'])
            # Keep only names that have both a first and a last part; this
            # filters out short single names, including the deceased sign.
            # The original test was `' ' in fullname`, which always succeeded
            # because the "%s %s" template itself inserts a space; stripping
            # first makes the check meaningful when either part is empty.
            if ' ' in fullname.strip():
                self.lif.metadata['authors'].append(fullname)

    def _add_docelement_anno(self, docelement_type, p1, p2):
        """Add a Section annotation of *docelement_type* spanning [p1, p2)."""
        self.view.add(
            Annotation({
                'id': Identifiers.new_id('de'),
                '@type': 'Section',
                'start': p1,
                'end': p2,
                'features': {
                    'section_type': docelement_type}}))

    def _add_abstract(self):
        # TODO: would like to add the section header
        # TODO: should make sure that the docelement ends not after the newlines
        abstract_p0 = self.p
        for text_str in self.doc.abstract:
            text_str += u"\n\n"
            self.p += len(text_str)
            self.text.write(text_str)
        self._add_docelement_anno('Abstract', abstract_p0, self.p)

    def _add_sections(self):
        # TODO: add section header
        previous_header = None
        section_p0 = self.p
        for header_str, text_str in self.doc.body_text:
            text_str += u"\n\n"
            header_str += u"\n\n"
            self.p += len(text_str)
            if header_str != previous_header:
                # TODO(review): the header is tracked but never written out;
                # the original had a commented-out `fh.write(header_str)` here.
                previous_header = header_str
            self.text.write(text_str)
            self._add_docelement_anno('Paragraph', section_p0, self.p)
            section_p0 = self.p

    def _finish(self):
        """Gather it all up and write output."""
        self.lif.text = Text(json_obj={
            'language': 'en',
            '@value': self.text.getvalue()})
        self.lif.views.append(self.view)
        self.lif.write(self.outfile, pretty=True)