def generate_metadata(data_dir, fname):
    """Collect author and year metadata for one document and write a LIF file.

    The directory part of ``fname`` names the document. The source LIF is
    read from ``<data_dir>/lif/<subdir>/``, the NER output from
    ``<data_dir>/ner/<subdir>/`` and the result is written under
    ``<data_dir>/mta/<subdir>/``.
    """
    doc_dir = os.path.split(fname)[0]
    source_path = os.path.join(
        data_dir, 'lif', doc_dir, "tesseract-300dpi-20p.lif")
    ner_path = os.path.join(data_dir, 'ner', doc_dir, "%s.ner.lif" % doc_dir)
    target_path = os.path.join(data_dir, 'mta', doc_dir, "%s.mta.lif" % doc_dir)
    ensure_directory(target_path)

    lif = Container(source_path).payload
    lif_ner = Container(ner_path).payload

    # The output object is a copy of the source with the text and views
    # stripped; only a pointer to the source file is kept.
    lif_mta = LIF(json_object=lif.as_json())
    lif_mta.text.value = None
    lif_mta.text.fname = source_path
    lif_mta.views = []

    # NOTE(review): the metadata is stored on `lif`, while `lif_mta` is the
    # object that is written out. This only reaches the output file if both
    # objects share the same metadata dictionary -- confirm in lif.py.
    lif.metadata["authors"] = []
    lif.metadata["year"] = None

    pages = lif.get_view("pages")
    entities = lif_ner.get_view('v2')
    window = _get_window(pages)

    lif.metadata["authors"] = _get_authors(lif, entities, window)
    lif.metadata["year"] = _get_year(entities, window)

    lif_mta.write(fname=target_path, pretty=True)
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    """Write a LIF file with the LDA topics of one document.

    Reads ``<data_dir>/lif/<fname>`` and writes the topics view to
    ``<data_dir>/top/`` using ``fname`` with its last five characters
    replaced by ``.lif``. A missing input file is reported with a warning
    and otherwise ignored.
    """
    source_path = os.path.join(data_dir, 'lif', fname)
    target_path = os.path.join(data_dir, 'top', fname[:-5] + '.lif')
    ensure_directory(target_path)

    try:
        lif_in = LIF(source_path)
    except FileNotFoundError:
        print("Warning: file '%s' does not exist" % source_path)
        return

    lif_out = LIF(json_object=lif_in.as_json())
    # Text and metadata are dropped to save space, they are available from
    # the source LIF file anyway.
    lif_out.text.value = None
    lif_out.text.source = source_path
    lif_out.metadata = {}

    topics_view = _create_view()
    lif_out.views = [topics_view]
    topics_view.annotations.append(markable_annotation(lif_in))

    bow = dictionary.doc2bow(prepare_text_for_lda(lif_in.text.value))
    # Each document topic is a tuple of the topic identifier and its score;
    # annotations are numbered from 1.
    for topic_id, topic in enumerate(lda.get_document_topics(bow), start=1):
        lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
        topics_view.annotations.append(
            topic_annotation(topic, topic_id, lemmas))

    lif_out.write(fname=target_path, pretty=True)
def convert_file(self, fname, infile, outfile):
    """Write a text-less LIF file holding the relations recorded for fname.

    The relations are looked up in ``self.inverted_rels`` under ``fname``
    and stored in the metadata as a mapping from each relation object to the
    list of its subjects. ``infile`` is only printed.
    """
    print(infile)
    lif = LIF()
    lif.text.value = None

    relations = {}
    for relobj, subj in self.inverted_rels.get(fname, []):
        relations.setdefault(relobj, []).append(subj)
    lif.metadata['relations'] = relations

    lif.write(outfile, pretty=True)
def lookup_technologies(data_dir, fname):
    """Run the technology lookup over the tokens of one document.

    The tagged input is read from ``<data_dir>/pos/<subdir>/`` and the
    result, a LIF object with a single technology view, is written to
    ``<data_dir>/tex/<subdir>/``, where ``subdir`` is the directory part of
    ``fname``.
    """
    doc_dir = os.path.split(fname)[0]
    source_path = os.path.join(data_dir, 'pos', doc_dir, "%s.pos.lif" % doc_dir)
    target_path = os.path.join(data_dir, 'tex', doc_dir, "%s.lup.lif" % doc_dir)
    ensure_directory(target_path)

    lif = Container(source_path).payload
    lif_tex = LIF(json_object=lif.as_json())

    tex_view = create_view('tex', 'Technology', 'dtriac-pipeline:lookup.py')
    lif_tex.views = [tex_view]

    # Only the token annotations of the tagger view are handed to the lookup.
    tokens = []
    for annotation in lif.get_view('v2').annotations:
        if annotation.type.endswith('Token'):
            tokens.append(annotation)

    _lookup_technologies_in_tokens(lif, tokens, tex_view)
    lif_tex.write(fname=target_path, pretty=True)
def create_lif_file(src_file, lif_file, test=False):
    """Convert a paged text file into a LIF container with a pages view.

    Lines are collected into a page until a line starting with the page
    marker character is seen; that line is parsed by the page, the page text
    is added to the document text and the page annotation to the view. With
    ``test`` set, the resulting file is handed to ``test_lif_file``.
    """
    HEADER_FILE.write("\n%s\n\n" % src_file)
    FOOTER_FILE.write("\n%s\n\n" % src_file)

    with open(src_file, encoding='utf8') as fh_in, \
            open(lif_file, 'w', encoding='utf8') as fh_out:
        lif_obj = LIF()
        page_view = create_page_view()
        lif_obj.views.append(page_view)

        buffer = StringIO()
        current = Page(0)
        for line in fh_in:
            if not line.startswith(u"\U0001F4C3"):
                current.add(line)
                continue
            # The marker line closes the current page; the next page starts
            # at the offset where this one ended.
            current.parse(line)
            buffer.write(current.text)
            page_view.annotations.append(current.as_annotation())
            current = Page(current.end)

        lif_obj.text.value = buffer.getvalue()
        serialized = json.dumps(create_container(lif_obj).as_json(), indent=4)
        fh_out.write(serialized)

    if test:
        test_lif_file(lif_file)
def __init__(self, fname, data_dir, lif_file, mta_file, top_file, ner_file, sen_file, tex_file, wik_file):
    """Build a single LIF object with all relevant annotations.

    The document identifier is the directory part of ``fname`` as an
    integer. The annotations themselves are stored in the Annotations object
    in ``self.annotations``.
    """
    self.id = int(os.path.dirname(fname))
    self.fname = fname
    self.data_dir = data_dir

    self.lif = Container(lif_file).payload
    self.meta = LIF(mta_file)
    self.wikis = LIF(wik_file).metadata['wikified_es']
    self._add_views(ner_file, sen_file, tex_file, top_file)

    metadata = self.lif.metadata
    metadata["filename"] = self.fname
    metadata["year"] = self._get_year()

    text = self.lif.text.value
    self.annotations = Annotations(self.id, fname, doc=self, text=text)
    self.annotations.text = self.lif.text.value

    self._collect_allowed_offsets()
    self._collect_annotations()
def generate_sentence_types(data_dir, fname):
    """Classify the sentences of one document and write them to a LIF file.

    Sentences come from the 'v2' view of the splitter output under
    ``<data_dir>/spl/<subdir>/``. Each one gets a 'type' feature of 'crap'
    or 'normal' and is added to a new view that is written under
    ``<data_dir>/sen/<subdir>/``. With DEBUG on, a trace goes to SENTS.
    """
    doc_dir = os.path.split(fname)[0]
    lif_path = os.path.join(
        data_dir, 'lif', doc_dir, "tesseract-300dpi-20p.lif")
    spl_path = os.path.join(data_dir, 'spl', doc_dir, "%s.spl.lif" % doc_dir)
    sen_path = os.path.join(data_dir, 'sen', doc_dir, "%s.sen.lif" % doc_dir)
    ensure_directory(sen_path)

    if DEBUG:
        rule = '-' * 100
        SENTS.write(">>> %s\n>>> %s\n>>> %s\n\n" % (rule, fname, rule))

    lif = Container(lif_path).payload
    lif_spl = Container(spl_path).payload
    lif_sen = LIF(json_object=lif.as_json())

    source_view = lif_spl.get_view('v2')
    target_view = _create_view()
    lif_sen.views = [target_view]

    good_sentences = bad_sentences = 0
    for anno in source_view.annotations:
        # NOTE(review): only sentence annotations are copied to the new view;
        # the nesting was reconstructed from a flattened source -- confirm.
        if not anno.type.endswith('Sentence'):
            continue
        sc = SentenceClassifier(lif, anno, WORDS)
        if sc.is_crap():
            if DEBUG:
                SENTS.write("---- %f\n%s\n\n" % (sc.ratio, repr(sc.text)))
            anno.features['type'] = 'crap'
            bad_sentences += 1
        else:
            if DEBUG:
                SENTS.write("++++ %f\n%s\n\n" % (sc.ratio, repr(sc.text)))
            anno.features['type'] = 'normal'
            good_sentences += 1
        target_view.annotations.append(anno)

    if DEBUG:
        SENTS.write(
            "\nTOTAL GOOD = {:d}\nTOTAL BAD = {:d}\n\n\n".format(
                good_sentences, bad_sentences))

    lif_sen.write(fname=sen_path, pretty=True)
def _add_view(self, identifier, fname, view_rank):
    """Take one view from the file fname and add it to this document's LIF.

    The file is first read as a Container; if that raises a KeyError it is
    read as a bare LIF object instead, because some files contain LIF
    objects and others contain Containers with LIF embedded. The view is
    picked by its position ``view_rank`` in the view list (the first or the
    second, depending on how the processor for those data was set up) and
    its id is replaced by ``identifier``.
    """
    try:
        selected = Container(fname).payload.views[view_rank]
    except KeyError:
        selected = LIF(fname).views[view_rank]
    selected.id = identifier
    self.lif.views.append(selected)
def generate_lif(txt, vnc):
    """Generate a LIF container from a text file and a clearwsd output file.

    * txt is a plain text file only with the original text value.
    * vnc (verbnetclass) is an output file from clearwsd (mostly in conll
      format).

    The disambiguation annotations encoded in the '#' lines of the vnc file
    are turned into SemanticTag annotations in a view named "verbnettag",
    using the content of txt as the top-level `text` field. The container is
    emitted with its own ``write()`` method.

    Lines whose token field does not look like ``N[start,end]``, or that
    have too few fields to carry a token at all, are skipped.
    """
    # Both files are opened up front, as before, but the context manager now
    # guarantees that they are closed even when reading fails.
    with open(txt, encoding="utf-8") as t, open(vnc, encoding="utf-8") as v:
        raw_text = t.read()
        annotations = [line for line in v if line.startswith('#')]

    lif_obj = LIF()
    cont_obj = Container()
    cont_obj.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
    cont_obj.payload = lif_obj
    lif_obj.text.value = raw_text

    vnc_view = View()
    lif_obj.views.append(vnc_view)
    vnc_view.id = "verbnettag"
    vnc_view.metadata['contains'] = {vocab('SemanticTag'): {}}

    token_pattern = re.compile(r'\d+\[(\d+),(\d+)\]')
    for annotation in annotations:
        # Only the first tab-separated column is used; its whitespace
        # separated fields are: marker, id, sentence, token, lemma..., label.
        splitted = annotation.split('\t')[0].split()
        if len(splitted) < 4:
            # Too short to hold a token field; this used to raise IndexError.
            continue
        oid = splitted[1]
        otoken = splitted[3]
        olemma = " ".join(splitted[4:-1])  # some lemmas have space inside
        olabel = splitted[-1]

        properly_annotated = token_pattern.match(otoken)
        if properly_annotated is None:
            continue
        s, e = map(int, properly_annotated.groups())

        ann = {
            "id": "vnc_" + oid,
            "start": s,
            "end": e,
            "@type": vocab("SemanticTag"),
            "features": {
                "tags": [olabel],
                "type": "VerbNetClass",
                "lemma": olemma,
                "text": raw_text[s:e],
            },
        }
        vnc_view.annotations.append(Annotation(ann))

    cont_obj.write()
def _add_view(self, identifier, fname, view_id):
    """Take one view from the file fname and add it to this document's LIF.

    The view is selected by ``view_id``, an index in the view list of the
    loaded object; its id is replaced by ``identifier`` before it is added
    to the list of views.
    """
    # Some files contain LIF objects and others contain Containers with LIF
    # embedded. The view we are looking for is the first or second, depending
    # on how the processor for those data was set up.
    try:
        selected = Container(fname).payload.views[view_id]
    except KeyError:
        # Raised when a discriminator attribute is requested from a file
        # that holds a bare LIF object.
        selected = LIF(fname).views[view_id]
    selected.id = identifier
    self.lif.views.append(selected)
def __init__(self, fname, data_dir, lif_file, top_file, har_file):
    """Build a single LIF object with all relevant annotations.

    Three LIF files are loaded (document, topics and relations), the first
    topics view is appended to the document's views, and the authors, topics
    and relations are collected into the Annotations object stored in
    ``self.annotations``.
    """
    self.id = fname
    self.fname = fname
    self.data_dir = data_dir

    self.lif = LIF(json_file=lif_file)
    self.top = LIF(json_file=top_file)
    self.har = LIF(json_file=har_file)

    # NOTE: no idea why this was needed
    # TODO: there is an error in lif.py in line 80 where the json object is
    # handed in as the id
    doc_view = self.lif.views[0]
    top_view = self.top.views[0]
    fix_view('doc', doc_view)
    fix_view('top', top_view)
    self.lif.views.append(self.top.views[0])

    text = self.lif.text.value
    self.annotations = Annotations(self.id, fname, doc=self, text=text)
    self.annotations.text = self.lif.text.value

    self._collect_authors()
    self._collect_topics()
    self._collect_relations()
def generate_sentence_types(ttk, sen, words):
    """Classify the sentences of all LIF files in a directory.

    Every file in ``ttk`` whose name ends in '.lif' is loaded, each sentence
    annotation in its 'v1' view is given a 'type' feature of 'crap' or
    'normal', and a LIF file with just the resulting sentences view is
    written under the same name to ``sen``. ``words`` is handed on to the
    SentenceClassifier. With DEBUG on, the classified sentences are also
    logged to GOOD and BAD and the counts are printed.
    """
    for fname in os.listdir(ttk):
        if not fname.endswith('.lif'):
            continue
        print("{} ... ".format(os.path.basename(fname)))
        if DEBUG:
            rule = '-' * 100
            GOOD.write(">>> %s\n>>> %s\n>>> %s\n\n" % (rule, fname, rule))
            BAD.write(">>> %s\n>>> %s\n>>> %s\n\n" % (rule, fname, rule))

        fname_in = os.path.join(ttk, fname)
        fname_out = os.path.join(sen, fname)
        lif_in = LIF(fname_in)
        lif_out = LIF(json_object=lif_in.as_json())
        sentences_view = _create_view()
        lif_out.views = [sentences_view]

        good_sentences = 0
        bad_sentences = 0
        for anno in lif_in.get_view('v1').annotations:
            # NOTE(review): only sentence annotations are copied to the new
            # view; the nesting was reconstructed from a flattened source.
            if not anno.type.endswith('Sentence'):
                continue
            sc = SentenceClassifier(lif_in, anno, words)
            if sc.is_crap():
                if DEBUG:
                    BAD.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
                anno.features['type'] = 'crap'
                bad_sentences += 1
            else:
                if DEBUG:
                    GOOD.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
                anno.features['type'] = 'normal'
                good_sentences += 1
            sentences_view.annotations.append(anno)

        if DEBUG:
            print(" (good={:d} bad={:d})".format(good_sentences, bad_sentences))
        lif_out.write(fname=fname_out, pretty=True)
        # A bare `print` is a no-op expression in Python 3; call it so the
        # blank separator line between files is actually emitted.
        print()
def create_lif_file(json_file, lif_file, txt_file, test=False):
    """Convert a JSON source file into a LIF container and a text file.

    The LIF object is filled from the parsed JSON by ``_add_metadata``,
    ``_add_view`` and ``_add_rest``. The container is written to
    ``lif_file`` as indented JSON and the text value of its payload to
    ``txt_file``. With ``test`` set, ``lif_file`` is handed to
    ``test_lif_file``.
    """
    print("Creating {}".format(lif_file))
    with codecs.open(json_file, encoding='utf8') as fh_in, \
            codecs.open(lif_file, 'w', encoding='utf8') as fh_out_lif, \
            codecs.open(txt_file, 'w', encoding='utf8') as fh_out_txt:
        source = json.loads(fh_in.read())

        lif_obj = LIF()
        for fill in (_add_metadata, _add_view, _add_rest):
            fill(lif_obj, source)

        container = Container()
        container.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
        container.payload = lif_obj

        serialized = json.dumps(container.as_json(), indent=4)
        fh_out_lif.write(serialized)
        fh_out_txt.write(container.payload.text.value)

    if test:
        test_lif_file(lif_file)
def _collect_data(data_dir, filelist, start, end):
    """Return the prepared token lists for a slice of the file list.

    Each file name produced by ``elements(filelist, start, end)`` is looked
    up under ``<data_dir>/lif``, its text is prepared for LDA and a few very
    common words are removed. Files that do not exist are skipped silently.
    The total token count is printed before returning.
    """
    # especially the first two occur in most abstracts so let's ignore them
    ignored = {'title', 'abstract', 'result', 'study'}
    documents = []
    for n, fname in elements(filelist, start, end):
        print(" %07d %s" % (n, fname))
        fpath = os.path.join(data_dir, 'lif', fname)
        try:
            lif = LIF(fpath)
            tokens = prepare_text_for_lda(lif.text.value)
            documents.append([w for w in tokens if w not in ignored])
        except FileNotFoundError:
            continue
    token_count = sum(len(doc) for doc in documents)
    print('\nToken count = %d' % token_count)
    return documents
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    """Write a LIF file with the LDA topics of one document.

    The last four characters of ``fname`` are replaced by ``.lif`` to get
    the input under ``<data_dir>/lif`` and the output under
    ``<data_dir>/top``. The output holds a single topics view.
    """
    stem = fname[:-4]
    source_path = os.path.join(data_dir, 'lif', stem + '.lif')
    target_path = os.path.join(data_dir, 'top', stem + '.lif')
    ensure_directory(target_path)

    lif_in = Container(source_path).payload
    lif_out = LIF(json_object=lif_in.as_json())
    # Metadata is dropped to save space, it is in the source LIF file anyway.
    lif_out.metadata = {}

    topics_view = _create_view()
    lif_out.views = [topics_view]
    topics_view.annotations.append(markable_annotation(lif_in))

    bow = dictionary.doc2bow(prepare_text_for_lda(lif_in.text.value))
    # Each document topic is a tuple of the topic identifier and its score;
    # annotations are numbered from 1.
    for topic_id, topic in enumerate(lda.get_document_topics(bow), start=1):
        lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
        topics_view.annotations.append(
            topic_annotation(topic, topic_id, lemmas))

    lif_out.write(fname=target_path, pretty=True)
def generate_topics(lif, top):
    """Write a topics LIF file for every '.lif' file in a directory.

    ``lif`` is the input directory and ``top`` the output directory; output
    files keep the name of their input file. The LDA model and dictionary
    are loaded once and reused for all files.
    """
    lda = load_model()
    topic_idx = dict(lda.print_topics(num_topics=NUM_TOPICS))
    dictionary = load_dictionary()

    for fname in os.listdir(lif):
        if not fname.endswith('.lif'):
            continue
        print("{}".format(os.path.basename(fname)))
        source_path = os.path.join(lif, fname)
        target_path = os.path.join(top, fname)

        lif_in = Container(source_path).payload
        lif_out = LIF(json_object=lif_in.as_json())
        # Metadata is dropped to save space, it is in the source file anyway.
        lif_out.metadata = {}

        topics_view = _create_view()
        lif_out.views = [topics_view]
        topics_view.annotations.append(markable_annotation(lif_in))

        bow = dictionary.doc2bow(prepare_text_for_lda(lif_in.text.value))
        # Each document topic is a tuple of the topic identifier and its
        # score; annotations are numbered from 1 within each file.
        document_topics = lda.get_document_topics(bow)
        for topic_id, topic in enumerate(document_topics, start=1):
            lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
            topics_view.annotations.append(
                topic_annotation(topic, topic_id, lemmas))

        lif_out.write(fname=target_path, pretty=True)
def wikify_lif(in_f, wikifier):
    """Return a view-less copy of the LIF in in_f with wikifier output added.

    The payload of the container in ``in_f`` is copied, its views are
    removed, and whatever ``wikifier.wikify`` returns for the text is stored
    in the metadata under 'wikified_es'.
    """
    source = Container(in_f).payload
    result = LIF(json_object=source.as_json())
    result.views = []
    wikified = wikifier.wikify(result.text.value)
    result.metadata["wikified_es"] = wikified
    return result
class Document(object):
    """A document assembled from its LIF, topics and relations files."""

    def __init__(self, fname, data_dir, lif_file, top_file, har_file):
        """Build a single LIF object with all relevant annotations.

        The annotations themselves are stored in the Annotations object in
        self.annotations.
        """
        self.id = fname
        self.fname = fname
        self.data_dir = data_dir

        self.lif = LIF(json_file=lif_file)
        self.top = LIF(json_file=top_file)
        self.har = LIF(json_file=har_file)

        # NOTE: no idea why this was needed
        # TODO: there is an error in lif.py in line 80 where the json object
        # is handed in as the id
        fix_view('doc', self.lif.views[0])
        fix_view('top', self.top.views[0])
        self.lif.views.append(self.top.views[0])

        text = self.lif.text.value
        self.annotations = Annotations(self.id, fname, doc=self, text=text)
        self.annotations.text = self.lif.text.value

        self._collect_authors()
        self._collect_topics()
        self._collect_relations()

    def get_view(self, identifier):
        """Return the view with the given identifier from the document LIF."""
        return self.lif.get_view(identifier)

    def _collect_authors(self):
        """Copy the acceptable authors from the metadata to the index."""

        def okay(author):
            # need to do this because the filter in covid.py is faulty
            if len(author) <= 3:
                return False
            return author[0] != ' ' and author[-1] != ' '

        authors = []
        for author in self.lif.metadata['authors']:
            if okay(author):
                authors.append(author)
        self.annotations.authors = authors

    def _collect_topics(self):
        """Collect the topic names and their sorted, unique elements."""
        view = self.get_view("top")
        for annotation in view.annotations:
            if not annotation.type.endswith('SemanticTag'):
                continue
            topic_name = annotation.features['topic_name']
            self.annotations.topics.append(topic_name)
            for topic_element in topic_name.split():
                self.annotations.topic_elements.append(topic_element)
        unique_elements = set(self.annotations.topic_elements)
        self.annotations.topic_elements = sorted(unique_elements)

    def _collect_relations(self):
        """Index the containers, proteins and relations from the metadata."""
        # `added` is only of use to debugging output that is disabled.
        added = False
        relations = self.har.metadata['relations']
        for relobj, subjs in relations.items():
            self.annotations.containers.append(relobj)
            self.annotations.proteins.append(relobj.rsplit('-', 2)[0])
            for subj in subjs:
                self.annotations.proteins.append(subj)
                if relobj in self.annotations.relations:
                    added = True
                # NOTE(review): assumes annotations.relations creates missing
                # keys on access (for example a defaultdict of lists); the
                # nesting was reconstructed from a flattened source.
                self.annotations.relations[relobj].append(subj)

    def write(self, dirname):
        """Write the annotations to a file named after fname in dirname."""
        target = os.path.join(dirname, self.fname)
        self.annotations.write(target, self.lif.metadata["year"])

    def pp(self, prefix=''):
        """Print a short summary of the document, its views and annotations."""
        views = []
        for view in self.lif.views:
            views.append("%s:%d" % (view.id, len(view)))
        print("%s<Document id=%s '%s'>" % (prefix, self.id, self.fname))
        print(" <Views %s>" % ' '.join(views))
        print(" %s\n" % self.annotations)
def get_lif(fpath):
    """Return the LIF object stored in fpath.

    The file is first read as a Container, whose payload is returned. If
    that fails for any ordinary reason, the file is read as a bare LIF
    object instead. Errors raised by that second attempt propagate to the
    caller.
    """
    try:
        lif = Container(fpath).payload
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt and
        # SystemExit; only ordinary errors should trigger the fallback.
        lif = LIF(fpath)
    return lif
class Converter(object):
    """Converts the JSON from a COVID file into a LIF document."""

    # TODO: add the directory of the sourcefile to the metadata
    # TODO: (this is to distinguish between the licenses)

    def __init__(self, infile, outfile, metadata):
        """Remember the file names and load the source document."""
        self.infile = infile
        self.outfile = outfile
        self.doc = CovidDoc(self.infile, metadata)

    def convert(self):
        """Convert the source document and write the LIF file.

        Documents that report themselves as incomplete are skipped. The
        output file is written by ``_finish``; the unused file handle that
        used to be opened here has been removed, so the output file is no
        longer created (empty) before the conversion has succeeded.
        """
        print('Converting', os.path.basename(self.infile))
        if not self.doc.is_complete():
            print('skipping')
            return
        self._setup()
        self._collect_metadata()
        self._add_abstract()
        self._add_sections()
        self._finish()

    def _setup(self):
        """Reset the identifiers and create the empty output structures."""
        Identifiers.reset()
        self.p = 0
        self.lif = LIF()
        self.text = StringIO()
        self.view = View('docstruct')

    def _collect_metadata(self):
        """Copy title, identifiers, year and author names to the metadata.

        An author is only kept when the full name still contains a space
        after stripping surrounding whitespace, that is, when there is more
        than a single name part.
        """
        self.lif.metadata['title'] = self.doc.title
        self.lif.metadata['sha'] = self.doc.id
        self.lif.metadata['pmid'] = self.doc.pmid
        self.lif.metadata['year'] = self.doc.year
        self.lif.metadata['authors'] = []
        for author in self.doc.authors:
            fullname = "%s %s" % (author['first'], author['last'])
            # The formatted name always contains the separating space, so the
            # test has to be done on the stripped name; this filters out all
            # the short single names including the deceased sign.
            if ' ' in fullname.strip():
                self.lif.metadata['authors'].append(fullname)

    def _add_docelement_anno(self, docelement_type, p1, p2):
        """Add a Section annotation of the given type spanning p1 to p2."""
        self.view.add(
            Annotation({
                'id': Identifiers.new_id('de'),
                '@type': 'Section',
                'start': p1,
                'end': p2,
                'features': {
                    'section_type': docelement_type
                }
            }))

    def _add_abstract(self):
        """Add the abstract paragraphs to the text as one Abstract element."""
        # TODO: would like to add the section header
        # TODO: should make sure that the docelement ends not after the newlines
        abstract_p0 = self.p
        for text_str in self.doc.abstract:
            text_str += u"\n\n"
            self.p += len(text_str)
            self.text.write(text_str)
        self._add_docelement_anno('Abstract', abstract_p0, self.p)

    def _add_sections(self):
        """Add each body paragraph to the text with a Paragraph element."""
        # TODO: add section header
        previous_header = None
        section_p0 = self.p
        for header_str, text_str in self.doc.body_text:
            text_str += u"\n\n"
            header_str += u"\n\n"
            self.p += len(text_str)
            if header_str != previous_header:
                # The header is tracked but not written to the text yet.
                previous_header = header_str
            self.text.write(text_str)
            self._add_docelement_anno('Paragraph', section_p0, self.p)
            section_p0 = self.p

    def _finish(self):
        """Gather it all up and write output."""
        self.lif.text = Text(json_obj={
            'language': 'en',
            '@value': self.text.getvalue()
        })
        self.lif.views.append(self.view)
        self.lif.write(self.outfile, pretty=True)
def _setup(self):
    """Reset the identifiers and create the empty output structures.

    After this call the running offset ``self.p`` is 0, ``self.lif`` is a
    new LIF object, ``self.text`` an empty text buffer and ``self.view`` a
    new view named 'docstruct'.
    """
    Identifiers.reset()
    self.p = 0
    self.lif, self.text = LIF(), StringIO()
    self.view = View('docstruct')