def generate_sentence_types(ttk, sen, words):
    """Read the .lif files in the ttk directory, classify each sentence as
    'normal' or 'crap' with the SentenceClassifier, and write a copy with a
    sentences view to the sen directory."""
    for fname in os.listdir(ttk):
        if not fname.endswith('.lif'):
            continue
        print("{} ... ".format(os.path.basename(fname)))
        if DEBUG:
            GOOD.write(">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname, '-' * 100))
            BAD.write(">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname, '-' * 100))
        fname_in = os.path.join(ttk, fname)
        fname_out = os.path.join(sen, fname)
        lif_in = LIF(fname_in)
        lif_out = LIF(json_object=lif_in.as_json())
        sentences_view = _create_view()
        lif_out.views = [sentences_view]
        good_sentences = 0
        bad_sentences = 0
        view = lif_in.get_view('v1')
        for anno in view.annotations:
            if anno.type.endswith('Sentence'):
                sc = SentenceClassifier(lif_in, anno, words)
                if sc.is_crap():
                    if DEBUG:
                        BAD.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
                    anno.features['type'] = 'crap'
                    bad_sentences += 1
                else:
                    if DEBUG:
                        GOOD.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
                    anno.features['type'] = 'normal'
                    good_sentences += 1
                sentences_view.annotations.append(anno)
        if DEBUG:
            print("  (good={:d} bad={:d})".format(good_sentences, bad_sentences))
        lif_out.write(fname=fname_out, pretty=True)
        # break
    print()
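
# A minimal usage sketch, not part of the original source. It assumes the TTK
# output lives in data/ttk, that the sentence-typed copies go to data/sen, and
# that the word list is a plain text file with one token per line; all of these
# names are hypothetical, the function only requires that `words` supports
# membership tests.
def _example_run():
    with open('words.txt') as fh:                 # hypothetical word list file
        words = set(line.strip() for line in fh)
    generate_sentence_types('data/ttk', 'data/sen', words)
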
class Document(object):

    def __init__(self, fname, data_dir, lif_file, top_file, har_file):
        """Build a single LIF object with all relevant annotations. The
        annotations themselves are stored in the Annotations object in
        self.annotations."""
        self.id = fname
        self.fname = fname
        self.data_dir = data_dir
        self.lif = LIF(json_file=lif_file)
        self.top = LIF(json_file=top_file)
        self.har = LIF(json_file=har_file)
        # NOTE: no idea why this was needed
        # TODO: there is an error in lif.py in line 80 where the json object is
        # handed in as the id
        fix_view('doc', self.lif.views[0])
        fix_view('top', self.top.views[0])
        self.lif.views.append(self.top.views[0])
        self.annotations = Annotations(self.id, fname, doc=self,
                                       text=self.lif.text.value)
        self.annotations.text = self.lif.text.value
        self._collect_authors()
        self._collect_topics()
        self._collect_relations()

    def get_view(self, identifier):
        return self.lif.get_view(identifier)

    def _collect_authors(self):
        """Just get the authors from the metadata and put them in the index."""
        def okay(a):
            # need to do this because the filter in covid.py is faulty
            return len(a) > 3 and not a[0] == ' ' and not a[-1] == ' '
        self.annotations.authors = [a for a in self.lif.metadata['authors'] if okay(a)]

    def _collect_topics(self):
        """Collect the topics and put them on a list in the index."""
        view = self.get_view("top")
        for annotation in view.annotations:
            if annotation.type.endswith('SemanticTag'):
                topic_name = annotation.features['topic_name']
                self.annotations.topics.append(topic_name)
                for topic_element in topic_name.split():
                    self.annotations.topic_elements.append(topic_element)
        self.annotations.topic_elements = sorted(set(self.annotations.topic_elements))

    def _collect_relations(self):
        added = False
        for relobj, subjs in self.har.metadata['relations'].items():
            self.annotations.containers.append(relobj)
            self.annotations.proteins.append(relobj.rsplit('-', 2)[0])
            for subj in subjs:
                self.annotations.proteins.append(subj)
                if relobj in self.annotations.relations:
                    added = True
                    self.annotations.relations[relobj].append(subj)
        # print(self.annotations.proteins)
        # print(self.annotations.containers)
        # if added:
        #     print(self.annotations.relations)

    def write(self, dirname):
        self.annotations.write(os.path.join(dirname, self.fname),
                               self.lif.metadata["year"])

    def pp(self, prefix=''):
        views = ["%s:%d" % (view.id, len(view)) for view in self.lif.views]
        print("%s<Document id=%s '%s'>" % (prefix, self.id, self.fname))
        print("    <Views %s>" % ' '.join(views))
        print("    %s\n" % self.annotations)
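
# Illustrative usage of Document, not taken from the source. It assumes the
# three LIF layers (basic LIF, topics, and harvested relations) for one article
# live in parallel lif/, top/ and har/ subdirectories of a data directory; the
# file and directory names below are hypothetical.
def _example_document():
    fname = 'article-0001.lif'
    data_dir = 'data'
    doc = Document(fname, data_dir,
                   os.path.join(data_dir, 'lif', fname),
                   os.path.join(data_dir, 'top', fname),
                   os.path.join(data_dir, 'har', fname))
    doc.pp()                 # print a summary of the document's views
    doc.write('out')         # write collected annotations to the out directory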