Beispiel #1
0
def generate_sentence_types(ttk, sen, words):
    """Label the sentences of every ``.lif`` file in a directory.

    For each file in ``ttk`` whose name ends in ``.lif`` the file is loaded as
    a LIF object, every annotation in its ``'v1'`` view whose type ends with
    ``'Sentence'`` is run through ``SentenceClassifier`` and gets its ``type``
    feature set to ``'crap'`` or ``'normal'``.  A copy of the input LIF whose
    only view holds these sentence annotations is then written, under the
    same file name, to ``sen``.

    Args:
        ttk: Directory that is scanned for input ``.lif`` files.
        sen: Directory that receives the output files.
        words: Passed unchanged to ``SentenceClassifier`` (presumably the
            vocabulary it scores sentences against -- confirm in that class).

    When the module-level ``DEBUG`` flag is set, classified sentences are
    also logged to the ``GOOD`` and ``BAD`` file handles and per-file counts
    are printed.
    """
    for fname in os.listdir(ttk):
        if not fname.endswith('.lif'):
            continue
        print("{} ... ".format(os.path.basename(fname)))
        if DEBUG:
            # The same banner separates the files in both debug logs.
            banner = ">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname, '-' * 100)
            GOOD.write(banner)
            BAD.write(banner)
        fname_in = os.path.join(ttk, fname)
        fname_out = os.path.join(sen, fname)
        lif_in = LIF(fname_in)
        # The output starts as a copy of the input; its views are replaced by
        # a single new view that collects the labelled sentences.
        lif_out = LIF(json_object=lif_in.as_json())
        sentences_view = _create_view()
        lif_out.views = [sentences_view]
        good_sentences = 0
        bad_sentences = 0
        view = lif_in.get_view('v1')
        for anno in view.annotations:
            if not anno.type.endswith('Sentence'):
                continue
            sc = SentenceClassifier(lif_in, anno, words)
            if sc.is_crap():
                if DEBUG:
                    BAD.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
                anno.features['type'] = 'crap'
                bad_sentences += 1
            else:
                if DEBUG:
                    GOOD.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
                anno.features['type'] = 'normal'
                good_sentences += 1
            # The annotation object from the input is reused (and was just
            # modified in place) rather than copied.
            sentences_view.annotations.append(anno)
        if DEBUG:
            print(" (good={:d} bad={:d})".format(good_sentences,
                                                 bad_sentences))
        lif_out.write(fname=fname_out, pretty=True)
    # Emit the closing blank line.  This used to be a bare ``print``, which
    # is a no-op expression when print is a function.
    print()
Beispiel #2
0
class Document(object):
    """One document built from three LIF files, plus its collected annotations.

    The main LIF object is kept in ``self.lif``; the first view of the topics
    LIF is appended to its views.  Authors, topics and relations are gathered
    into the ``Annotations`` instance stored in ``self.annotations``.
    """

    def __init__(self, fname, data_dir, lif_file, top_file, har_file):
        """Load the three LIF files and collect all relevant annotations.

        Args:
            fname: File name of the document; also used as its identifier.
            data_dir: Stored on the instance as ``self.data_dir``.
            lif_file: Path of the JSON file loaded into ``self.lif``.
            top_file: Path of the JSON file loaded into ``self.top``.
            har_file: Path of the JSON file loaded into ``self.har``.
        """
        self.id = fname
        self.fname = fname
        self.data_dir = data_dir
        self.lif = LIF(json_file=lif_file)
        self.top = LIF(json_file=top_file)
        self.har = LIF(json_file=har_file)
        # NOTE: it is unclear why the views need fixing here.
        # TODO: lif.py (around line 80) hands in the json object as the id,
        # which is probably what fix_view() compensates for.
        doc_view = self.lif.views[0]
        top_view = self.top.views[0]
        fix_view('doc', doc_view)
        fix_view('top', top_view)
        self.lif.views.append(top_view)
        text = self.lif.text.value
        self.annotations = Annotations(self.id, fname, doc=self, text=text)
        self.annotations.text = text
        self._collect_authors()
        self._collect_topics()
        self._collect_relations()

    def get_view(self, identifier):
        """Return the view with the given identifier from the main LIF."""
        return self.lif.get_view(identifier)

    def _collect_authors(self):
        """Copy the usable authors from the LIF metadata into the index."""
        # The filter in covid.py is faulty, so repeat the check here: keep
        # only names longer than three characters that neither start nor end
        # with a space.
        self.annotations.authors = [
            author
            for author in self.lif.metadata['authors']
            if len(author) > 3
            and not (author[0] == ' ' or author[-1] == ' ')
        ]

    def _collect_topics(self):
        """Gather topic names and their individual words into the index."""
        index = self.annotations
        for annotation in self.get_view("top").annotations:
            if not annotation.type.endswith('SemanticTag'):
                continue
            topic_name = annotation.features['topic_name']
            index.topics.append(topic_name)
            index.topic_elements.extend(topic_name.split())
        # Topic words are stored de-duplicated and in sorted order.
        unique_elements = set(index.topic_elements)
        index.topic_elements = sorted(unique_elements)

    def _collect_relations(self):
        """Add containers, proteins and relations from the ``har`` metadata."""
        index = self.annotations
        for relobj, subjs in self.har.metadata['relations'].items():
            index.containers.append(relobj)
            # Everything before the last two '-'-separated fields of the
            # relation object is recorded as a protein.
            index.proteins.append(relobj.rsplit('-', 2)[0])
            # Subjects are only attached to relation objects that already
            # have an entry in the relations index.
            tracked = relobj in index.relations
            for subj in subjs:
                index.proteins.append(subj)
                if tracked:
                    index.relations[relobj].append(subj)

    def write(self, dirname):
        """Write the annotations to ``dirname`` under the document's file name."""
        target = os.path.join(dirname, self.fname)
        year = self.lif.metadata["year"]
        self.annotations.write(target, year)

    def pp(self, prefix=''):
        """Print the document id, the size of each view and the annotations."""
        view_summary = ' '.join(
            "%s:%d" % (view.id, len(view)) for view in self.lif.views)
        print("%s<Document id=%s '%s'>" % (prefix, self.id, self.fname))
        print("    <Views %s>" % view_summary)
        print("    %s\n" % self.annotations)