Example #1
import re

from openie import StanfordOpenIE
# Assumption: a stanfordcorenlp client supplies the .ner() call used below;
# the original snippet never defines self.nlp.
from stanfordcorenlp import StanfordCoreNLP


class TripleExtractor:
    def __init__(self):
        # Matches runs of non-ASCII characters.
        self.regex_pattern = r'[^\x00-\x7F]+'
        self.openie = StanfordOpenIE()
        # Connect to an already-running CoreNLP server (assumed address).
        self.nlp = StanfordCoreNLP('http://localhost', port=9000)

    def clean_text(self, text):
        # Replace non-ASCII runs with a single space.
        return re.sub(self.regex_pattern, ' ', text)

    def extract_entities(self, text):
        # CoreNLP NER output: a list of (token, tag) pairs.
        return self.nlp.ner(text)

    def get_triples(self, text):
        # Each triple is a dict with 'subject', 'relation' and 'object' keys.
        return self.openie.annotate(text)
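
A minimal usage sketch, assuming a CoreNLP server is reachable on localhost:9000 and Java is available for the StanfordOpenIE backend:

extractor = TripleExtractor()
text = extractor.clean_text('Barack Obama was born in Hawaii.')
print(extractor.extract_entities(text))
for triple in extractor.get_triples(text):
    print(triple['subject'], triple['relation'], triple['object'])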
Example #2
class OpenIEBaselineModel:
    def __init__(self):
        import spacy
        from openie import StanfordOpenIE

        self.openie_client = StanfordOpenIE()
        self.spacy_nlp = spacy.load("en_core_web_sm")

    def predict(self, inst, supporting_facts):
        # inst["context"] maps each title/entity to its list of sentences.
        ent2doc = dict(inst["context"])
        reasoning_steps = []

        for sup_ent, sup_sent_id in supporting_facts:
            # Guard against out-of-range sentence ids (>=, not >, since
            # sup_sent_id indexes the sentence list below).
            if sup_sent_id >= len(ent2doc[sup_ent]):
                continue
            # Disabled variant: re-tokenize with spaCy and substitute the
            # supporting entity for bare pronouns before extraction.
            # sup_sent = list(self.spacy_nlp(ent2doc[sup_ent][sup_sent_id]).sents)
            # if len(sup_sent) == 0:
            #     continue
            # sup_sent = sup_sent[0]
            # sup_sent = [sup_ent if tk.text in ["it", "they", "she", "he"] else str(tk) for tk in sup_sent]
            # sup_sent = " ".join(sup_sent)
            sup_sent = ent2doc[sup_ent][sup_sent_id]

            for triplet in self.openie_client.annotate(sup_sent):
                # Resolve bare pronoun subjects to the supporting entity.
                if triplet["subject"] in ["it", "they", "she", "he"]:
                    triplet["subject"] = sup_ent

                reasoning_steps.append((
                    sup_ent,
                    sup_sent_id,
                    (triplet["subject"], triplet["relation"],
                     triplet["object"]),
                ))

        return reasoning_steps
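
A hedged usage sketch; the inst layout (HotpotQA-style context pairs of title and sentence list) is inferred from how predict() indexes it:

model = OpenIEBaselineModel()
inst = {
    "context": [
        ("Hawaii", ["Hawaii is a U.S. state.",
                    "It became a state in 1959."]),
    ],
}
supporting_facts = [("Hawaii", 1)]
for step in model.predict(inst, supporting_facts):
    print(step)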
Example #3
import os
import random
import warnings
from pathlib import Path

import neuralcoref
import spacy
from openie import StanfordOpenIE
from spacy.util import compounding, filter_spans, minibatch


class ExtractInformation:
    IS_GPU = True
    SUBJECT = 'subject'
    SUBJECT_ENTITY = 'subject_entity'
    RELATION = 'relation'
    OBJECT = 'object'
    OBJECT_ENTITY = 'object_entity'

    ENTITY_NAME = 'name'
    ENTITY_TYPE = 'entity_type'

    ENTITY_SUBJECT_OTHER = 'subject_other'
    ENTITY_OBJECT_OTHER = 'object_other'

    def __init__(self, modelSpacy='en_core_web_lg', modelCoref='en'):
        print(os.path.dirname(spacy.__file__))
        if ExtractInformation.IS_GPU:
            spacy.prefer_gpu()

        self.modelSpacy = modelSpacy
        self.modelCoref = modelCoref
        self.stanfordClient = StanfordOpenIE()

        self.nlpCoref, self.nlpSpacy = self.initSpacy(modelSpacy, modelCoref)

    def initSpacy(self, modelSpacy, modelCoref):
        nlpSpacy = spacy.load(modelSpacy)

        # Separate pipeline for coreference, with NeuralCoref attached.
        nlpCoref = spacy.load(modelCoref)
        coref = neuralcoref.NeuralCoref(nlpCoref.vocab)
        nlpCoref.add_pipe(coref, name='neuralcoref')

        return nlpCoref, nlpSpacy

    # Stage 1: resolve pronouns to the nouns they refer to, e.g.
    # "My sister has a dog. She loves him." gives the clusters
    # [My sister: [My sister, She], a dog: [a dog, him]]
    def replacePronounsToNoun(self, nlp, inputText):
        outputText = inputText
        doc = nlp(inputText)
        if doc._.has_coref:
            outputText = doc._.coref_resolved
        return doc._.has_coref, outputText

    #Stage 2: Extract Entities
    def extractEntities(self, nlp, inputText):
        doc = nlp(inputText)
        entities = []
        for ent in doc.ents:
            entities.append({
                ExtractInformation.ENTITY_NAME: ent.text,
                ExtractInformation.ENTITY_TYPE: ent.label_
            })
        return entities

    # Stage 3: Extract Triples
    def extractTriple(self, inputText):
        hasCoref, inputText = self.replacePronounsToNoun(
            self.nlpCoref, inputText)

        tripleStanfords = self.extractTripleStanfordOpenIE(inputText)
        tripleSpacys = self.extractTripleSpacy(self.nlpSpacy, inputText)

        # Deduplicate: drop a Stanford triple when a spaCy triple shares its
        # subject, matches its object, and has a similar relation. Work on a
        # copy so removal does not skip items of the list being iterated.
        tripleTemps = list(tripleStanfords)
        for tripleStanford in tripleStanfords:
            subject1 = tripleStanford.get(ExtractInformation.SUBJECT)
            relation1 = tripleStanford.get(ExtractInformation.RELATION)
            object1 = tripleStanford.get(ExtractInformation.OBJECT)
            for tripleSpacy in tripleSpacys:
                subject2 = tripleSpacy.get(ExtractInformation.SUBJECT)
                relation2 = tripleSpacy.get(ExtractInformation.RELATION)
                object2 = tripleSpacy.get(ExtractInformation.OBJECT)

                if subject1 == subject2:
                    if object1 == object2 or object1 in object2:
                        text1 = self.nlpSpacy(relation1)
                        text2 = self.nlpSpacy(relation2)
                        if text1.similarity(text2) > 0.6:
                            tripleTemps.remove(tripleStanford)
                            break

        triples = tripleTemps + tripleSpacys

        # Attach spaCy NER spans to each triple's subject and object.
        for triple in triples:
            subjectEnts = self.nlpSpacy(triple.get(ExtractInformation.SUBJECT))
            triple[ExtractInformation.SUBJECT_ENTITY] = [
                (e.text, e.start_char, e.end_char, e.label_)
                for e in subjectEnts.ents
            ]

            objectEnts = self.nlpSpacy(triple.get(ExtractInformation.OBJECT))
            triple[ExtractInformation.OBJECT_ENTITY] = [
                (e.text, e.start_char, e.end_char, e.label_)
                for e in objectEnts.ents
            ]
        return triples

    def extractTripleStanfordOpenIE(self, inputText):
        triples = []
        try:
            triples = self.stanfordClient.annotate(inputText)
        except Exception as exception:
            print("--- extract Triple Stanford OpenIE Error " + str(exception))
        return triples

    def extractTripleSpacy(self, nlp, inputText):
        docSeparate = nlp(inputText)
        sentences = [sent.text.strip() for sent in docSeparate.sents]
        triples = []

        for sentence in sentences:
            doc = nlp(sentence)
            # Merge entities and noun chunks into single tokens;
            # filter_spans removes overlapping spans first.
            spans = filter_spans(list(doc.ents) + list(doc.noun_chunks))
            with doc.retokenize() as retokenizer:
                for span in spans:
                    retokenizer.merge(span)

            for ent in doc.ents:
                # A prepositional child of the entity's head verb yields a
                # triple (entity, "head prep", object of the preposition).
                preps = [
                    prep for prep in ent.root.head.children
                    if prep.dep_ == "prep"
                ]
                for prep in preps:
                    for child in prep.children:
                        triples.append({
                            ExtractInformation.SUBJECT: ent.text,
                            ExtractInformation.RELATION:
                                "{} {}".format(ent.root.head.text, prep.text),
                            ExtractInformation.OBJECT: child.text,
                        })
        return triples

    def trainAdditionalEntity(self,
                              train_data,
                              label,
                              nlp,
                              model=None,
                              n_iter=30):
        if ("ner" not in nlp.pipe_names):
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner)
        else:
            ner = nlp.get_pipe("ner")
        ner.add_label(label)

        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()

        # get names of other pipes to disable them during training
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
        ]

        # only train NER; chain the two context managers with a comma,
        # since "and" would enter just one of them
        with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
            # show warnings for misaligned entity spans once
            warnings.filterwarnings("once",
                                    category=UserWarning,
                                    module='spacy')

            sizes = compounding(1.0, 4.0, 1.001)
            # batch up the examples using spaCy's minibatch
            for itn in range(n_iter):
                random.shuffle(train_data)
                batches = minibatch(train_data, size=sizes)
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               drop=0.35,
                               losses=losses)
                print("Losses", losses)

        return nlp

    def saveModel(self, output_dir, nlp, new_model_name):
        if output_dir is not None:
            output_dir = Path(output_dir)
            if not output_dir.exists():
                output_dir.mkdir()
            nlp.meta["name"] = new_model_name  # rename model
            nlp.to_disk(output_dir)
            print("Saved model to", output_dir)