Example 1
    def __init__(self, modelSpacy='en_core_web_lg', modelCoref='en'):
        print(os.path.dirname(spacy.__file__))
        if ExtractInformation.IS_GPU:
            spacy.prefer_gpu()

        self.modelSpacy = modelSpacy
        self.modelCoref = modelCoref
        self.stanfordClient = StanfordOpenIE()

        self.nlpCoref, self.nlpSpacy = self.initSpacy(modelSpacy, modelCoref)
Example 2
    def process(self, context):
        super().process()
        structured_tweets = []

        with contextlib.redirect_stdout(None):
            with StanfordOpenIE() as client:
                for index, item in context['_data'].iterrows():
                    triples = []
                    for triple in client.annotate(item['content']):
                        if (triple['subject'] and triple['relation']
                                and triple['object']):
                            if (self._keyword in triple['subject']
                                    or self._keyword in triple['object']):
                                relation_pos = wn.synsets(triple['relation'])
                                if relation_pos and relation_pos[0].pos() == 'v':
                                    triple['date'] = item['date']
                                    triples.append(triple)

                    if triples:
                        # sort ascending by object length and keep the most specific triple
                        triples.sort(key=lambda x: len(x['object']))
                        structured_tweets.append(triples[-1])
        df = pd.DataFrame(data=structured_tweets).drop_duplicates(
            subset=['subject', 'relation', 'object'])

        if self._csv:
            df.to_csv(self._csv, index=False)

        x = {'_data': df}
        return x
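The WordNet test above keeps a triple only when the first synset of its relation is a verb. A minimal sketch of that check, assuming nltk and its wordnet corpus are installed:

from nltk.corpus import wordnet as wn

synsets = wn.synsets('wrote')  # WordNet's morphy maps 'wrote' to the lemma 'write'
if synsets and synsets[0].pos() == 'v':
    print('relation looks verb-like')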
Example 3
def extraction(text):
    """
    Extract relations between entities present in the news item and packs them in (head, relation, tail) triples.

    :param text: str with input news content
    :return: list of (h, r, t) triples
    """
    nlp = spacy.load('en_core_web_lg')

    # resolve pronouns to their antecedents before extraction
    text_resolved = resolve_coreferences(nlp, text)

    doc = nlp(text_resolved)
    # lemmatize verbs and auxiliaries so OpenIE sees base verb forms
    lemmatized_text = ' '.join([
        token.lemma_ if token.pos_ in ('VERB', 'AUX') else token.text
        for token in doc
    ])

    with StanfordOpenIE() as client:
        triples_dict = client.annotate(lemmatized_text)

    valid_ents = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LANGUAGE', 'DATE',
                  'TIME']
    entities = [ent.text for ent in doc.ents if ent.label_ in valid_ents]

    return [(d['subject'], d['relation'], d['object']) for d in triples_dict], list(set(entities))
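The function above assumes a resolve_coreferences helper that is not shown. A hypothetical sketch using neuralcoref on spaCy 2.x (the original project may implement it differently):

import neuralcoref

def resolve_coreferences(nlp, text):
    # hypothetical helper: register the coref component once, then substitute resolved spans
    if 'neuralcoref' not in nlp.pipe_names:
        neuralcoref.add_to_pipe(nlp)
    doc = nlp(text)
    return doc._.coref_resolved if doc._.has_coref else text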
Example 4
def process_text(text, name):

    top_relations = []
    triples = []

    with StanfordOpenIE() as client:
        for triple in client.annotate(text):
            top_relations.append(triple['relation'])
            triples.append(triple)

        # keep the 15 most frequent relation strings (a stop-relation filter,
        # e.g. dropping 'is'/'has', could be applied here as in the original draft)
        terms = set(w for w, _ in Counter(top_relations).most_common(15))
        print(terms)
        for t in triples:
            print(t)

        graph_image = name + '_graph.png'
        client.generate_graphviz_graph(text, graph_image)
        print('Graph generated: %s.' % graph_image)
Example 5
def process_relation_extraction(text):
    triples = []
    with StanfordOpenIE() as client:
        for triple in client.annotate(text):
            triples.append(triple)

    return pd.DataFrame(triples)
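A usage sketch: when triples are found, the resulting DataFrame has 'subject', 'relation' and 'object' columns (the sentence is illustrative):

df = process_relation_extraction('Barack Obama was born in Hawaii.')
print(df.columns.tolist())  # expected: ['subject', 'relation', 'object']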
Example 6
def openie_subj(data):
    """
    input:
        data - column of pd.DataFrame() ('content'/'lead'/...)
    output:
        subjects from OpenIE
    """
    dependencies = []
    with StanfordOpenIE() as client:
        for item in tqdm(data, total=len(data)):
            dependency = []
            relations = client.annotate(item)
            for r in relations:
                words = tuple(r.values())
                dependency.append(words)
            subjects = list(map(lambda x: x[0], dependency))  # the first field of each triple is the subject
            c = Counter(subjects)
            common = OrderedDict({
                key: val
                for key, val in sorted(
                    c.items(), key=lambda kv: kv[1], reverse=True)
                if val > 1
            })
            dependencies.append(common)
    return dependencies
Example 7
def information_extraction(text):
    with StanfordOpenIE() as client:
        output = client.annotate(text)
    if isinstance(output, list):
        if len(output) != 0:
            return output[0]
        else:
            return np.nan
    return output
Example 8
def tweet_token(tweet):
    with StanfordOpenIE() as client:
        print('Text: %s.' % tweet)
        for triple in client.annotate(tweet):
            # serialize the first extracted triple to a JSON string and return it
            return json.dumps(triple)
Example 9
def stanford_openie():
    from openie import StanfordOpenIE

    with StanfordOpenIE() as client:
        text = 'Barack Obama was born in Hawaii. Richard Manning wrote this sentence.'
        print('Text: %s.' % text)
        for triple in client.annotate(text):
            print('|-', triple)

        graph_image = 'graph.png'
        client.generate_graphviz_graph(text, graph_image)
        print('Graph generated: %s.' % graph_image)
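Each dict yielded by annotate has 'subject', 'relation' and 'object' keys, so the loop above prints output along these lines (illustrative):

|- {'subject': 'Barack Obama', 'relation': 'was born in', 'object': 'Hawaii'}
|- {'subject': 'Richard Manning', 'relation': 'wrote', 'object': 'this sentence'}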
Example 10
def load_ie():
    """
    Load Stanford Open IE based Triple Extractor function

    Returns:
        function: Open IE annotate function to extract fact triple

    """
    print("Loading Open IE Pipeline...")
    from openie import StanfordOpenIE
    client = StanfordOpenIE()  # note: the client stays open for the process lifetime; it is never closed
    return client.annotate
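A usage sketch for the returned function (the sentence is illustrative):

annotate = load_ie()
for triple in annotate('Barack Obama was born in Hawaii.'):
    print(triple['subject'], '|', triple['relation'], '|', triple['object'])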
Example 11
class TripleExtractor():
    def __init__(self):
        self.regex_pattern = r'[^\x00-\x7F]+' 
        self.openie = StanfordOpenIE()

    def clean_text(self, text):
        return re.sub(self.regex_pattern, ' ', text)

    def extract_entities(self, text):
        # assumes self.nlp is a CoreNLP client set elsewhere; it is not initialised in __init__
        return self.nlp.ner(text)

    def get_triples(self, text):
        triples = self.openie.annotate(text)
        return triples
Example 12
    def SVO_extractor(self, data):
        with StanfordOpenIE() as client:
            svo_pos = {'s_pos': [], 'v_pos': [], 'o_pos': [], 'label': [], 'date': []}
            for index, row in tqdm(data.iterrows()):
                try:
                    for sentence in client.annotate(row['header']):    
                        svo_pos['s_pos'].append(sentence['subject'])
                        svo_pos['v_pos'].append(sentence['relation'])
                        svo_pos['o_pos'].append(sentence['object'])
                        svo_pos['label'].append(1)
                        svo_pos['date'].append(row['date'])
                except AttributeError:
                    pass

        return svo_pos
Example 13
def graph_annotations(text, properties, doc_key, graph_out_loc):
    """
    Use philipperemy's openie wrapper to make graphviz renderings of a set of
    annotations.

    parameters:
        text, str: text to annotate and graph
        properties, dict: properties dict containing the affinity cap
        doc_key, str: name of the document, used to create the output file name
        graph_out_loc, str: path to save the output file

    returns: None
    """
    save_name = f'{graph_out_loc}/{doc_key}_openie_graph.png'

    with StanfordOpenIE(properties=properties) as client:
        client.generate_graphviz_graph(text, save_name)
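For reference, a minimal properties dict of the kind the wrapper's README shows (the affinity cap value below is the README's example, not a requirement):

properties = {
    'openie.affinity_probability_cap': 2 / 3,
}
graph_annotations(text='Barack Obama was born in Hawaii.',
                  properties=properties,
                  doc_key='demo',
                  graph_out_loc='.')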
Example 14
def main():
    '''

    Open Information Extraction example using spaCy
    - doc taken from https://en.wikipedia.org/wiki/World_War_II
    - compared against the StanfordOpenIE library https://github.com/philipperemy/Stanford-OpenIE-Python

    '''

    text = """The Empire of Japan aimed to dominate Asia and the Pacific and was 
        already at war with the Republic of China in 1937, but the world war is 
        generally said to have begun on 1 September 1939 with the invasion of 
        Poland by Germany and subsequent declarations of war on Germany by 
        France and the United Kingdom. From late 1939 to early 1941, in a 
        series of campaigns and treaties, Germany conquered or controlled much 
        of continental Europe, and formed the Axis alliance with Italy and 
        Japan. Under the Molotov-Ribbentrop Pact of August 1939, Germany and the 
        Soviet Union partitioned and annexed territories of their European 
        neighbours, Poland, Finland, Romania and the Baltic states. The war 
        continued primarily between the European Axis powers and the coalition 
        of the United Kingdom and the British Commonwealth, with campaigns 
        including the North Africa and East Africa campaigns, the aerial Battle 
        of Britain, the Blitz bombing campaign, the Balkan Campaign as well as 
        the long-running Battle of the Atlantic. In June 1941, the European Axis 
        powers launched an invasion of the Soviet Union, opening the largest 
        land theatre of war in history, which trapped the major part of the
        Axis' military forces into a war of attrition. In December 1941, Japan 
        attacked the United States and European territories in the Pacific
        Ocean, and quickly conquered much of the Western Pacific."""

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    sentences = list(doc.sents)
    for sentence in sentences:
        subject, verb, attribute = extract_svo(sentence, nlp)
        print("Subject: ", subject, "| Verb: ", verb, "| Obj: ", attribute)

    print("================ With StanfordOpenIe:")
    with StanfordOpenIE() as client:
        for sentence in sentences:
            for triple in client.annotate(sentence.text):
                print('|-', triple)
Example 15
    def run(self):
        if self.config['extraction']['oie'] == 'stanford':
            with StanfordOpenIE() as client:
                if self.config['mode'] == 'qa':
                    for qa in self.input:
                        for utter in qa['utterances']:
                            for sent in utter['sents']:
                                sent['triples'] = client.annotate(
                                    sent['statement'])
                elif self.config['mode'] == 'subtitle':
                    for ep in self.input:
                        for scene in ep:
                            for u in scene['scene']:
                                for sent in u['sents']:
                                    sent['triples'] = client.annotate(
                                        sent['statement'])

            print('Stanford Open IE done.')

        return self.input
Example 16
def start(new_book: str, book_file_name: str):
    """
    function:   Extract the relation between the entities in the book.
                we will use Python3 wrapper for Stanford OpenIE for this job.

    Input:      A string:   "new_book" of the pre-processed book
                A string:   "book_file_name" which is name of the book as stored on Hard disk.

    Returns:    Nothing, it generates the graph and saves it as image. It also outputs the relations in a text file
    """

    # We take only the first 40,000 characters of the book for relation extraction,
    # split into blocks of 10,000 characters each to keep processing manageable.

    TEXT = []
    for x in range(0, 40000, 10000):
        TEXT.append(new_book[x:x + 10000])

    relation_file = open("h_entity_relation_list_" + book_file_name + '.txt',
                         'w+')
    relation_file.write(
        "This file contains entity relations (of nouns) extracted"
        " from the first 40,000 characters of the book " + book_file_name + "\n")
    relation_file.write(
        "\n--------------------------------------------------\n\n\n")

    # passing text to StanfordOpenIE to process
    with StanfordOpenIE() as client:
        for text in TEXT:
            # print('\nText: \n%s.' % text)
            for triple in client.annotate(text):
                # Below lines check if the relation is between two noun entities or between noun and verb.
                try:
                    if is_relation_good(triple):
                        relation_file.write("|- " + str(triple) + '\n')
                except KeyError:
                    pass
            graph_image = 'h_entity_relation_graph_' + book_file_name + '_' + str(
                random.randint(0, 100000)) + '_.png'
            client.generate_graphviz_graph(text, graph_image)
Example 17
class OpenIEBaselineModel:
    def __init__(self):
        from openie import StanfordOpenIE
        self.openie_client = StanfordOpenIE()

        self.spacy_nlp = spacy.load("en_core_web_sm")

    def predict(self, inst, supporting_facts):
        ent2doc = dict(inst["context"])
        reasoning_steps = []

        for sup_ent, sup_sent_id in supporting_facts:
            if sup_sent_id >= len(ent2doc[sup_ent]):  # guard against out-of-range sentence ids
                continue
            # (an earlier draft resolved pronouns to sup_ent with spaCy before annotating)
            sup_sent = ent2doc[sup_ent][sup_sent_id]

            for triplet in self.openie_client.annotate(sup_sent):
                if triplet["subject"] in ["it", "they", "she", "he"]:
                    triplet["subject"] = sup_ent

                reasoning_steps += [(
                    sup_ent,
                    sup_sent_id,
                    (triplet["subject"], triplet["relation"],
                     triplet["object"]),
                )]

        return reasoning_steps
Example 18
def get_triples(corpus=''):
	try:
		with StanfordOpenIE() as client:
			return list(dict.fromkeys([(t['subject'], t['relation'], t['object']) for t in client.annotate(corpus)]))
	except Exception:
		return []
Example 19
from flask import Flask, jsonify, request, send_file
from openie import StanfordOpenIE
import os

app = Flask(__name__)

client = StanfordOpenIE()

@app.route('/getGraph', methods=['POST'])
def get_image():
    payload = request.json
    note = payload['note']
    # note = 'Barack Obama was born in Hawaii. Richard Manning wrote this sentence.'
    graph_image = './graph.png'
    client.generate_graphviz_graph(note, graph_image)
    return send_file(graph_image, mimetype='image/png')

if __name__ == '__main__':
	app.run(debug=True, port=int(os.environ.get('PORT', 9090)), host='0.0.0.0')
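A client-side sketch for the /getGraph endpoint above, assuming the requests package (the note text is illustrative):

import requests

resp = requests.post('http://localhost:9090/getGraph',
                     json={'note': 'Barack Obama was born in Hawaii.'})
with open('graph.png', 'wb') as f:
    f.write(resp.content)  # the endpoint responds with the rendered PNG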
Example 20
    def svo(self):
        # lazily create the StanfordOpenIE client on first use
        if self._svo is None:
            self._svo = StanfordOpenIE()

        return self._svo
Example 21

# NOTE: this snippet arrived truncated; the imports and the opening of
# parse_args() below are a minimal reconstruction. --input_path and
# --output_dir are reconstructed from their uses further down, and
# maybe_make_directory is a hypothetical stand-in for the elided helper.
import argparse
import os

from openie import StanfordOpenIE


def maybe_make_directory(path):
    os.makedirs(path, exist_ok=True)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_path',
        type=str,
        required=True,
        help='Path to the input corpus file.')
    parser.add_argument(
        '--output_dir',
        type=str,
        required=True,
        help='Directory to write the extracted triples to.')
    parser.add_argument(
        '--interval',
        type=int,
        nargs=2,
        required=True,
        help='Interval of lines to read. (default: %(default)s)')

    args = parser.parse_args()

    maybe_make_directory(args.output_dir)

    return args


args = parse_args()

with StanfordOpenIE() as client:
    with open(args.input_path, 'r', encoding='utf8') as file:
        corpus = file.read().replace('\n', ' ').replace('\r', '')

    triples_corpus = client.annotate(corpus[args.interval[0]:args.interval[1]])
    print('Found %s triples in the corpus.' % len(triples_corpus))

    basename = os.path.basename(args.input_path)
    filename = os.path.splitext(basename)[0]
    with open(
            os.path.join(
                args.output_dir,
                f"{filename}_{args.interval[0]}_{args.interval[1]}.txt"),
            'w') as output_file:
        for triple in triples_corpus:
            output_file.write(str(triple) + "\n")
Example 22
def stanfordOIE(texts):
    with StanfordOpenIE() as client:
        result = []
        for text in texts:
            result.append(client.annotate(text))
        return result
Example 23
class ExtractInformation:
    IS_GPU = True
    SUBJECT = 'subject'
    SUBJECT_ENTITY = 'subject_entity'
    RELATION = 'relation'
    OBJECT = 'object'
    OBJECT_ENTITY = 'object_entity'

    ENTITY_NAME = 'name'
    ENTITY_TYPE = 'entity_type'

    ENTITY_SUBJECT_OTHER = 'subject_other'
    ENTITY_OBJECT_OTHER = 'object_other'

    def __init__(self, modelSpacy='en_core_web_lg', modelCoref='en'):
        print(os.path.dirname(spacy.__file__))
        if ExtractInformation.IS_GPU:
            spacy.prefer_gpu()

        self.modelSpacy = modelSpacy
        self.modelCoref = modelCoref
        self.stanfordClient = StanfordOpenIE()

        self.nlpCoref, self.nlpSpacy = self.initSpacy(modelSpacy, modelCoref)

    def initSpacy(self, modelSpacy, modelCoref):
        nlpSpacy = spacy.load(modelSpacy)

        nlpCoref = spacy.load('en')  # spaCy 2.x English model; the neuralcoref pipe is added below
        coref = neuralcoref.NeuralCoref(nlpCoref.vocab)
        nlpCoref.add_pipe(coref, name=modelCoref)

        return nlpCoref, nlpSpacy

    #Stage 1: replace Pronouns To Noun, example: My sister has a dog. She loves him. => Cluster: [My sister: [My sister, She], a dog: [a dog, him]]
    def replacePronounsToNoun(self, nlp, inputText):
        #todo unicode input Text
        outputText = inputText
        doc = nlp(inputText)
        if doc._.has_coref:
            outputText = doc._.coref_resolved
        return doc._.has_coref, outputText

    #Stage 2: Extract Entities
    def extractEntities(self, nlp, inputText):
        doc = nlp(inputText)
        entities = []
        for ent in doc.ents:
            entities.append({
                ExtractInformation.ENTITY_NAME: ent.text,
                ExtractInformation.ENTITY_TYPE: ent.label_
            })
        return entities

    #Stage 3: Extract Triple
    def extractTriple(self, inputText):
        hasCoref, inputText = self.replacePronounsToNoun(
            self.nlpCoref, inputText)

        #todo similaty relation
        tripleStanfords = self.extractTripleStanfordOpenIE(inputText)
        tripleSpacys = self.extractTripleSpacy(self.nlpSpacy, inputText)

        tripleTemps = list(tripleStanfords)  # copy, so removals below don't mutate the list being iterated
        for tripleStanford in tripleStanfords:
            subject1 = tripleStanford.get(ExtractInformation.SUBJECT)
            relation1 = tripleStanford.get(ExtractInformation.RELATION)
            object1 = tripleStanford.get(ExtractInformation.OBJECT)
            for tripleSpacy in tripleSpacys:
                subject2 = tripleSpacy.get(ExtractInformation.SUBJECT)
                relation2 = tripleSpacy.get(ExtractInformation.RELATION)
                object2 = tripleSpacy.get(ExtractInformation.OBJECT)

                if subject1 == subject2:
                    if (object1 == object2) or (object1 in object2):
                        text1 = self.nlpSpacy(relation1)
                        text2 = self.nlpSpacy(relation2)
                        if text1.similarity(text2) > 0.6:
                            tripleTemps.remove(tripleStanford)
                            break

        triples = tripleTemps + tripleSpacys

        for triple in triples:
            subjectEnts = self.nlpSpacy(triple.get(ExtractInformation.SUBJECT))
            triple[ExtractInformation.SUBJECT_ENTITY] = [
                (e.text, e.start_char, e.end_char, e.label_)
                for e in subjectEnts.ents
            ]

            objectEnts = self.nlpSpacy(triple.get(ExtractInformation.OBJECT))
            triple[ExtractInformation.OBJECT_ENTITY] = [
                (e.text, e.start_char, e.end_char, e.label_)
                for e in objectEnts.ents
            ]
        return triples

    def extractTripleStanfordOpenIE(self, inputText):
        triples = []
        try:
            triples = self.stanfordClient.annotate(inputText)
        except Exception as exception:
            print("--- extract Triple Stanford OpenIE Error " + str(exception))
        return triples

    def extractTripleSpacy(self, nlp, inputText):
        docSeparate = nlp(inputText)
        sentences = [sent.string.strip() for sent in docSeparate.sents]
        triples = []

        for sentence in sentences:
            doc = nlp(sentence)
            spans = list(doc.ents) + list(doc.noun_chunks)
            for span in spans:
                span.merge()

            for ent in doc.ents:
                preps = [
                    prep for prep in ent.root.head.children
                    if prep.dep_ == "prep"
                ]
                for prep in preps:
                    for child in prep.children:
                        triples.append({
                            ExtractInformation.SUBJECT:
                            ent.text,
                            ExtractInformation.RELATION:
                            "{} {}".format(ent.root.head, prep),
                            ExtractInformation.OBJECT:
                            child.text
                        })
        return triples

    def trainAdditionalEntity(self,
                              train_data,
                              label,
                              nlp,
                              model=None,
                              n_iter=30):
        if ("ner" not in nlp.pipe_names):
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner)
        else:
            ner = nlp.get_pipe("ner")
        ner.add_label(label)

        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()

        # get names of other pipes to disable them during training
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
        ]

        # only train NER
        with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
            # show warnings for misaligned entity spans once
            warnings.filterwarnings("once",
                                    category=UserWarning,
                                    module='spacy')

            sizes = compounding(1.0, 4.0, 1.001)
            # batch up the examples using spaCy's minibatch
            for itn in range(n_iter):
                random.shuffle(train_data)
                batches = minibatch(train_data, size=sizes)
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               drop=0.35,
                               losses=losses)
                print("Losses", losses)

        return nlp

    def saveModel(self, output_dir, nlp, new_model_name):
        if output_dir is not None:
            output_dir = Path(output_dir)
            if not output_dir.exists():
                output_dir.mkdir()
            nlp.meta["name"] = new_model_name  # rename model
            nlp.to_disk(output_dir)
            print("Saved model to", output_dir)
Example 25
def extract_triples(text):
    with StanfordOpenIE() as client:
        # annotate already returns a list of triple dicts
        return client.annotate(text)