class StanfordCoreferenceResolver(CoreferenceResolver):
    def __init__(self,
                 start_server=True,
                 endpoint=CoreNLPClient.DEFAULT_ENDPOINT):
        # NOTE: the default serialized (protobuf) output is required here;
        # resolve_coreferences below accesses protobuf attributes
        # (annotations.corefChain, annotations.sentence, ...), which are not
        # available on the JSON dict output.
        self.__client = CoreNLPClient(start_server=start_server,
                                      endpoint=endpoint,
                                      annotators=[
                                          'tokenize', 'ssplit', 'pos', 'lemma',
                                          'ner', 'parse', 'coref'
                                      ])
        self.__client.start()

    def __del__(self):
        self.__client.stop()

    def resolve_coreferences(self, text, entities):
        annotations = self.__client.annotate(text)

        entity_mention_indices = []
        for chain in annotations.corefChain:
            mention_indices = []
            for mention in chain.mention:
                sentence = annotations.sentence[mention.sentenceIndex]
                token_start = sentence.token[mention.beginIndex]
                token_end = sentence.token[mention.endIndex - 1]
                char_start = token_start.beginChar
                char_end = token_end.endChar
                mention_indices.append((char_start, char_end))
            entity_mention_indices.append(mention_indices)

        entity_sets = [list() for _ in range(len(entity_mention_indices))]
        for entity in entities:
            is_coreferred = False
            for i, mention_indices in enumerate(entity_mention_indices):
                for start_index, end_index in mention_indices:
                    if entity.start_offset >= start_index and entity.end_offset <= end_index:
                        entity_sets[i].append(entity)
                        is_coreferred = True
            if not is_coreferred:
                entity_sets.append([entity])
        return entity_sets
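
# --- Usage sketch (added; not part of the original snippet) ---
# A minimal, hedged illustration of the resolver above. The Entity type is an
# assumption made for demonstration only: resolve_coreferences just reads the
# .start_offset / .end_offset character offsets of each entity.
from collections import namedtuple

Entity = namedtuple('Entity', ['text', 'start_offset', 'end_offset'])

if __name__ == '__main__':
    sample_text = "Alice met Bob. She greeted him."
    sample_entities = [Entity('Alice', 0, 5), Entity('Bob', 10, 13)]
    resolver = StanfordCoreferenceResolver(start_server=True)
    # Entities that fall inside the same coreference chain end up in the same sub-list.
    print(resolver.resolve_coreferences(sample_text, sample_entities))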
Example 2
class StanfordOpenIE:
    def __init__(self,
                 core_nlp_version: str = '2018-10-05',
                 threads: int = 5,
                 close_after_finish: bool = True):
        self.remote_url = 'http://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format(
            core_nlp_version)
        self.install_dir = Path(os.environ['STANFORD_HOME']).expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if not (self.install_dir / Path(
                'stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
            print('Downloading to %s.' % self.install_dir)
            output_filename = wget.download(self.remote_url,
                                            out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()

        os.environ['CORENLP_HOME'] = str(
            self.install_dir /
            'stanford-corenlp-full-{}'.format(core_nlp_version))
        from stanfordnlp.server import CoreNLPClient
        self.close_after_finish = close_after_finish
        self.client = CoreNLPClient(annotators=['openie'],
                                    memory='8G',
                                    threads=threads)

    def get_openie_with_boundary(self,
                                 annotation: Dict,
                                 remove_dup: bool = False) -> List[Triple]:
        triples: List[Triple] = []
        dup: Set[str] = set()  # span keys already emitted (for optional dedup)
        for sentence in annotation['sentences']:
            tokens = sentence['tokens']
            for triple in sentence['openie']:
                new_triple = {}
                for field in ['subject', 'relation', 'object']:
                    text = triple[field]
                    s, e = triple[field + 'Span']
                    s = tokens[s]['characterOffsetBegin']
                    e = tokens[e - 1]['characterOffsetEnd']
                    new_triple[field] = Span(text=text, start=s, end=e)
                key = '\t'.join([
                    '{}-{}'.format(new_triple[field].start,
                                   new_triple[field].end)
                    for field in ['subject', 'relation', 'object']
                ])
                if remove_dup and key in dup:
                    continue
                triples.append(Triple(**new_triple))
                dup.add(key)
        return triples

    def annotate(self,
                 text: str,
                 properties_key: str = None,
                 properties: dict = None,
                 simple_format: bool = True,
                 remove_dup: bool = False,
                 max_len: int = 15000):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) simple_format: whether to return the full format of CoreNLP or a simple dict.
        :return: Depending on simple_format: full or simpler format of triples <subject, relation, object>.
        """
        if len(text) >= max_len:
            return []
        # https://stanfordnlp.github.io/CoreNLP/openie.html
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['openie'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            return self.get_openie_with_boundary(core_nlp_output,
                                                 remove_dup=remove_dup)
        else:
            return core_nlp_output

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __del__(self):
        if self.close_after_finish:
            self.client.stop()
            del os.environ['CORENLP_HOME']
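
# --- Usage sketch (added; not part of the original snippet) ---
# Hedged example of the class above used as a context manager. It assumes the
# STANFORD_HOME environment variable points at a writable directory and that a
# local CoreNLP server can be started.
if __name__ == '__main__':
    with StanfordOpenIE(threads=2, close_after_finish=True) as openie_client:
        for triple in openie_client.annotate('Barack Obama was born in Hawaii.',
                                             remove_dup=True):
            print(triple)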
Example 3
class StanfordOpenIE:
    def __init__(self, core_nlp_version: str = '2018-10-05'):
        self.remote_url = 'https://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format(
            core_nlp_version)
        self.install_dir = Path('~/.stanfordnlp_resources/').expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if not (self.install_dir / Path(
                'stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
            print('Downloading from %s.' % self.remote_url)
            output_filename = wget.download(self.remote_url,
                                            out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()

        os.environ['CORENLP_HOME'] = str(
            self.install_dir /
            'stanford-corenlp-full-{}'.format(core_nlp_version))
        from stanfordnlp.server import CoreNLPClient
        self.client = CoreNLPClient(annotators=['openie'], memory='8G')

    def annotate(self,
                 text: str,
                 properties_key: str = None,
                 properties: dict = None,
                 simple_format: bool = True):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) simple_format: whether to return the full format of CoreNLP or a simple dict.
        :return: Depending on simple_format: full or simpler format of triples <subject, relation, object>.
        """
        # https://stanfordnlp.github.io/CoreNLP/openie.html
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['openie'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            triples = []
            for sentence in core_nlp_output['sentences']:
                for triple in sentence['openie']:
                    triples.append({
                        'subject': triple['subject'],
                        'relation': triple['relation'],
                        'object': triple['object']
                    })
            return triples
        else:
            return core_nlp_output

    def generate_graphviz_graph(self,
                                text: str,
                                png_filename: str = './out/graph.png'):
        """
       :param (str | unicode) text: raw text for the CoreNLPServer to parse
       :param (list | string) png_filename: list of annotators to use
       """
        entity_relations = self.annotate(text, simple_format=True)
        """digraph G {
        # a -> b [ label="a to b" ];
        # b -> c [ label="another label"];
        }"""
        graph = list()
        graph.append('digraph {')
        for er in entity_relations:
            graph.append('"{}" -> "{}" [ label="{}" ];'.format(
                er['subject'], er['object'], er['relation']))
        graph.append('}')

        output_dir = os.path.join('.', os.path.dirname(png_filename))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        out_dot = os.path.join(tempfile.gettempdir(), 'graph.dot')
        with open(out_dot, 'w') as output_file:
            output_file.writelines(graph)

        command = 'dot -Tpng {} -o {}'.format(out_dot, png_filename)
        dot_process = Popen(command, stdout=stderr, shell=True)
        dot_process.wait()
        assert not dot_process.returncode, 'ERROR: Call to dot exited with a non-zero code status.'

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __del__(self):
        self.client.stop()
        del os.environ['CORENLP_HOME']
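
# --- Usage sketch (added; not part of the original snippet) ---
# Hedged example for the simpler class above: print the extracted triples and
# render them as a PNG. The PNG step assumes the Graphviz `dot` binary is on
# the PATH.
if __name__ == '__main__':
    with StanfordOpenIE() as extractor:
        demo_text = 'Barack Obama was born in Hawaii.'
        for triple in extractor.annotate(demo_text, simple_format=True):
            print(triple)
        extractor.generate_graphviz_graph(demo_text,
                                          png_filename='./out/graph.png')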
Example 4
    os.environ["CORENLP_HOME"] = corenlp_path

    corenlpclient_UD1 = CoreNLPClient(
        properties={'ssplit.isOneSentence': True},
        annotators=[
            'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats'
        ],
        memory='2G',
        be_quiet=False,
        max_char_length=100000,
        output_format='conllu')
    _UD1_Auto = corenlpclient_UD1.annotate(text1)
    # annotators = ['tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats']
    # _UD1_Auto = _UD1_Auto['sentences'][1]['basicDependencies'] # extract only basic dependencies
    print(_UD1_Auto)
    corenlpclient_UD1.stop()

    print(
        convert_const2dep(
            LANG,
            dataset,
            filename='',
            readpath='/02_modelbuilding/02_output/input_temp.parser',
            writepath='/02_modelbuilding/02_output/output_temp.parser',
            format_='UD1',
            usage='experiments'))

    annotated = _parse_segmenttokenize_en(
        'What is the star of the moon? Where is the sea of the trees?',
        usage='production')
    print('Annotated', annotated)
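
    # --- Added sketch (not part of the original snippet) ---
    # Hedged helper showing how the CoNLL-U string produced by the client above
    # (output_format='conllu') could be reduced to (token, head, deprel)
    # triples; the column positions follow the CoNLL-U specification.
    def conllu_to_triples(conllu_text):
        triples = []
        for line in conllu_text.split('\n'):
            if not line or line.startswith('#'):
                continue
            cols = line.split('\t')
            triples.append((cols[1], cols[6], cols[7]))  # FORM, HEAD, DEPREL
        return triples

    print(conllu_to_triples(_UD1_Auto))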
Example 5
    def __populate_Parses(lang, parsejson, new_parsedict):
        """Populate new_parsedict with one list of ParsePDTB objects per DocID,
        attaching gold and automatic constituency and UD 1.0 dependency parses
        plus part-of-speech tags to every sentence.
        """
        # start CoreNLP servers for UD1
        from stanfordnlp.server import CoreNLPClient

        cwd = os.getcwd()
        version = 'stanford-corenlp-full-2018-10-05'
        corenlp_path = re.findall(r'\S*/marta-v2',
                                  cwd)[0] + '/04_utils/' + version
        os.environ["CORENLP_HOME"] = corenlp_path
        if lang == 'en':
            lang = {}  # i.e. CoreNLP defaults to English model
            corenlpclient_UD1 = CoreNLPClient(properties={
                'ssplit.isOneSentence': True,
                'tokenize.whitespace': True
            },
                                              annotators=[
                                                  'tokenize', 'ssplit', 'pos',
                                                  'parse', 'depparse',
                                                  'udfeats'
                                              ],
                                              memory='2G',
                                              be_quiet=True,
                                              max_char_length=100000,
                                              output_format='conllu')
            # parse annotator is necessary to obtain udfeats (for postags)

        if lang == 'fr':
            lang = 'french'
            corenlpclient_UD1 = CoreNLPClient(
                properties=lang,
                annotators=[
                    'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats'
                ],
                memory='2G',
                be_quiet=True,
                max_char_length=100000,
                output_format='conllu'
            )  # note that udfeats (for postags) currently works for english only https://stanfordnlp.github.io/CoreNLP/udfeats.html

        if lang == 'zh':
            lang = 'chinese'
            corenlpclient_UD1 = CoreNLPClient(properties=lang,
                                              annotators=[
                                                  'tokenize', 'ssplit', 'pos',
                                                  'parse', 'depparse',
                                                  'udfeats'
                                              ],
                                              memory='2G',
                                              be_quiet=True,
                                              max_char_length=100000,
                                              output_format='conllu')
            # note that udfeats (for postags) currently works for english only https://stanfordnlp.github.io/CoreNLP/udfeats.html

        # begin processing
        for DocID in parsejson:
            print('Now processing: ', dataset, DocID)
            sentence_offset = 0  # this is the 4th element in a TokenList

            # obtain the gold constituency parses for the document.
            ConstTrees = __obtain_ConstTrees_Gold(
                DocID, readpath='./03_data/{}/{}tbRoot/{}/', lang=LANG)

            for sentence in parsejson[DocID]['sentences']:
                # 1. create a ParsePDTB object
                __parsepdtb = ParsePDTB(
                    lang=LANG,
                    docid=DocID,
                    sentid=sentence_offset,
                    gold_consttree=ConstTrees[sentence_offset],
                    pdtb_version=PDTB_VERSION)

                # 2. add to .RawText and .Words
                __parsepdtb.RawText = " ".join(
                    [word[0] for word in sentence['words']])
                __parsepdtb.Words = sentence['words']

                # 3. add to ConstTree_Auto. generate parse if missing
                if sentence['parsetree'] == '(())\n':
                    _parse = a2_parsers._parse_rawtext2consttree(
                        LANG, __parsepdtb.RawText, tokenized=True)
                    __parsepdtb.ConstTree_Auto = _parse
                else:
                    __parsepdtb.ConstTree_Auto = sentence['parsetree']

                # 4. write to temp file, for converting to SD/UD1 in next steps
                with open('./02_modelbuilding/02_output/input_temp.parser',
                          'w+') as f:
                    f.write(__parsepdtb.ConstTree_Gold)

                # 5. convert constituency parse to gold UD 1.0 and add to DepTree_UD1_Gold
                a2_parsers.convert_const2dep(
                    LANG,
                    dataset,
                    filename='',
                    readpath='/02_modelbuilding/02_output/input_temp.parser',
                    writepath='/02_modelbuilding/02_output/output_temp.parser',
                    format_='UD1',
                    usage='experiments')

                with open('./02_modelbuilding/02_output/output_temp.parser',
                          'r') as f:
                    UD1_Gold_conllu = f.read()

                def __conllu2tuple(conllu_doc):
                    """helper function to convert CoNLL format into 3-tuple used by CoNLL 2016 organisers to store dependency parses
                    """
                    to_list = conllu_doc.split('\n')
                    tokenlist = [
                        i.split('\t')[1] + '-' + i.split('\t')[0]
                        for i in to_list if i != ''
                    ]  # convert  CoNLL line to <wordform>-<token num>
                    tokenlist.insert(0,
                                     'ROOT-0')  # add a root token to the start
                    deptree_gold = [
                        [
                            i.split('\t')[7], tokenlist[int(i.split('\t')[6])],
                            i.split('\t')[1] + '-' + i.split('\t')[0]
                        ] for i in to_list if i != ''
                    ]  # convert to CoNLL 2016 dependencies format
                    return deptree_gold

                __parsepdtb.DepTree_UD1_Gold = __conllu2tuple(UD1_Gold_conllu)

                # 6. automatically generate the UD 1.0 dependency parse (from raw text), place it into the same 3-tuple format as the CoNLL 2016 Shared Task, and add to DepTree_UD1_Auto
                UD1_Auto_conllu = corenlpclient_UD1.annotate(
                    __parsepdtb.RawText)
                __parsepdtb.DepTree_UD1_Auto = __conllu2tuple(UD1_Auto_conllu)

                # 7. add PTB-style and UD pos tags to .Words. Each entry below is a list of 2-tuples; each tuple is (<wordform>, <part of speech>)

                postags = {}
                postags['PTBGold'] = [
                    i for i in ParentedTree.fromstring(
                        __parsepdtb.ConstTree_Gold).pos() if i[-1] != '-NONE-'
                ]  # gold PTB parses contain traces, which misalign with the surface form; we drop them since parsers don't predict traces (Johannsen & Søgaard, 2013)
                postags['PTBAuto'] = ParentedTree.fromstring(
                    __parsepdtb.ConstTree_Auto).pos()
                postags['UDGold'] = [(i.split('\t')[1], i.split('\t')[3])
                                     for i in UD1_Gold_conllu.split('\n')
                                     if i != '']
                postags['UDAuto'] = [(i.split('\t')[1], i.split('\t')[3])
                                     for i in UD1_Auto_conllu.split('\n')
                                     if i != '']

                for postagset in ['PTBGold', 'PTBAuto', 'UDGold', 'UDAuto']:
                    try:
                        _tagset = postags[postagset]
                        assert len(_tagset) == len(__parsepdtb.Words)
                        for idx in range(len(__parsepdtb.Words)):
                            # add the part of speech as a new key in the dictionary for the token in .Words
                            __parsepdtb.Words[idx][1].update(
                                {'PartOfSpeech_' + postagset: _tagset[idx][1]})

                    except AssertionError as e:

                        e.args += (
                            postagset.upper() +
                            " is not of the same size as the .Words attribute for this sentence.",
                        )
                        print(e)
                        print("Continuing to attempt alignment of tokens.")
                        _words = [i[0] for i in __parsepdtb.Words]
                        _words_maxidx = len(_words) - 1

                        #'drop' the additional tokens in _tagset
                        _tagset = [i for i in _tagset if i[0] in _words]
                        _words_curridx = -1  # start with -1
                        for idx in range(len(_tagset)):
                            _words_curridx += 1
                            while __parsepdtb.Words[_words_curridx][
                                    0] != _tagset[idx][
                                        0] and _words_curridx < _words_maxidx:
                                __parsepdtb.Words[_words_curridx][1].update(
                                    {
                                        'PartOfSpeech_' + postagset:
                                        'ParserError'
                                    }
                                )  # place a marker identifying the missing pos tag as an error from parsing
                                _words_curridx += 1
                            __parsepdtb.Words[_words_curridx][1].update(
                                {'PartOfSpeech_' + postagset: _tagset[idx][1]})
                            continue
                        # raise
                sentence_offset += 1  # increase sentence offset before moving to handle next sentence

                try:
                    new_parsedict[DocID].append(__parsepdtb)
                except KeyError:
                    new_parsedict[DocID] = [__parsepdtb]

        # shut down the CoreNLP servers
        corenlpclient_UD1.stop()

import matplotlib.colors as colors
import matplotlib.cm as cmx
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import time

# map node indices to character names
dic = {x: listofchar[x] for x in range(len(listofchar))}

# nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is the equivalent call
G = nx.from_numpy_array(np.array(Graph))
H = nx.relabel_nodes(G, dic)
edgenum = len(H.edges)
values = range(edgenum)
jet = plt.get_cmap('jet')
cNorm = colors.Normalize(vmin=0, vmax=values[-1])
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
colorList = []

for i in range(edgenum):
    colorVal = scalarMap.to_rgba(values[i])
    colorList.append(colorVal)

nx.draw(H, edge_color=colorList, with_labels=True)
plt.show()
#nx.draw(H, pos, node_color='b', edgelist=edges, edge_color=weights, width=10.0, edge_cmap=plt.cm.Blues)
#nx.draw(H, with_labels=True)

# Shut down the background CoreNLP server
client.stop()

time.sleep(10)
!ps -o pid,cmd | grep java
Example 7
os.environ['CORENLP_HOME'] = './stanford-corenlp-full-2018-10-05'
properties = {
    'ner.model':
    './stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz,'
    './stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz,'
    './stanford-ner-2018-10-16/classifiers/english.conll.4class.distsim.crf.ser.gz'
}
client = CoreNLPClient(annotators=['tokenize', 'pos', 'lemma', 'ner'],
                       properties=properties,
                       memory='8g',
                       endpoint='http://localhost:9001')
doc = client.annotate(text)
for sent in doc.sentence:
    for m in sent.mentions:
        print(m.entityMentionText, '\t\t\t', m.entityType)
client.stop()  ## do not forget to stop the client

## nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordNERTagger

nltk.download('punkt')  # punkt tokenizer models, needed by word_tokenize below
st = StanfordNERTagger(
    'stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    'stanford-ner-2018-10-16/stanford-ner.jar',
    encoding='utf-8')

rt = 'this is a test, to see the result of nltk.'

tokenized_text = word_tokenize(rt)
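
# --- Added completion (not in the original snippet) ---
# The tagger defined above is never invoked in the original fragment; a natural
# final step is to tag the tokenized text and print the labelled tokens.
classified_text = st.tag(tokenized_text)
print(classified_text)
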
class CoreNlp(ComponentBase):
    def __init__(self, config, config_global, logger):
        super(CoreNlp, self).__init__(config, config_global, logger)

        self.cache = self._provide_cache("stanfordnlp_cache",
                                         human_readable=False)

        corenlp_home = config.get("corenlp_home", None)
        if corenlp_home:
            # resolve corenlp_home against the shell's working dir
            os.environ["CORENLP_HOME"] = str(Path.cwd() / Path(corenlp_home))

        self._kwargs = config.pop("corenlp_kwargs", {"annotators": "depparse"})
        self._client = None  # type: Optional[CoreNLPClient]

    def parse_sentence(self, sentence: str, properties: Optional[Dict] = None):
        """
        Run CoreNLP over a sentence.
        :param sentence: a single sentence
        :param properties: additional properties for CoreNLP
        :return: parsing result
        """
        # The same input sentence can result in different annotations depending on the CoreNLP properties specified.
        # We therefore use a cache identifier for the sentence which includes the annotation properties.
        sent_cache_identifier = get_dict_hash(
            {
                "sentence": sentence,
                "properties": properties
            }, shorten=False)

        if sent_cache_identifier not in self.cache:
            # Kludge ahead: We want to cache the parsed sentence provided by CoreNLP, but also want to work with it in
            # a convenient format. A convenient format is the default format (protobuf-based), but that's not
            # pickle-able for the cache. We therefore convert the protobuf-format back into a bytestring and cache that.
            # When reading from the cache, we reassemble the protobuf object.
            req_properties = {"outputFormat": "serialized"}
            if properties is not None:
                req_properties.update(properties)
            doc = self.client.annotate(sentence, properties=req_properties)
            stream = writeToDelimitedString(doc)
            buf = stream.getvalue()
            stream.close()
            self.cache[sent_cache_identifier] = buf
        else:
            buf = self.cache[sent_cache_identifier]
            doc = Document()
            parseFromDelimitedString(doc, buf)

        return doc

    @property
    def client(self):
        if self._client is None:
            self._client = CoreNLPClient(**self._kwargs)
            self._client.start()
        return self._client

    @overrides
    def clean_up(self):
        if self._client is not None:
            self._client.stop()
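
# --- Usage sketch (added; not part of the original class) ---
# Hedged illustration of driving the component above once the surrounding
# framework (ComponentBase, config handling, caching) has constructed it;
# `component` is assumed to be a fully configured CoreNlp instance.
def print_basic_dependencies(component: "CoreNlp", sentence: str) -> None:
    """Parse one sentence and print its basic dependency edges."""
    doc = component.parse_sentence(sentence)
    parsed = doc.sentence[0]  # protobuf Sentence message
    for edge in parsed.basicDependencies.edge:
        # edge.source / edge.target are 1-based token indices within the sentence
        print(edge.source, edge.dep, edge.target)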