Code example #1
class Tokenizer:
    def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(
            os.environ['HOME'])
        self.client = CoreNLPClient(annotators=['ssplit'])
        self.client.ensure_alive()
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer \
            = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer

    def tokenize(self, doc: str) -> List[List[Token]]:
        corenlp_annotation = self.client.annotate(doc)
        sentences = []
        for sentence in corenlp_annotation.sentence:
            text = doc[sentence.characterOffsetBegin:sentence.characterOffsetEnd]
            if self.do_lower_case:
                text = text.lower()
            offset = sentence.characterOffsetBegin
            bert_tokens = self.basic_tokenizer.tokenize(text)
            begin = 0
            tokens = []
            for bert_token in bert_tokens:
                word = bert_token
                begin = text.index(word, begin)
                end = begin + len(word)
                tokens.append(Token(word, begin + offset, end + offset))
                begin = end
            if len(tokens) > 0:
                sentences.append(tokens)
        return sentences
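A minimal usage sketch for this tokenizer. The `Token` container and `config` module come from the surrounding project and are not shown above; the stand-ins below and the model name are assumptions for illustration only. The class additionally expects `os`, `List`, `CoreNLPClient` and `BertTokenizer` to be imported in its module, and a CoreNLP 2018-10-05 distribution unpacked under `$HOME`.

from collections import namedtuple

# Hypothetical stand-ins for the project's Token type and config module.
Token = namedtuple('Token', ['word', 'begin', 'end'])

class config:
    bert_model = 'bert-base-cased'  # assumed value

tokenizer = Tokenizer()
for sentence in tokenizer.tokenize('Barack Obama was born in Hawaii. He visited Chicago.'):
    print([(t.word, t.begin, t.end) for t in sentence])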
Code example #2
def get_corenlp_client(corenlp_path, corenlp_port):
    os.environ["CORENLP_HOME"] = corenlp_path

    assert not is_port_occupied(corenlp_port), \
        "Port {} is occupied by another process".format(corenlp_port)
    corenlp_client = CoreNLPClient(
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'depparse'],
        timeout=60000,
        memory='5G',
        endpoint="http://localhost:%d" % corenlp_port,
        start_server=True,
        be_quiet=False)
    corenlp_client.annotate(
        "hello world",
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'depparse'],
        output_format="json")
    return corenlp_client
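The helper `is_port_occupied` is referenced above but not shown. A plausible sketch of such a check (an assumption, not the original project's implementation) simply tries to connect to the port on localhost:

import socket

def is_port_occupied(port, host='127.0.0.1'):
    """Return True if something is already listening on host:port."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1.0)
        return sock.connect_ex((host, port)) == 0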
Code example #3
class StanfordCoreferenceResolver(CoreferenceResolver):
    def __init__(self,
                 start_server=True,
                 endpoint=CoreNLPClient.DEFAULT_ENDPOINT):
        self.__client = CoreNLPClient(start_server=start_server,
                                      endpoint=endpoint,
                                      annotators=[
                                          'tokenize', 'ssplit', 'pos', 'lemma',
                                          'ner', 'parse', 'coref'
                                      ],
                                      output_format='json')
        self.__client.start()

    def __del__(self):
        self.__client.stop()

    def resolve_coreferences(self, text, entities):
        annotations = self.__client.annotate(text)

        entity_mention_indices = []
        for chain in annotations.corefChain:
            mention_indices = []
            for mention in chain.mention:
                sentence = annotations.sentence[mention.sentenceIndex]
                token_start = sentence.token[mention.beginIndex]
                token_end = sentence.token[mention.endIndex - 1]
                char_start = token_start.beginChar
                char_end = token_end.endChar
                mention_indices.append((char_start, char_end))
            entity_mention_indices.append(mention_indices)

        entity_sets = [list() for _ in range(len(entity_mention_indices))]
        for entity in entities:
            is_coreferred = False
            for i, mention_indices in enumerate(entity_mention_indices):
                for start_index, end_index in mention_indices:
                    if entity.start_offset >= start_index and entity.end_offset <= end_index:
                        entity_sets[i].append(entity)
                        is_coreferred = True
            if not is_coreferred:
                entity_sets.append([entity])
        return entity_sets
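A sketch of how this resolver might be called, assuming the project's `CoreferenceResolver` base class is importable. The namedtuple below is a hypothetical stand-in that only provides the `start_offset`/`end_offset` character positions used by `resolve_coreferences`:

from collections import namedtuple

Entity = namedtuple('Entity', ['text', 'start_offset', 'end_offset'])

resolver = StanfordCoreferenceResolver()
text = 'Barack Obama was born in Hawaii. He was elected president in 2008.'
entities = [Entity('Barack Obama', 0, 12), Entity('He', 33, 35)]
entity_sets = resolver.resolve_coreferences(text, entities)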
Code example #4
File: stanford_oie.py  Project: neulab/cord19
class StanfordOpenIE:
    def __init__(self,
                 core_nlp_version: str = '2018-10-05',
                 threads: int = 5,
                 close_after_finish: bool = True):
        self.remote_url = 'http://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format(
            core_nlp_version)
        self.install_dir = Path(os.environ['STANFORD_HOME']).expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if not (self.install_dir / Path(
                'stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
            print('Downloading to %s.' % self.install_dir)
            output_filename = wget.download(self.remote_url,
                                            out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()

        os.environ['CORENLP_HOME'] = str(self.install_dir /
                                         'stanford-corenlp-full-2018-10-05')
        from stanfordnlp.server import CoreNLPClient
        self.close_after_finish = close_after_finish
        self.client = CoreNLPClient(annotators=['openie'],
                                    memory='8G',
                                    threads=threads)

    def get_openie_with_boundary(self,
                                 annotation: Dict,
                                 remove_dup: bool = False) -> List[Triple]:
        triples: List[Triple] = []
        dup: Set[str] = set()
        for sentence in annotation['sentences']:
            tokens = sentence['tokens']
            for triple in sentence['openie']:
                new_triple = {}
                for field in ['subject', 'relation', 'object']:
                    text = triple[field]
                    s, e = triple[field + 'Span']
                    s = tokens[s]['characterOffsetBegin']
                    e = tokens[e - 1]['characterOffsetEnd']
                    new_triple[field] = Span(text=text, start=s, end=e)
                key = '\t'.join([
                    '{}-{}'.format(new_triple[field].start,
                                   new_triple[field].end)
                    for field in ['subject', 'relation', 'object']
                ])
                if remove_dup and key in dup:
                    continue
                triples.append(Triple(**new_triple))
                dup.add(key)
        return triples

    def annotate(self,
                 text: str,
                 properties_key: str = None,
                 properties: dict = None,
                 simple_format: bool = True,
                 remove_dup: bool = False,
                 max_len: int = 15000):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) simple_format: whether to return the full format of CoreNLP or a simple dict.
        :return: Depending on simple_format: full or simpler format of triples <subject, relation, object>.
        """
        if len(text) >= max_len:
            return []
        # https://stanfordnlp.github.io/CoreNLP/openie.html
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['openie'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            return self.get_openie_with_boundary(core_nlp_output,
                                                 remove_dup=remove_dup)
        else:
            return core_nlp_output

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __del__(self):
        if self.close_after_finish:
            self.client.stop()
            del os.environ['CORENLP_HOME']
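Because the class defines `__enter__`/`__exit__`, it can be driven as a context manager. A short sketch, assuming `STANFORD_HOME` points at a writable directory and that `Span` and `Triple` are the project's own containers with `subject`/`relation`/`object` fields:

from pathlib import Path

os.environ.setdefault('STANFORD_HOME', str(Path('~/.stanfordnlp_resources').expanduser()))

with StanfordOpenIE(threads=2) as oie:
    triples = oie.annotate('Barack Obama was born in Hawaii.', remove_dup=True)
    for triple in triples:
        print(triple.subject.text, triple.relation.text, triple.object.text)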
Code example #5
class RelationExtractor:
    def __init__(self, corenlp_home, endpoint='http://localhost:9000', timeout=15000, memory='2G'):
        print('Set up Stanford CoreNLP Server.')

        if os.path.exists(corenlp_home):
            os.environ['CORENLP_HOME'] = corenlp_home
        else:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), corenlp_home)

        self.client = CoreNLPClient(annotators=['depparse'], endpoint=endpoint, timeout=timeout, memory=memory)
        self.client.annotate('Prepare.')

    def extract(self, text):
        """
        extract relations from text.

        params:
            text: string

        return:
            sentences: [sentence]

            --format--
            sentence: {
                'tokens': [word],
                'relations': [(subject word index, (predicate word index), object word index)]
            }
            word: string
            word index: int
        """
        ann = self.client.annotate(text)

        sentences = []
        for sentence in ann.sentence:
            # extract relations from sentence
            relations = self._extract_by_subj_obj(sentence) + self._extract_by_nmod(sentence)
            # deal with "of" dependency such as "a group of people", replace "group" with "people" in relations
            relations = self._replace_by_of(sentence, relations)

            sentences.append({
                'tokens': [token.word for token in sentence.token],
                'relations': relations
            })
        return sentences

    def _extract_by_subj_obj(self, sentence):
        """
        extract action/verb relations by "nsubj"/"acl" and "dobj"/"acl:relcl" dependency.

        return:
            relations: [(subject word index, (predicate word index), object word index)]
        """
        edges = sentence.enhancedPlusPlusDependencies.edge

        pred2insts = {}
        for edge in edges:
            subj, pred, obj = None, None, None
            if edge.dep == 'nsubj':
                subj, pred = edge.target, edge.source
            elif edge.dep in ['acl', 'acl:relcl']:
                subj, pred = edge.source, edge.target
            elif edge.dep == 'dobj':
                pred, obj = edge.source, edge.target
            else:
                continue

            if pred not in pred2insts:
                pred2insts[pred] = {
                    'subjs': [],
                    'objs': []
                }

            if subj is None:
                pred2insts[pred]['objs'].append(obj)
            else:
                pred2insts[pred]['subjs'].append(subj)

        relations = []
        for pred in pred2insts:
            insts = pred2insts[pred]
            for subj in insts['subjs']:
                for obj in insts['objs']:
                    relations.append((subj - 1, tuple([pred - 1]), obj - 1))

        return relations

    def _extract_by_nmod(self, sentence):
        """
        extract preposition/preposition phrase and spatial relations by "nmod" dependency.

        return:
            relations: [(subject word index, (predicate word index), object word index)]
        """
        edges = sentence.enhancedPlusPlusDependencies.edge

        # case: to find the preposition index in tokens
        # mwe: to concatenate multi-word prepositions, such as "in front of"
        # acl/nsubj: to concatenate verb and preposition phrase, such as "park in front of"
        # conj:and: to expand relations over parallel subjects/objects
        dep_idx = {dep: {} for dep in ['case', 'mwe', 'acl', 'nsubj', 'conj:and']}
        for edge in edges:
            if edge.dep not in dep_idx:
                continue

            source, target = edge.source, edge.target
            if edge.dep == 'acl':
                source, target = target, source

            if source not in dep_idx[edge.dep]:
                dep_idx[edge.dep][source] = []
            dep_idx[edge.dep][source].append(target)

        # exclude relations mediated by "of", "for", etc.
        exclude = ['of', 'for']

        relations = []
        for edge in edges:
            if not edge.dep.startswith('nmod:'):
                continue

            nmod = edge.dep[5:]
            if nmod in exclude:
                continue

            # target should be noun
            if not sentence.token[edge.target - 1].pos.startswith('NN'):
                continue

            # find the preposition indices
            if edge.target not in dep_idx['case']:
                continue
            pred = None
            for case in dep_idx['case'][edge.target]:
                word_idc = [case]
                word = sentence.token[case - 1].word
                if case in dep_idx['mwe']:
                    word_idc.extend(dep_idx['mwe'][case])
                    word = '_'.join([sentence.token[idx - 1].word for idx in word_idc])
                if word == nmod:
                    pred = word_idc
                    break
            if pred is None:
                continue

            if sentence.token[edge.source - 1].pos.startswith('NN'):
                # add preposition relations
                relations.append((edge.source, tuple(pred), edge.target))
            else:
                # add preposition phrase relations
                for dep in ['acl', 'nsubj']:
                    if edge.source in dep_idx[dep]:
                        if edge.source + 1 == pred[0]:
                            # concatenate verb and preposition phrase
                            pred.insert(0, edge.source)
                        relations.extend([(source, tuple(pred), edge.target) for source in dep_idx[dep][edge.source]])
                        break

        # expand relations
        expanded_relations = set()
        for relation in relations:
            subj, pred, obj = relation

            equal_insts = {
                'subjs': [subj],
                'objs': [obj]
            }
            for typ in equal_insts:
                inst = equal_insts[typ][0]
                if inst in dep_idx['conj:and']:
                    equal_insts[typ].extend(dep_idx['conj:and'][inst])

            pred = tuple([idx - 1 for idx in pred])
            for subj in equal_insts['subjs']:
                for obj in equal_insts['objs']:
                    expanded_relations.add((subj - 1, pred, obj - 1))

        return list(expanded_relations)
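A usage sketch for this extractor; the CoreNLP path below is a placeholder and must point at an unpacked stanford-corenlp-full-2018-10-05 distribution:

extractor = RelationExtractor('/opt/stanford-corenlp-full-2018-10-05')
for sentence in extractor.extract('A man rides a horse in front of the old building.'):
    tokens = sentence['tokens']
    for subj, pred, obj in sentence['relations']:
        # pred is a tuple of word indices (e.g. a multi-word preposition)
        print(tokens[subj], ' '.join(tokens[i] for i in pred), tokens[obj])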
Code example #6
class StanfordOpenIE:
    def __init__(self, core_nlp_version: str = '2018-10-05'):
        self.remote_url = 'https://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format(
            core_nlp_version)
        self.install_dir = Path('~/.stanfordnlp_resources/').expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if not (self.install_dir / Path(
                'stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
            print('Downloading from %s.' % self.remote_url)
            output_filename = wget.download(self.remote_url,
                                            out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()

        os.environ['CORENLP_HOME'] = str(self.install_dir /
                                         'stanford-corenlp-full-2018-10-05')
        from stanfordnlp.server import CoreNLPClient
        self.client = CoreNLPClient(annotators=['openie'], memory='8G')

    def annotate(self,
                 text: str,
                 properties_key: str = None,
                 properties: dict = None,
                 simple_format: bool = True):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) simple_format: whether to return the full format of CoreNLP or a simple dict.
        :return: Depending on simple_format: full or simpler format of triples <subject, relation, object>.
        """
        # https://stanfordnlp.github.io/CoreNLP/openie.html
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=['openie'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            triples = []
            for sentence in core_nlp_output['sentences']:
                for triple in sentence['openie']:
                    triples.append({
                        'subject': triple['subject'],
                        'relation': triple['relation'],
                        'object': triple['object']
                    })
            return triples
        else:
            return core_nlp_output

    def generate_graphviz_graph(self,
                                text: str,
                                png_filename: str = './out/graph.png'):
        """
       :param (str | unicode) text: raw text for the CoreNLPServer to parse
       :param (list | string) png_filename: list of annotators to use
       """
        entity_relations = self.annotate(text, simple_format=True)
        """digraph G {
        # a -> b [ label="a to b" ];
        # b -> c [ label="another label"];
        }"""
        graph = list()
        graph.append('digraph {')
        for er in entity_relations:
            graph.append('"{}" -> "{}" [ label="{}" ];'.format(
                er['subject'], er['object'], er['relation']))
        graph.append('}')

        output_dir = os.path.join('.', os.path.dirname(png_filename))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        out_dot = os.path.join(tempfile.gettempdir(), 'graph.dot')
        with open(out_dot, 'w') as output_file:
            output_file.writelines(graph)

        command = 'dot -Tpng {} -o {}'.format(out_dot, png_filename)
        dot_process = Popen(command, stdout=stderr, shell=True)
        dot_process.wait()
        assert not dot_process.returncode, 'ERROR: Call to dot exited with a non-zero code status.'

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __del__(self):
        self.client.stop()
        del os.environ['CORENLP_HOME']
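The same context-manager pattern applies here; a brief sketch (the PNG step additionally requires graphviz's dot binary on the PATH):

with StanfordOpenIE() as oie:
    text = 'Barack Obama was born in Hawaii. He was elected president in 2008.'
    for triple in oie.annotate(text):
        print(triple['subject'], triple['relation'], triple['object'])
    oie.generate_graphviz_graph(text, png_filename='./out/graph.png')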
Code example #7
File: evaluator.py  Project: kmzjy110/squall
class Evaluator:
    def __init__(self, tagged_dataset_path, database_path, corenlp_path):
        self.target_values_map = {}
        for filename in os.listdir(tagged_dataset_path):
            filename = os.path.join(tagged_dataset_path, filename)
            print('Reading dataset from', filename, file=sys.stderr)
            with open(filename, 'r', encoding='utf8') as fin:
                header = fin.readline().rstrip('\n').split('\t')
                for line in fin:
                    stuff = dict(zip(header, line.rstrip('\n').split('\t')))
                    ex_id = stuff['id']
                    original_strings = tsv_unescape_list(stuff['targetValue'])
                    canon_strings = tsv_unescape_list(stuff['targetCanon'])
                    self.target_values_map[ex_id] = to_value_list(
                        original_strings, canon_strings)

        os.environ['CORENLP_HOME'] = corenlp_path
        self.client = CoreNLPClient(annotators="ner".split())
        self.db_path = database_path

    def evaluate(self, predictions):
        num_examples, num_correct = 0, 0
        num_fail = 0

        for pred in predictions:
            table_id = pred['table_id']
            ####   find the exact db file
            db_file = self.db_path + table_id + '.db'
            table_file = self.db_path + "../json/" + table_id + ".json"

            with open(table_file, "r") as f:
                table_json = json.load(f)

            connection = sqlite3.connect(db_file)
            c = connection.cursor()
            results = pred['result']
            for result in results:
                ex_id = result['id']
                sql = result['sql']

                try:
                    sql = requests.get("http://localhost:3000/",
                                       json={
                                           "sql": sql,
                                           "is_list": table_json["is_list"]
                                       }).json()

                    c.execute(sql)

                    answer_list = list()
                    nlp_list = list()
                    for result, in c:
                        result = str(result)
                        ann = self.client.annotate(result)
                        if len(ann.mentions) == 0:
                            nlp_list.append(result)
                        elif len(ann.mentions) > 1:
                            #print('corenlp annotation wrong!', ann.mentions)
                            nlp_list.append(result)
                        else:
                            nlp_list.append(ann.mentions[0].normalizedNER)

                        answer_list.append(result)

                    predicted_values = to_value_list(answer_list)
                except Exception as e:
                    # print("Evaluation failure", e)
                    num_fail += 1
                    predicted_values = list()

                if ex_id not in self.target_values_map:
                    print('WARNING: Example ID "%s" not found' % ex_id)
                else:
                    target_values = self.target_values_map[ex_id]

                    correct = check_denotation(target_values, predicted_values)
                    num_examples += 1
                    if correct:
                        num_correct += 1

        # acc = (num_correct + 1e-9) / (num_examples + 1e-9)
        #print("Failed:", num_fail, "out of", num_examples)

        return num_correct
Code example #8
    print(text1)
    cwd = os.getcwd()
    version = 'stanford-corenlp-full-2018-10-05'
    corenlp_path = re.findall(r'\S*/marta-v2', cwd)[0] + '/04_utils/' + version
    os.environ["CORENLP_HOME"] = corenlp_path

    corenlpclient_UD1 = CoreNLPClient(
        properties={'ssplit.isOneSentence': True},
        annotators=[
            'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats'
        ],
        memory='2G',
        be_quiet=False,
        max_char_length=100000,
        output_format='conllu')
    _UD1_Auto = corenlpclient_UD1.annotate(text1)
    # annotators = ['tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats']
    # _UD1_Auto = _UD1_Auto['sentences'][1]['basicDependencies'] # extract only basic dependencies
    print(_UD1_Auto)
    corenlpclient_UD1.stop()

    print(
        convert_const2dep(
            LANG,
            dataset,
            filename='',
            readpath='/02_modelbuilding/02_output/input_temp.parser',
            writepath='/02_modelbuilding/02_output/output_temp.parser',
            format_='UD1',
            usage='experiments'))
Code example #9
File: preprocess.py  Project: AlainBBenoist/NLP
class Preprocessor(object):
    """
    The preprocessor wraps a corpus object (usually an `HTMLCorpusReader`)
    and manages the stateful tokenization and part-of-speech tagging of the
    corpus into a directory stored in a format that can be read by the
    `HTMLPickledCorpusReader`. This format is more compact and necessarily
    removes a variety of fields from the document that are stored in the JSON
    representation dumped from the Mongo database. This format, however, is
    more easily accessed for common parsing activity.
    """
    def __init__(self, corpus, target=None, **kwargs):
        """
        The corpus is the `HTMLCorpusReader` to preprocess and pickle.
        The target is the directory on disk to output the pickled corpus to.
        """
        self.corpus = corpus
        self.target = target
        self.tagger = pos_tagger('spacy')

        # Modification for dibutade
        if model == 'stanford':
            os.environ['CORENLP_HOME'] = \
                'C:/Users/alain/OneDrive/Ateliers Dibutade/NLP/stanford-corenlp-full-2018-10-05'
            self.pos_tagger = CoreNLPClient(properties='french',
                                            annotators=[
                                                'pos',
                                            ],
                                            timeout=30000,
                                            memory='1G')
        elif model == 'spacy':
            self.nlp = spacy.load('fr_core_news_sm')

    def fileids(self, fileids=None, categories=None):
        """
        Helper function to access the fileids of the corpus
        """
        fileids = self.corpus.resolve(fileids, categories)
        if fileids:
            return fileids
        return self.corpus.fileids()

    def abspath(self, fileid):
        """
        Returns the absolute path to the target fileid from the corpus fileid.
        """
        # Find the directory, relative from the corpus root.
        parent = os.path.relpath(os.path.dirname(self.corpus.abspath(fileid)),
                                 self.corpus.root)

        # Compute the name parts to reconstruct
        basename = os.path.basename(fileid)
        name, ext = os.path.splitext(basename)

        # Create the pickle file extension
        basename = name + '.pickle'

        # Return the path to the file relative to the target.
        return os.path.normpath(os.path.join(self.target, parent, basename))

    def tokenize(self, fileid):
        """
        Segments, tokenizes, and tags a document in the corpus. Returns a
        generator of paragraphs, which are lists of sentences, which in turn
        are lists of part of speech tagged words.
        """
        for paragraph in self.corpus.paras(fileids=fileid):
            if model == 'original':
                for sent in sent_tokenize(paragraph):
                    print(sent)
                    print(wordpunct_tokenize(sent))
                    print(pos_tag(wordpunct_tokenize(sent)))
                key = input('Continue')
                yield [
                    pos_tag(wordpunct_tokenize(sent))
                    for sent in sent_tokenize(paragraph)
                ]
            elif model == 'stanford':
                # Modification for the CORE NLP package
                ann = self.pos_tagger.annotate(paragraph)
                for sentence in ann.sentence:
                    #print(sentence)
                    #for token in sentence.token :
                    #    print((token.word, token.pos))
                    yield [[(token.word, token.pos)
                            for token in sentence.token]]
            elif model == 'spacy':
                yield [[(token.text, token.pos_) for token in self.nlp(sent)]
                       for sent in sent_tokenize(paragraph)]
            else:  # Default - still to test
                for sent in sent_tokenize(paragraph):
                    yield self.tagger.pos_tag(sent)

    def process(self, fileid):
        """
        For a single file does the following preprocessing work:
            1. Checks the location on disk to make sure no errors occur.
            2. Gets all paragraphs for the given text.
            3. Segments the paragraphs with the sent_tokenizer
            4. Tokenizes the sentences with the wordpunct_tokenizer
            5. Tags the sentences using the default pos_tagger
            6. Writes the document as a pickle to the target location.
        This method is called multiple times from the transform runner.
        """

        # Compute the outpath to write the file to.
        target = self.abspath(fileid)
        parent = os.path.dirname(target)

        # Make sure the directory exists
        if not os.path.exists(parent):
            os.makedirs(parent)

        # Make sure that the parent is a directory and not a file
        if not os.path.isdir(parent):
            raise ValueError(
                "Please supply a directory to write preprocessed data to.")

        # Create a data structure for the pickle
        document = list(self.tokenize(fileid))

        # Open and serialize the pickle to disk
        with open(target, 'wb') as f:
            pickle.dump(document, f, pickle.HIGHEST_PROTOCOL)

        # Clean up the document
        del document

        # Return the target fileid
        return target

    def transform(self, fileids=None, categories=None):
        """
        Transform the wrapped corpus, writing out the segmented, tokenized,
        and part of speech tagged corpus as a pickle to the target directory.
        This method will also directly copy files that are in the corpus.root
        directory that are not matched by the corpus.fileids().
        """

        # Make the target directory if it doesn't already exist
        if not os.path.exists(self.target):
            os.makedirs(self.target)

        # Resolve the fileids to start processing and return the list of
        # target file ids to pass to downstream transformers.
        return [
            self.process(fileid)
            for fileid in self.fileids(fileids, categories)
        ]
Code example #10
File: a_preprocessor.py  Project: hankelvin/marta-v2
    def __populate_Parses(lang, parsejson, new_parsedict):
        """
        """
        # start CoreNLP servers for UD1
        from stanfordnlp.server import CoreNLPClient

        cwd = os.getcwd()
        version = 'stanford-corenlp-full-2018-10-05'
        corenlp_path = re.findall(r'\S*/marta-v2',
                                  cwd)[0] + '/04_utils/' + version
        os.environ["CORENLP_HOME"] = corenlp_path
        if lang == 'en':
            lang = {}  # i.e. CoreNLP defaults to English model
            corenlpclient_UD1 = CoreNLPClient(properties={
                'ssplit.isOneSentence': True,
                'tokenize.whitespace': True
            },
                                              annotators=[
                                                  'tokenize', 'ssplit', 'pos',
                                                  'parse', 'depparse',
                                                  'udfeats'
                                              ],
                                              memory='2G',
                                              be_quiet=True,
                                              max_char_length=100000,
                                              output_format='conllu')
            # parse annotator is necessary to obtain udfeats (for postags)

        if lang == 'fr':
            lang = 'french'
            corenlpclient_UD1 = CoreNLPClient(
                properties=lang,
                annotators=[
                    'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats'
                ],
                memory='2G',
                be_quiet=True,
                max_char_length=100000,
                output_format='conllu'
            )  # note that udfeats (for postags) currently works for english only https://stanfordnlp.github.io/CoreNLP/udfeats.html

        if lang == 'zh':
            lang = 'chinese'
            corenlpclient_UD1 = CoreNLPClient(properties=lang,
                                              annotators=[
                                                  'tokenize', 'ssplit', 'pos',
                                                  'parse', 'depparse',
                                                  'udfeats'
                                              ],
                                              memory='2G',
                                              be_quiet=True,
                                              max_char_length=100000,
                                              output_format='conllu')
            # note that udfeats (for postags) currently works for english only https://stanfordnlp.github.io/CoreNLP/udfeats.html

        # begin processing
        for DocID in parsejson:
            print('Now processing: ', dataset, DocID)
            sentence_offset = 0  # this is the 4th element in a TokenList

            # obtain the gold constituency parses for the document.
            ConstTrees = __obtain_ConstTrees_Gold(
                DocID, readpath='./03_data/{}/{}tbRoot/{}/', lang=LANG)

            for sentence in parsejson[DocID]['sentences']:
                # 1. create a ParsePDTB object
                __parsepdtb = ParsePDTB(
                    lang=LANG,
                    docid=DocID,
                    sentid=sentence_offset,
                    gold_consttree=ConstTrees[sentence_offset],
                    pdtb_version=PDTB_VERSION)

                # 2. add to .RawText and .Words
                __parsepdtb.RawText = " ".join(
                    [word[0] for word in sentence['words']])
                __parsepdtb.Words = sentence['words']

                # 3. add to ConstTree_Auto. generate parse if missing
                if sentence['parsetree'] == '(())\n':
                    _parse = a2_parsers._parse_rawtext2consttree(
                        LANG, __parsepdtb.RawText, tokenized=True)
                    __parsepdtb.ConstTree_Auto = _parse
                else:
                    __parsepdtb.ConstTree_Auto = sentence['parsetree']

                # 3. write to temp file, for converting to SD/UD1 in next steps
                with open('./02_modelbuilding/02_output/input_temp.parser',
                          'w+') as f:
                    f.write(__parsepdtb.ConstTree_Gold)

                # 4. convert constituency parse to gold UD 1.0 and add to DepTree_UD1_Gold
                a2_parsers.convert_const2dep(
                    LANG,
                    dataset,
                    filename='',
                    readpath='/02_modelbuilding/02_output/input_temp.parser',
                    writepath='/02_modelbuilding/02_output/output_temp.parser',
                    format_='UD1',
                    usage='experiments')

                with open('./02_modelbuilding/02_output/output_temp.parser',
                          'r') as f:
                    UD1_Gold_conllu = f.read()

                def __conllu2tuple(conllu_doc):
                    """helper function to convert CoNLL format into 3-tuple used by CoNLL 2016 organisers to store dependency parses
                    """
                    to_list = conllu_doc.split('\n')
                    tokenlist = [
                        i.split('\t')[1] + '-' + i.split('\t')[0]
                        for i in to_list if i != ''
                    ]  # convert  CoNLL line to <wordform>-<token num>
                    tokenlist.insert(0,
                                     'ROOT-0')  # add a root token to the start
                    deptree_gold = [
                        [
                            i.split('\t')[7], tokenlist[int(i.split('\t')[6])],
                            i.split('\t')[1] + '-' + i.split('\t')[0]
                        ] for i in to_list if i != ''
                    ]  # convert to CoNLL 2016 dependencies format
                    return deptree_gold

                __parsepdtb.DepTree_UD1_Gold = __conllu2tuple(UD1_Gold_conllu)

                # 5. automatically generate UD 1.0 constituency parse (from raw text), place into same 3-tuple format as CoNLL 2016 Shared Task,and add to DepTree_UD1_Auto
                UD1_Auto_conllu = corenlpclient_UD1.annotate(
                    __parsepdtb.RawText)
                __parsepdtb.DepTree_UD1_Auto = __conllu2tuple(UD1_Auto_conllu)

                # 6. add PTB-style and UD pos tags to .Words. Each of the variable below contain a list comprising 2-tuples. each tuple is (<wordform>, <part of speech>)

                globals()['pos_PTBGold'] = [
                    i for i in ParentedTree.fromstring(
                        __parsepdtb.ConstTree_Gold).pos() if i[-1] != '-NONE-'
                ]  # gold PTB parses have traces and these cause misalignment with the surface form. We drop them since parsers don't predict traces (Johannsen & Søgaard, 2013)
                globals()['pos_PTBAuto'] = ParentedTree.fromstring(
                    __parsepdtb.ConstTree_Auto).pos()
                globals()['pos_UDGold'] = [(i.split('\t')[1], i.split('\t')[3])
                                           for i in UD1_Gold_conllu.split('\n')
                                           if i != '']
                globals()['pos_UDAuto'] = [(i.split('\t')[1], i.split('\t')[3])
                                           for i in UD1_Auto_conllu.split('\n')
                                           if i != '']

                for postagset in ['PTBGold', 'PTBAuto', 'UDGold', 'UDAuto']:
                    try:
                        _tagset = globals()['pos_' + postagset]
                        assert len(_tagset) == len(__parsepdtb.Words)
                        for idx in range(len(__parsepdtb.Words)):
                            # add the part of speech as a new key in the dictionary for the token in .Words
                            __parsepdtb.Words[idx][1].update(
                                {'PartOfSpeech_' + postagset: _tagset[idx][1]})

                    except AssertionError as e:

                        e.args += (
                            postagset.upper() +
                            " is not of the same size as the .Words attribute for this sentence.",
                        )
                        print(e)
                        print("Continuing to attempt alignment of tokens.")
                        _words = [i[0] for i in __parsepdtb.Words]
                        _words_maxidx = len(_words) - 1

                        #'drop' the additional tokens in _tagset
                        _tagset = [i for i in _tagset if i[0] in _words]
                        _words_curridx = -1  # start with -1
                        for idx in range(len(_tagset)):
                            _words_curridx += 1
                            while __parsepdtb.Words[_words_curridx][
                                    0] != _tagset[idx][
                                        0] and _words_curridx < _words_maxidx:
                                __parsepdtb.Words[_words_curridx][1].update(
                                    {
                                        'PartOfSpeech_' + postagset:
                                        'ParserError'
                                    }
                                )  # place a marker identifying the missing pos tag as an error from parsing
                                _words_curridx += 1
                            __parsepdtb.Words[_words_curridx][1].update(
                                {'PartOfSpeech_' + postagset: _tagset[idx][1]})
                            continue
                        # raise
                sentence_offset += 1  # increase sentence offset before moving to handle next sentence

                try:
                    new_parsedict[DocID].append(__parsepdtb)
                except KeyError:
                    new_parsedict[DocID] = [__parsepdtb]

        # shut down the CoreNLP servers
        corenlpclient_UD1.stop()
Code example #11
File: segment_job.py  Project: pltrdy/autoalign
class JobSegmenter(object):
    """
        default_heading_idx(int): level associated with default header
                                  headers are usually h1 -> h6 therefore any
                                  integer > 6 may be ok.
        verbose(bool): well...
        use_tags(bool): whether to tag data (with paragraphs, title, names)
        interventions(bool): switch to intervention segmentation
                             i.e. map paragraphs to intervention in ctm
    """
    def __init__(self,
                 default_heading_idx=DEFAULT_HEADING_IDX,
                 verbose=False,
                 use_tags=False,
                 interventions=False,
                 sentence_slices=False,
                 tfidf=False,
                 sentence_min_length=1):
        #         corenlp_port=9000):
        self.corenlp_client = None
        self.ses = None
        self.default_heading_idx = default_heading_idx
        self.n_headings = 6
        self.paragraph_min_word_length = 2
        self.verbose = verbose
        self.use_tags = use_tags
        self.interventions = interventions
        self.sentence_slices = sentence_slices
        self.tfidf = tfidf
        self.sentence_min_length = sentence_min_length
        # self.corenlp_port = corenlp_port

        # if self.interventions:
        #     raise ValueError("Intervention mode not implemented")

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        if self.corenlp_client:
            self.corenlp_client.stop()

    def make_sentence_similarity(self,
                                 word2vec_path=DEFAULT_WORD2VEC_PATH,
                                 n_vectors=DEFAULT_N_VECTOR):
        if self.tfidf:
            from exp_tfidf_lsa_kmeans.tfidf import scoring
            self.tfidf_scoring = scoring
        else:
            from word2vec_fr.sentence_similarity import SentenceEmbeddingSimilarity
            self.ses = SentenceEmbeddingSimilarity(word2vec_path, n_vectors)

    def make_corenlp_client(self,
                            annotators=["tokenize", "ssplit"],
                            endpoint="http://localhost:9000",
                            properties_name="french",
                            properties_dict=None,
                            quiet=True):
        LEGACY_PROPERTIES = {}
        FRENCH_PROPERTIES = {
            "tokenize.language": "French",
            "tokenize.options": "ptb3Dashes=true"
        }
        PROPERTIES = {"legacy": LEGACY_PROPERTIES, "french": FRENCH_PROPERTIES}
        if properties_dict is not None:
            properties = properties_dict
        else:
            if properties_name in PROPERTIES.keys():
                properties = PROPERTIES[properties_name]
            else:
                raise ValueError("Unknow properties '%s'" % properties_name)

        devnull = open(os.devnull)
        stdout = devnull if quiet else sys.stdout
        stderr = devnull if quiet else sys.stderr
        self.corenlp_client = \
            CoreNLPClient(annotators=annotators,
                          endpoint=endpoint,
                          stdout=stdout,
                          stderr=stderr,
                          memory="8G",
                          heapsize="8G",
                          threads=8,
                          timeout=15000,
                          properties=properties
                          )

    def heading_idx(self, style_name):
        if style_name is None:
            return self.default_heading_idx

        if style_name.lower().startswith(
                "heading") or style_name.lower().startswith("titre"):
            try:
                idx = int(style_name[-1]) - 1
                return idx
            except ValueError as e:
                return self.default_heading_idx
        else:
            return self.default_heading_idx

    def is_toc(self, style_name):
        if style_name is None:
            return False
        style_name = style_name.lower()
        return (style_name.startswith("contents")
                or style_name.startswith("toc")
                or style_name.startswith("en-t")
                or style_name.startswith("tm"))

    def is_name(self, style_name):
        if style_name is None:
            return False
        style_name = style_name.lower()
        return (style_name.startswith('nom')
                or style_name.startswith('intervenant'))

    def section(self, level=None, content=[], parent=None):
        if level is None:
            level = self.default_heading_idx
        kwargs = locals()
        self = kwargs.pop("self")
        parent = kwargs.pop('parent')
        d = dict(kwargs)
        d["childs"] = []
        if parent is not None:
            d["parent"] = parent
            parent["childs"].append(d)
        return d

    def annotate(self, text):
        """
        Args:
            text(string)
        Returns:
            annotation object
        """
        if self.corenlp_client is None:
            raise ValueError("'self.corenlp_client' is None. "
                             "Use 'make_corenlp_client' before calling "
                             "'annotate'")
        while True:
            try:
                r = self.corenlp_client.annotate(text)
                break
            except (requests.exceptions.ConnectionError,
                    corenlp_client.PermanentlyFailedException,
                    urllib3.exceptions.MaxRetryError):
                print("too many requests, sleeping")
                time.sleep(0.75)
        return r

    def get_sentences(self,
                      words,
                      lower=True,
                      no_minimum=False,
                      with_scores=False,
                      debug=False):
        # return [_.split() for _ in text.split(".")]
        return self.corenlp_get_sentences(words,
                                          lower=lower,
                                          no_minimum=no_minimum,
                                          with_scores=with_scores,
                                          debug=debug)

    def corenlp_get_sentences(self,
                              words,
                              lower=True,
                              no_minimum=False,
                              with_scores=False,
                              debug=False):
        """
        Args:
            words: list[word] if not with_scores,
                   list[[word, score]] otherwise
        Returns:
            sentences: list of sentences (each a list of word strings),
                i.e. list[list[str]],
                where each word is [word, score] if with_scores
        """
        _debug = debug

        def debug(*args, **kwargs):
            if _debug:
                print(*args, **kwargs)

        def maybe_lower(t):
            return t.lower() if lower else t

        if not with_scores:
            # note replacing spe quote only needed for french for aujourd'hui
            ann = self.annotate(" ".join(words).replace("’", "'"))

            sentences = [[maybe_lower(token.word) for token in sentence.token]
                         for sentence in ann.sentence]
            sentences = [
                s for s in sentences
                if no_minimum or len(s) >= self.sentence_min_length
            ]
            return sentences
        else:
            words, scores = zip(*[(w, s) for w, s in words])
            debug(words)
            debug(scores)
            text = " ".join(words)
            ann = self.annotate(text.replace("’", "'"))

            count = 0
            sentences = []
            prev_word = ""
            prev_score = ""
            word_done = True
            wip = ""
            for sentence in ann.sentence:
                sentences.append([])
                for token in sentence.token:
                    debug("'%s' ~= '%s'" % (words[count], token.word))
                    wip += token.word
                    score = scores[count]
                    if not wip == words[count]:
                        debug("Incomplete word, wip='%s'" % wip)
                    else:
                        wip = ""
                        count += 1
                    # if not token.word == words[count]:
                    #     if word_done and words[count].startswith(token.word):
                    #         word_done = False
                    #         debug("Incomplete word: begining")
                    #     elif words[count].endswith(token.word):
                    #         debug("Incomplete word: end")
                    #         word_done = True
                    #         count += 1
                    #     elif token.word in words[count]:
                    #         debug("Incomplete word: middle")
                    #         pass
                    #     else:
                    #         raise ValueError("mismatch '%s' and '%s'" % (words[count], token.word))
                    # else:
                    #     count += 1
                    sentences[-1].append([maybe_lower(token.word), score])

            def to_old_style(s):
                return " ".join([w for w, _ in s]).replace("' ", "'").split()

            sentences = [
                s for s in sentences if no_minimum
                or len(to_old_style(s)) >= self.sentence_min_length
            ]

            assert count == len(scores) == len(words)
            return sentences

    def flatten_document(self, document, implicit_nom=False, exclude_toc=True):
        """

        Args:
            document(docx.Document)

        Returns:
            sections(list[section]) with:
                section: list[sentence]
                sentence: list[word(str)]
                finally sections is list[list[list[word(str)]]]

                NOTE: now word is actually [word, score]

        """
        implicit_nom_file = open('implicit_nom.lst_', 'a')
        unique_noms = set()

        cur_section = []
        sections = [cur_section]
        cur_lvl = -1
        style_error_p = []
        cur_txt_len = 0

        last_tag = TAGLESS

        # for p in document.paragraphs:
        for elmt in docx_iter(document):
            tag = TAGLESS

            if is_p_elmt(elmt):
                self.log("p elmt, style=%s" % elmt.style)
                try:
                    style = elmt.style
                    if exclude_toc and self.is_toc(style):
                        continue

                    lvl = self.heading_idx(style)

                    if lvl == self.default_heading_idx:
                        tag = TAG_P
                    else:
                        tag = TAG_H

                    if self.is_name(style):
                        tag = TAG_NAME

                except AttributeError as e:
                    style_error_p += [elmt]
                    lvl = self.default_heading_idx
                    tag = TAG_P
                    raise e
            else:
                if is_tbl_elmt(elmt):
                    tag = TAG_TABLE
                elif is_row_elmt(elmt):
                    tag = TAG_ROW

                lvl = self.default_heading_idx

            self.log("lvl: %d" % lvl)

            # sentences = list[list[str]]
            # words = list[str]
            words = elmt2txt(elmt).split()
            self.log("'%s'\n" % [_.lower() for _ in words[:150]])

            no_minimum = True
            sentences = self.get_sentences(words,
                                           no_minimum=no_minimum,
                                           with_scores=False)
            if implicit_nom:
                assert self.interventions

                def _add_implicit_nom(nom):
                    __nom = " ".join(nom)
                    if not __nom in unique_noms:
                        print(__nom, file=implicit_nom_file)
                        unique_noms.add(__nom)

                if len(sentences) > 0:
                    s = " ".join(sentences[0])

                    if "--" in sentences[0]:
                        # <nom> -- intervention
                        count = sentences[0].count("--")
                        pos = sentences[0].index("--")

                        if count == 1 and pos < 7 and len(sentences) > 1:
                            nom = s.split("--")[0].split()
                            intervention = " ".join(s.split("--")[1:]).split()
                            _nom = ["<nom>"] + nom + ["</nom>"]
                            _intervention = ["<%s>" % tag] + intervention
                            sentences[-1].append("</%s>" % tag)
                            _sentences = [_nom, _intervention] + sentences[1:]
                            cur_section = _sentences
                            sections.append(cur_section)

                            _add_implicit_nom(nom)
                            continue
                    elif len(sentences[0]) < 5 and any([
                            s.lower().startswith(_) for _ in
                        ["monsieur", "madame", "m.", "mme.", "mr."]
                    ]):
                        # <monsieur|madame|..> <nom> \n text
                        # print("monsieur|madame detected")
                        nom = sentences[0]
                        _nom = ["<nom>"] + nom + ["</nom>"]
                        _sentences = [_nom]
                        if len(sentences) > 1:
                            sentences[1] = ["<%s>" % tag] + sentences[1]
                            sentences[-1] = sentences[-1] + ["</%s>" % tag]
                            _sentences += sentences[1:]

                        cur_section = _sentences
                        sections.append(cur_section)

                        _add_implicit_nom(nom)
                        continue

            words = flatten_list(sentences)
            cur_txt_len += len(words)

            if self.use_tags:
                if not len(sentences) > 0:
                    sentences = [[]]
                    # continue

                sentences[0] = ["<%s>" % tag] + sentences[0]
                sentences[-1] = sentences[-1] + ["</%s>" % tag]
            if "".join(words) == "e-customer":
                words = ["e", "-", "customer"]

            if len(words) == 0:
                continue
            if len(words) < self.paragraph_min_word_length:
                if not tag in [TAG_NAME, TAG_H]:
                    continue
            if self.interventions:
                if tag in [TAG_NAME, TAG_H] and last_tag != TAG_H:
                    cur_section = sentences
                    sections += [cur_section]
                else:
                    cur_section += sentences
            else:
                # sections mode:
                if lvl <= cur_lvl:
                    # new section
                    if cur_txt_len > 0:
                        cur_section = sentences
                        sections += [cur_section]
                        cur_txt_len = 0
                    cur_lvl = lvl
                else:
                    # appending content to current section
                    cur_section += sentences

                    if lvl < self.n_headings:
                        # only updates the level in case of header
                        cur_lvl = lvl
            last_tag = tag
        return sections

    def log(self, *args, **kwargs):
        if self.verbose:
            print(*args, **kwargs)

    def process_docx(self, docx_path, implicit_nom=False, verbose=False):
        """
        Args:
            docx_path(str)

        Returns:
            sentences(list[str])
            slices(list[slice])
        """
        document = docx.Document(docx_path)
        # structure = self.get_docx_structure(document)
        # flat_structure = self.flatten_section(structure)
        sections = self.flatten_document(document, implicit_nom=implicit_nom)

        sentences = flatten_list(sections)
        if self.sentence_slices and not self.interventions:
            slices = [slice(i, i + 1) for i in range(len(sentences))]
        else:
            slices = []
            lower = 0
            for section in sections:
                if len(section) == 0:
                    continue
                upper = lower + len(section)
                slices += [slice(lower, upper)]
                lower = upper

        return sentences, slices

    def process_ctm(self, ctm_paths, get_scores=False, debug=False):
        """
        Args:
            ctm_paths(list[string])

        Returns:
            ctm_slices(list[slice])
            ctm_sentences(list[string])
        """
        paroles = []
        cur_sentence = ""
        ctm_sentences = []
        # 1. CTM ->> List of paroles
        for ctm_path in ctm_paths:
            with open(ctm_path, 'rb') as f_ctm:
                for line in f_ctm:
                    try:
                        word, score = line.decode('utf-8').split("\t")[4:6]
                        score = float(score)
                    except UnicodeDecodeError as e:
                        print("UnicodeDecodeError on file '%s'" % ctm_path)
                        raise e

                    if word.startswith("<start="):
                        if len(paroles) == 0 or len(paroles[-1]) > 0:
                            paroles.append([])
                        continue
                    paroles[-1].append([word, score])

        # 2. List of paroles ->> List of lists of sentences (and ranges)
        if self.sentence_slices:
            assert not get_scores, "Not implemented"
            ctm_sentences = []
            for parole in paroles:
                sentences = [
                    _ for _ in self.get_sentences(parole, with_scores=True)
                    if len(_) > 0
                ]
                ctm_sentences += sentences

            parole_slices = [
                slice(i, i + 1) for i in range(len(ctm_sentences))
            ]
        else:
            ctm_sentences = []
            parole_slices = []
            lower = 0
            for i_parole in range(len(paroles)):
                parole = paroles[i_parole]
                self.log("***")
                self.log(parole)
                ## ann = client.annotate(parole)
                # sentences = [[token.word.lower()
                # for token in sentence.token]
                # for sentence in ann.sentence]
                sentences = self.get_sentences(parole,
                                               with_scores=True,
                                               debug=debug)

                if len(sentences) == 0:
                    continue
                self.log(sentences)
                ctm_sentences += sentences
                upper = lower + len(sentences)
                parole_slices.append(slice(lower, upper))
                lower = upper

        return ctm_sentences, parole_slices
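
# Usage sketch for the document/CTM preprocessor above. The class name
# `DocumentPreprocessor`, its constructor arguments, and the file paths are
# placeholders, not taken from the original source:
#
#   preprocessor = DocumentPreprocessor(use_tags=True, interventions=False)
#   doc_sentences, doc_slices = preprocessor.process_docx('report.docx')
#   ctm_sentences, ctm_slices = preprocessor.process_ctm(['meeting.ctm'])
#   # each slice maps one section (or speech turn) onto its sentence range
#   for s in doc_slices:
#       print(doc_sentences[s])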
Code Example #12
0
import logging
import os
import pickle
from pathlib import Path

import neuralcoref
import spacy

from graph import Graph

logging.getLogger("transformers.tokenization_utils").setLevel(logging.WARNING)

install_dir = Path('~/stanfordnlp_resources/').expanduser()
text = 'Barack Obama was born in Hawaii. He  wrote this sentence.'
os.environ['CORENLP_HOME'] = str(install_dir /
                                 'stanford-corenlp-full-2018-10-05')
properties = {}
from stanfordnlp.server import CoreNLPClient

client = CoreNLPClient(annotators=['openie'],
                       memory='6G',
                       properties=properties)
client.annotate(text="time pass", annotators=['openie'], output_format='json')
nlp = spacy.load('en')

coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')
nlplem = spacy.load('en')

SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<speaker1>', '<speaker2>']
}
MODEL_INPUTS = [
    "input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"
Code Example #13
0
# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
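# (This snippet assumes a CoreNLPClient named `client` was created earlier in
#  the notebook, with an annotator set that includes 'ner', since entity
#  mentions are read further down.)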
client.start()
import time; time.sleep(10)

!ps -o pid,cmd | grep java

from google.colab import drive
drive.mount('/content/gdrive')

with open('/content/gdrive/My Drive/Colab Notebooks/chapter1.txt', 'r') as file:
  data = file.read().replace('\n', '')

#data = "Such were some of various omens. Emperor Ling, greatly moved by these signs of the displeasure of Heaven, issued an edict asking his ministers for an explanation of the calamities and marvels."

document = client.annotate(data)
print(type(document))

# Iterate over all detected entity mentions
print("{:30s}\t{}".format("Mention", "Type"))
listofchar = []
for sent in document.sentence:
    for m in sent.mentions:
        #print(type(m.entityType))
        #print("{:30s}\t{}".format(m.entityMentionText, m.entityType))
        if m.entityType[0] is "P":
          if m.entityMentionText not in listofchar:
            listofchar.append(m.entityMentionText)
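
# Follow-up sketch: count how often each PERSON mention occurs, reusing the
# `document` annotation and the entity-mention API from the loop above.
from collections import Counter

mention_counts = Counter(
    m.entityMentionText
    for sent in document.sentence
    for m in sent.mentions
    if m.entityType[0] == "P")
print("Characters found:", listofchar)
for name, count in mention_counts.most_common(10):
    print("{:30s}\t{}".format(name, count))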

#!pip install pycorenlp
#from pycorenlp import StanfordCoreNLP
Code Example #14
0
import os

import nltk
from nltk.tag import StanfordNERTagger
from stanfordnlp.server import CoreNLPClient

# Alternative: a single 3-class NER model
# properties = {'ner.model': './stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'}

# In[ ]:

os.environ['CORENLP_HOME'] = './stanford-corenlp-full-2018-10-05'
properties = {
    'ner.model':
    './stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz,'
    './stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz,'
    './stanford-ner-2018-10-16/classifiers/english.conll.4class.distsim.crf.ser.gz'
}
client = CoreNLPClient(annotators=['tokenize', 'pos', 'lemma', 'ner'],
                       memory='8g',
                       endpoint='http://localhost:9001')
doc = client.annotate(text)  # `text` is assumed to be defined earlier in the script
for sent in doc.sentence:
    for m in sent.mentions:
        print(m.entityMentionText, '\t\t\t', m.entityType)
client.stop()  ## do not forget to stop the client

# In[ ]:

## nltk
nltk.download()  # d-punkt-q
st = StanfordNERTagger(
    'stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    'stanford-ner-2018-10-16/stanford-ner.jar',
    encoding='utf-8')
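
# Sketch: tagging with the NLTK wrapper above; `text` is assumed to be defined
# earlier, and StanfordNERTagger.tag expects a list of tokens.
tokens = nltk.word_tokenize(text)
for word, ner_tag in st.tag(tokens):
    if ner_tag != 'O':  # 'O' marks tokens outside any named entity
        print(word, '\t', ner_tag)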
Code Example #15
0
class Tokenizer:
    def __init__(self) -> None:
        os.environ[
            'CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(
                os.environ['HOME'])
        self.client = CoreNLPClient()
        self.client.ensure_alive()
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer \
            = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer

    def __del__(self) -> None:
        for p in glob.glob('corenlp_server-*.props'):
            if os.path.isfile(p):
                os.remove(p)

    def tokenize(self, doc: str) -> List[Sentence]:
        splitter_annotation \
            = self.client.annotate(doc, annotators=['ssplit'],
                                   properties={'tokenize.options': 'ptb3Escaping=false,invertible=true'})
        end = 0
        sentences = []
        for sentence in splitter_annotation.sentence:
            begin = doc.index(sentence.token[0].originalText, end)
            for token in sentence.token:
                end = doc.index(token.originalText, end) + len(
                    token.originalText)
            text = doc[begin:end]
            sentences.append(Sentence(text, begin, end))
        sentences = self.fix_split(sentences)
        for sentence in sentences:
            text = sentence.text
            if self.do_lower_case:
                text = text.lower()
            bert_tokens = self.basic_tokenizer.tokenize(text)
            end = 0
            tokens = []
            for bert_token in bert_tokens:
                word = bert_token
                begin = text.index(word, end)
                end = begin + len(word)
                tokens.append(
                    Token(word, sentence.begin + begin, sentence.begin + end))
            assert len(tokens) > 0
            sentence.tokens = tokens
        return sentences

    @staticmethod
    def fix_split(sentences: List[Sentence]) -> List[Sentence]:
        result = []
        i = 0
        while i < len(sentences):
            sentence = sentences[i]
            while True:
                next_sentence = sentences[
                    i + 1] if i < len(sentences) - 1 else None
                if '\n\n' in sentence.text:
                    index = sentence.text.index('\n\n')
                    new_sentence = Sentence(sentence.text[:index],
                                            sentence.begin,
                                            sentence.begin + index)
                    result.append(new_sentence)
                    index += re.search(r'[\n\t ]+',
                                       sentence.text[index:]).end()
                    sentence.text = sentence.text[index:]
                    sentence.begin += index
                elif next_sentence is not None and next_sentence.begin == sentence.end:
                    sentence.text += next_sentence.text
                    sentence.end = next_sentence.end
                    i += 1
                else:
                    result.append(sentence)
                    break
            i += 1
        return result
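
# Usage sketch (assumes `config.bert_model` is set, e.g. to 'bert-base-cased',
# and that a CoreNLP install is available under $HOME as in __init__):
#
#   tokenizer = Tokenizer()
#   doc = 'Barack Obama was born in Hawaii.\n\nHe was elected president in 2008.'
#   for sentence in tokenizer.tokenize(doc):
#       print(sentence.begin, sentence.end, repr(sentence.text))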
Code Example #16
0
class pos_tagger():
    """
    Class to implement part-of-speech (POS) tagging
    """
    def __init__(self, tagger='spacy', language='french'):
        self.tagger = tagger
        self.tagmodule = None
        self.tagset = UTagSet  # TAG Set by default
        self.language = language
        spacy_module = {
            'french': 'fr_core_news_sm',
            'english': 'en_core_web_sm'
        }

        if tagger == 'spacy':
            self.tagger = self.spacy_pos_tag
            self.tagset = UDTagSet
            try:
                self.tagmodule = spacy.load(spacy_module[language])
            except Exception:
                logger.warning(
                    'spaCy model for language [{:s}] not installed - falling back to French'
                    .format(language))
                self.tagmodule = spacy.load(spacy_module['french'])
        elif tagger == 'stanford':
            self.tagger = self.stanford_pos_tag
            self.tagset = FTTagSet
            JAVAHOME = "C:/Program Files (x86)/Java/jre1.8.0_241/bin/java.exe"
            # Set a JAVAHOME environment variable if not present
            if 'JAVAHOME' not in os.environ:
                os.environ['JAVAHOME'] = JAVAHOME
            root_path = "./stanford-postagger/"  # location of Stanford POS Tagger components

            # Launch the Stanford Pos Tagger (implemented in Java)
            self.tagmodule = StanfordPOSTagger(
                root_path + "models/" + language + ".tagger",
                root_path + "stanford-postagger.jar",
                encoding='utf8')
        elif tagger == 'core_nlp':
            self.tagger = self.corenlp_pos_tag
            os.environ['CORENLP_HOME'] = './stanford-corenlp-full-2018-10-05'
            try:
                self.tagmodule = CoreNLPClient(properties=language,
                                               annotators=[
                                                   'pos',
                                               ],
                                               timeout=30000,
                                               memory='1G')
            except Exception:
                logger.warning(
                    'Could not launch Stanford Core NLP for [{:s}]'.format(
                        language))
        elif tagger == 'nltk':
            self.tagger = self.nltk_pos_tag
            self.tagset = NLTKTagSet
            if language != 'english':
                logger.warning(
                    'nltk does not support [{:s}] language'.format(language))
        else:
            logger.warning('POS tagger [{:s}] unknown'.format(tagger))

    def pos_tag(self, sentence):
        assert (self.tagger)
        return self.tagger(sentence)

    def spacy_pos_tag(self, sentence):
        assert (self.tagmodule)
        return [(token.text, token.pos_) for token in self.tagmodule(sentence)]

    def stanford_pos_tag(self, sentence):
        assert (self.tagmodule)
        return self.tagmodule.tag(nltk.word_tokenize(sentence))

    def corenlp_pos_tag(self, sentence):
        # Untested: this CoreNLP path does not seem to work reliably
        assert (self.tagmodule)
        ann = self.tagmodule.annotate(sentence)
        return [(token.word, token.pos) for token in ann.sentence[0].token]

    def nltk_pos_tag(self, sentence):
        return nltk.pos_tag(nltk.word_tokenize(sentence))

    def tag_label(self, tag):
        return self.tagset.get(tag, '??')
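
# Usage sketch: spaCy backend with the Universal Dependencies tag set
# (assumes the 'en_core_web_sm' model is installed).
tagger = pos_tagger(tagger='spacy', language='english')
for word, tag in tagger.pos_tag('The quick brown fox jumps over the lazy dog.'):
    print(word, tag, tagger.tag_label(tag))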