Code example #1
File: wiki_online.py Project: neuroph12/intel_nlp
 def __init__(self):
     import pywikibot
     self.spacy = SpacyInstance()
     self.pywikibot = pywikibot
     self.cache = dict()
     self.site = pywikibot.Site(
         'en', 'wikipedia')  # The site we want to run our bot on
Code example #2
    def __init__(self, params_dict, embeddings):
        """
        Args:
            params_dict: Dictionary containing the following keys-
                         'max_question' : max length of all questions in the dataset
                         'max_para' :  max length of all paragraphs in the dataset
                         'hidden_size': number of hidden units in the network
                         'batch_size' : batch size defined by user
                         'inference_only' : if True, the label placeholder is not created

            embeddings: Glove pretrained embedding matrix
        """

        # Assign Variables:
        self.max_question = params_dict['max_question']
        self.max_para = params_dict['max_para']
        self.hidden_size = params_dict['hidden_size']
        self.batch_size = params_dict['batch_size']
        self.embeddings = embeddings
        self.inference_only = params_dict['inference_only']
        self.G_i = None
        self.attn = None
        self.stacked_lists_forward = None
        self.stacked_lists_reverse = None
        self.logits_withsf = None

        # init tokenizer
        self.tokenizer = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

        # Create Placeholders
        # Question ids
        self.question_ids = tf.placeholder(tf.int32, shape=[None, self.max_question],
                                           name="question_ids")
        # Paragraph ids
        self.para_ids = tf.placeholder(tf.int32, shape=[None, self.max_para],
                                       name="para_ids")
        # Length of question
        self.question_length = tf.placeholder(tf.int32, shape=[None],
                                              name="question_len")
        # Length of paragraph
        self.para_length = tf.placeholder(tf.int32, shape=[None],
                                          name="para_len")
        # Mask for paragraph
        self.para_mask = tf.placeholder(tf.float32, shape=[None, self.max_para],
                                        name="para_mask")
        # Mask for question
        self.ques_mask = tf.placeholder(tf.float32, shape=[None, self.max_question],
                                        name="ques_mask")
        # Answer spans
        if self.inference_only is False:
            self.labels = tf.placeholder(tf.int32, shape=[None, 2], name="labels")
        # Dropout value
        self.dropout = tf.placeholder(tf.float32, shape=[], name="dropout")
        self.global_step = tf.Variable(0, name='global')

        # Get variables
        self.create_variables()

        # Define model
        self.create_model()
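A minimal sketch of the inputs this constructor expects, assuming a model class that is not named in the snippet (the constructor call is therefore left commented out); note that __init__ also reads an 'inference_only' key in addition to the documented ones:

import numpy as np

params_dict = {
    "max_question": 30,       # longest question in the dataset, in tokens
    "max_para": 300,          # longest paragraph in the dataset, in tokens
    "hidden_size": 150,       # hidden units in the recurrent layers
    "batch_size": 64,
    "inference_only": False,  # also read by __init__; False builds the label placeholder
}
# Stand-in for the pretrained GloVe matrix the docstring mentions (vocab_size x dim).
embeddings = np.random.rand(40000, 300).astype(np.float32)
# model = QAMatchLSTMModel(params_dict, embeddings)  # hypothetical class name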
Code example #3
File: ner_api.py Project: xiaming9880/nlp-architect
 def __init__(self, prompt=True):
     self.model = None
     self.model_info = None
     self.word_vocab = None
     self.y_vocab = None
     self.char_vocab = None
     self._download_pretrained_model(prompt)
     self.nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
Code example #4
 def __init__(self, prompt=False):
     self.model = None
     self.model_info = None
     self.word_vocab = None
     self.y_vocab = None
     self.char_vocab = None
     self._download_pretrained_model(prompt)
     self.nlp = SpacyInstance(disable=["tagger", "ner", "parser", "vectors", "textcat"])
Code example #5
def process_inference_input(input_file):
    with io.open(input_file) as fp:
        texts = [l.strip() for l in fp.readlines()]
    tokenizer = SpacyInstance(disable=["tagger", "parser", "ner"])
    examples = []
    for i, t in enumerate(texts):
        examples.append(TokenClsInputExample(str(i), t, tokenizer.tokenize(t)))
    return examples
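A minimal usage sketch, assuming the imports used by process_inference_input (io, SpacyInstance, TokenClsInputExample) are available in scope; one input example is produced per line of the file:

with open("sentences.txt", "w", encoding="utf-8") as out:
    out.write("The chunker runs on top of spaCy.\n")
    out.write("Intel released nlp-architect as open source.\n")

examples = process_inference_input("sentences.txt")
print(len(examples))  # 2 -- one TokenClsInputExample per line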
Code example #6
 def __init__(self, prompt=True):
     self.model = None
     self.model_type = None
     self.word_vocab = None
     self.tags_vocab = None
     self.char_vocab = None
     self.intent_vocab = None
     self._download_pretrained_model(prompt)
     self.nlp = SpacyInstance(
         disable=["tagger", "ner", "parser", "vectors", "textcat"])
Code example #7
File: prepare_data.py Project: yuansky/nlp-architect
def load_parser(chunker):
    # load spacy parser
    logger.info("loading spacy. chunker=%s", chunker)
    if "nlp_arch" in chunker:
        parser = SpacyInstance(model="en_core_web_sm",
                               disable=["textcat", "ner", "parser"]).parser
        parser.add_pipe(parser.create_pipe("sentencizer"), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        if not path.exists(chunker_path):
            makedirs(chunker_path)
        if not path.exists(_path_to_model):
            logger.info(
                "The pre-trained model to be downloaded for NLP Architect"
                " word chunker model is licensed under Apache 2.0")
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                        last=True)
    else:
        parser = SpacyInstance(model="en_core_web_sm",
                               disable=["textcat", "ner"]).parser
    logger.info("spacy loaded")
    return parser
Code example #8
File: prepare_data.py Project: wangjs/nlp-architect
def load_parser(chunker):
    # load spacy parser
    logger.info('loading spacy. chunker=%s', chunker)
    if 'nlp_arch' in chunker:
        parser = SpacyInstance(model='en_core_web_sm',
                               disable=['textcat', 'ner', 'parser']).parser
        parser.add_pipe(parser.create_pipe('sentencizer'), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        if not path.exists(chunker_path):
            makedirs(chunker_path)
        if not path.exists(_path_to_model):
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect'
                ' word chunker model is licensed under Apache 2.0')
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                        last=True)
    else:
        parser = SpacyInstance(model='en_core_web_sm',
                               disable=['textcat', 'ner']).parser
    logger.info('spacy loaded')
    return parser
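The two load_parser variants above differ only in quoting style. A hedged sketch of calling either one, reusing get_noun_phrases as shown in code example #13 below (noun-phrase extraction only applies to the 'nlp_arch' branch, which adds the NPAnnotator pipe):

parser = load_parser("nlp_arch")   # spaCy pipeline with the NLP Architect NPAnnotator chunker
# parser = load_parser("spacy")    # plain spaCy pipeline instead
doc = parser("Fast cars are parked on the crowded street.")
noun_phrases = [p.text for p in get_noun_phrases(doc)]   # valid on the NPAnnotator branch
print(noun_phrases)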
Code example #9
File: wiki_online.py Project: yyzreal/nlp-architect
 def __init__(self):
     try:
         import pywikibot
     except (AttributeError, ImportError):
         logger.error(
             "pywikibot is not installed, please install nlp_architect with [all] package. "
             + "for example: pip install nlp_architect[all]")
         sys.exit()
     self.spacy = SpacyInstance()
     self.pywikibot = pywikibot
     self.cache = dict()
     self.site = pywikibot.Site(
         "en", "wikipedia")  # The site we want to run our bot on
Code example #10
 def _parse_json(self, data):
     tok = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
     sentences = []
     for s in data:
         tokens = []
         tags = []
         for t in s:
             new_tokens = tok.tokenize(t['text'].strip())
             tokens += new_tokens
             ent = t.get('entity', None)
             if ent is not None:
                 tags += self._create_tags(ent, len(new_tokens))
             else:
                 tags += ['O'] * len(new_tokens)
         sentences.append((tokens, tags))
     return sentences
Code example #11
 def _parse_json(self, data):
     tok = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
     sentences = []
     for s in data:
         tokens = []
         tags = []
         for t in s:
             new_tokens = tok.tokenize(t['text'].strip())
             tokens += new_tokens
             ent = t.get('entity', None)
             if ent is not None:
                 tags += self._create_tags(ent, len(new_tokens))
             else:
                 tags += ['O'] * len(new_tokens)
         sentences.append((tokens, tags))
     return sentences
Code example #12
 def _parse_json(self, data):
     tok = SpacyInstance(
         disable=["tagger", "ner", "parser", "vectors", "textcat"])
     sentences = []
     for s in data:
         tokens = []
         tags = []
         for t in s:
             new_tokens = tok.tokenize(t["text"].strip())
             tokens += new_tokens
             ent = t.get("entity", None)
             if ent is not None:
                 tags += self._create_tags(ent, len(new_tokens))
             else:
                 tags += ["O"] * len(new_tokens)
         sentences.append((tokens, tags))
     return sentences
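The three _parse_json variants above expect the same input structure. A sketch of that structure, inferred from the field accesses in the code; the exact tag scheme produced by _create_tags is not shown, so the comment below only assumes it is BIO-like:

data = [
    [                                             # one sentence ...
        {"text": "book me a flight to"},          # ... made of text chunks,
        {"text": "new york", "entity": "city"},   # optionally labeled with an entity
    ],
]
# sentences = self._parse_json(data)
# -> [(tokens, tags)] where untagged chunks get 'O' and entity chunks get the
#    tags produced by self._create_tags('city', n_tokens) (assumed BIO-like)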
Code example #13
def test_np_annotator_linked(model_path, settings_path, text, phrases):
    annotator = SpacyInstance(model="en", disable=["textcat", "ner", "parser"]).parser
    annotator.add_pipe(annotator.create_pipe("sentencizer"), first=True)
    annotator.add_pipe(NPAnnotator.load(model_path, settings_path), last=True)
    doc = annotator(text)
    noun_phrases = [p.text for p in get_noun_phrases(doc)]
    for p in phrases:
        assert p in noun_phrases
Code example #14
File: np_scorer.py Project: neuroph12/intel_nlp
    def __init__(self, parser=None):
        if parser is None:
            self.nlp = SpacyInstance(
                disable=['ner', 'parser', 'vectors', 'textcat']).parser
        else:
            self.nlp = parser

        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'), first=True)
        _path_to_model = path.join(chunker_local_path, chunker_model_file)
        if not path.exists(chunker_local_path):
            makedirs(chunker_local_path)
        if not path.exists(_path_to_model):
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect word'
                ' chunker model is licensed under Apache 2.0')
            download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
        _path_to_params = path.join(chunker_local_path, chunker_model_dat_file)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
        self.nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
Code example #15
File: inference.py Project: yuansky/nlp-architect
    def __init__(
        self,
        aspect_lex: Union[str, PathLike],
        opinion_lex: Union[str, PathLike, dict],
        parse: bool = True,
        parser="spacy",
        spacy_model="en_core_web_sm",
    ):
        """Inits SentimentInference with given aspect and opinion lexicons."""
        INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
        self.opinion_lex = (
            opinion_lex if type(opinion_lex) is dict else load_opinion_lex(Path(opinion_lex))
        )
        self.aspect_lex = _load_aspect_lexicon(Path(aspect_lex))
        self.intensifier_lex = _read_lexicon_from_csv("IntensifiersLex.csv")
        self.negation_lex = _read_lexicon_from_csv("NegationSentLex.csv")
        self.parser_name = parser

        if parse:
            if parser == "bist":
                from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

                self.parser = SpacyBISTParser(spacy_model=spacy_model)
            elif parser == "spacy":
                from nlp_architect.utils.text import SpacyInstance

                disable = [
                    "merge_noun_chunks",
                    "ner",
                    "entity_linker",
                    "textcat",
                    "entity_ruler",
                    "sentencizer",
                    "merge_entities",
                ]
                self.parser = SpacyInstance(
                    model=spacy_model, disable=disable, ptb_pos=True, n_jobs=1
                )
        else:
            self.parser = None
Code example #16
    def __init__(self, verbose=False, spacy_model="en", bist_model=None):
        validate(
            (verbose, bool), (spacy_model, str, 0, 1000), (bist_model, (type(None), str), 0, 1000)
        )
        if not bist_model:
            print("Using pre-trained BIST model.")
            _download_pretrained_model()
            bist_model = SpacyBISTParser._pretrained

        self.verbose = verbose
        self.bist_parser = BISTModel()
        self.bist_parser.load(bist_model if bist_model else SpacyBISTParser._pretrained)
        self.spacy_parser = SpacyInstance(spacy_model, disable=["ner", "vectors", "textcat"]).parser
Code example #17
class StringUtils:
    spacy_no_parser = SpacyInstance(disable=['parser'])
    spacy_parser = SpacyInstance()
    stop_words = None
    pronouns = None
    preposition = None

    def __init__(self):
        pass

    @staticmethod
    def is_stop(token: str) -> bool:
        if not StringUtils.stop_words:
            StringUtils.stop_words = load_json_file(STOP_WORDS_FILE)
            StringUtils.stop_words.extend(DISAMBIGUATION_CATEGORY)
        if token not in StringUtils.stop_words:
            return False
        return True

    @staticmethod
    def normalize_str(in_str: str) -> str:
        str_clean = re.sub('[' + string.punctuation + string.whitespace + ']',
                           ' ', in_str).strip().lower()
        if isinstance(str_clean, str):
            str_clean = str(str_clean)

        doc = StringUtils.spacy_no_parser.parser(str_clean)
        ret_clean = []
        for token in doc:
            lemma = token.lemma_.strip()
            if not StringUtils.is_pronoun(lemma) and not StringUtils.is_stop(
                    lemma):
                ret_clean.append(token.lemma_)

        return ' '.join(ret_clean)

    @staticmethod
    def is_pronoun(in_str: str) -> bool:
        if not StringUtils.pronouns:
            StringUtils.pronouns = load_json_file(PRONOUN_FILE)

        tokens = in_str.split()
        if len(tokens) == 1:
            if tokens[0] in StringUtils.pronouns:
                return True
        return False

    @staticmethod
    def is_preposition(in_str: str) -> bool:
        if not StringUtils.preposition:
            StringUtils.preposition = load_json_file(PREPOSITION_FILE)

        tokens = in_str.split()
        if len(tokens) == 1:
            if tokens[0] in StringUtils.preposition:
                return True
        return False

    @staticmethod
    def normalize_string_list(str_list: str) -> List[str]:
        ret_list = []
        for _str in str_list:
            normalize_str = StringUtils.normalize_str(_str)
            if normalize_str != '':
                ret_list.append(normalize_str)
        return ret_list

    @staticmethod
    def find_head_lemma_pos_ner(x: str):
        '''

        :param x: mention
        :return: the head word and the head word lemma of the mention
        '''
        head = None
        lemma = None
        pos = None
        ner = None
        doc = StringUtils.spacy_parser.parser(x)
        for tok in doc:
            if tok.head == tok:
                head = tok.text
                lemma = tok.lemma_
                pos = tok.pos_
        for ent in doc.ents:
            if ent.root.text == head:
                ner = ent.label_

        return head, lemma, pos, ner
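A small sketch exercising the static helpers above, assuming the JSON resources referenced by the class (STOP_WORDS_FILE, PRONOUN_FILE, PREPOSITION_FILE) are available as in the original module:

print(StringUtils.is_pronoun("she"))                            # True
print(StringUtils.is_preposition("between"))                    # True
print(StringUtils.normalize_str("The President of the United-States"))
print(StringUtils.find_head_lemma_pos_ner("the big red car"))   # (head, lemma, pos, ner)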
Code example #18
def initiate_parser():
    return SpacyInstance(
        disable=['tagger', 'ner', 'parser', 'vectors', 'textcat']).parser
Code example #19
File: ner_api.py Project: neuroph12/intel_nlp
class NerApi(AbstractApi):
    """
    NER model API
    """
    model_dir = path.join(LIBRARY_STORAGE_PATH, 'ner-pretrained')
    pretrained_model = path.join(model_dir, 'model.h5')
    pretrained_model_info = path.join(model_dir, 'model_info.dat')

    def __init__(self, prompt=True):
        self.model = None
        self.model_info = None
        self.word_vocab = None
        self.y_vocab = None
        self.char_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

    @staticmethod
    def _prompt():
        response = input(
            '\nTo download \'{}\', please enter YES: '.format('ner'))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == 'y'):
            print('Downloading {}...'.format('ner'))
            responded_yes = True
        else:
            print('Download declined. Response received {} != YES|Y. '.format(
                res))
            responded_yes = False
        return responded_yes

    def _download_pretrained_model(self, prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        model_exists = path.isfile(self.pretrained_model)
        model_info_exists = path.isfile(self.pretrained_model_info)
        if not model_exists or not model_info_exists:
            print(
                'The pre-trained models to be downloaded for the NER dataset '
                'are licensed under Apache 2.0. By downloading, you accept the terms '
                'and conditions provided by the license')
            makedirs(self.model_dir, exist_ok=True)
            if prompt is True:
                agreed = NerApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/',
                'model.h5', self.pretrained_model)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/ner/',
                'model_info.dat', self.pretrained_model_info)
            print('Done.')

    def load_model(self):
        self.model = NERCRF()
        self.model.load(self.pretrained_model)
        with open(self.pretrained_model_info, 'rb') as fp:
            model_info = pickle.load(fp)
        self.word_vocab = model_info['word_vocab']
        self.y_vocab = {v: k for k, v in model_info['y_vocab'].items()}
        self.char_vocab = model_info['char_vocab']

    @staticmethod
    def pretty_print(text, tags):
        spans = []
        for s, e, tag in bio_to_spans(text, tags):
            spans.append({'start': s, 'end': e, 'type': tag})
        ents = dict((obj['type'].lower(), obj) for obj in spans).keys()
        ret = {
            'doc_text': ' '.join(text),
            'annotation_set': list(ents),
            'spans': spans,
            'title': 'None'
        }
        print({"doc": ret, 'type': 'high_level'})
        return {"doc": ret, 'type': 'high_level'}

    def process_text(self, text):
        input_text = ' '.join(text.strip().split())
        return self.nlp.tokenize(input_text)

    def vectorize(self, doc, vocab, char_vocab):
        words = np.asarray([vocab[w.lower()] if w.lower() in vocab else 1 for w in doc]) \
            .reshape(1, -1)
        sentence_chars = []
        for w in doc:
            word_chars = []
            for c in w:
                if c in char_vocab:
                    _cid = char_vocab[c]
                else:
                    _cid = 1
                word_chars.append(_cid)
            sentence_chars.append(word_chars)
        sentence_chars = np.expand_dims(pad_sentences(sentence_chars,
                                                      self.model.word_length),
                                        axis=0)
        return words, sentence_chars

    def inference(self, doc):
        text_arr = self.process_text(doc)
        doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
        seq_len = np.array([len(text_arr)]).reshape(-1, 1)
        inputs = list(doc_vec)
        if self.model.crf_mode == 'pad':
            inputs = list(doc_vec) + [seq_len]
        doc_ner = self.model.predict(inputs, batch_size=1).argmax(2).flatten()
        tags = [self.y_vocab.get(n, None) for n in doc_ner]
        return self.pretty_print(text_arr, tags)
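A hedged end-to-end sketch using only the methods defined above: construct the API (downloading the pretrained model if needed), load it, and run inference on a raw string.

api = NerApi(prompt=False)    # skip the interactive download confirmation
api.load_model()
result = api.inference("John Smith works at Intel in Santa Clara")
# `result` is the dict built by pretty_print: {"doc": {...}, "type": "high_level"}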
Code example #20
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import pickle
from os import makedirs, path, sys

import numpy as np

from nlp_architect.api.abstract_api import AbstractApi
from nlp_architect.models.ner_crf import NERCRF
from nlp_architect.utils.generic import pad_sentences
from nlp_architect.utils.io import download_unlicensed_file
from nlp_architect.utils.text import SpacyInstance

nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])


class NerApi(AbstractApi):
    """
    NER model API
    """
    dir = path.dirname(path.realpath(__file__))
    pretrained_model = path.join(dir, 'ner-pretrained', 'model.h5')
    pretrained_model_info = path.join(dir, 'ner-pretrained', 'model_info.dat')

    def __init__(self, prompt=True):
        self.model = None
        self.model_info = None
        self.model_path = NerApi.pretrained_model
        self.model_info_path = NerApi.pretrained_model_info
Code example #21
# limitations under the License.
# ******************************************************************************

from __future__ import division, print_function, unicode_literals, absolute_import

import argparse
import pickle

import numpy as np

from nlp_architect.models.ner_crf import NERCRF
from nlp_architect.utils.generic import pad_sentences
from nlp_architect.utils.io import validate_existing_filepath
from nlp_architect.utils.text import SpacyInstance

nlp = SpacyInstance(disable=["tagger", "ner", "parser", "vectors", "textcat"])


def read_input_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_path", type=validate_existing_filepath, required=True, help="Path of model weights"
    )
    parser.add_argument(
        "--model_info_path",
        type=validate_existing_filepath,
        required=True,
        help="Path of model topology",
    )
    input_args = parser.parse_args()
    return input_args
Code example #22
File: inference.py Project: yuansky/nlp-architect
class SentimentInference:
    """Main class for sentiment inference execution.

    Attributes:
        opinion_lex: Opinion lexicon as outputted by TrainSentiment module.
        aspect_lex: Aspect lexicon as outputted by TrainSentiment module.
        intensifier_lex (dict): Pre-defined intensifier lexicon.
        negation_lex (dict): Pre-defined negation lexicon.
    """

    def __init__(
        self,
        aspect_lex: Union[str, PathLike],
        opinion_lex: Union[str, PathLike, dict],
        parse: bool = True,
        parser="spacy",
        spacy_model="en_core_web_sm",
    ):
        """Inits SentimentInference with given aspect and opinion lexicons."""
        INFERENCE_OUT.mkdir(parents=True, exist_ok=True)
        self.opinion_lex = (
            opinion_lex if type(opinion_lex) is dict else load_opinion_lex(Path(opinion_lex))
        )
        self.aspect_lex = _load_aspect_lexicon(Path(aspect_lex))
        self.intensifier_lex = _read_lexicon_from_csv("IntensifiersLex.csv")
        self.negation_lex = _read_lexicon_from_csv("NegationSentLex.csv")
        self.parser_name = parser

        if parse:
            if parser == "bist":
                from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

                self.parser = SpacyBISTParser(spacy_model=spacy_model)
            elif parser == "spacy":
                from nlp_architect.utils.text import SpacyInstance

                disable = [
                    "merge_noun_chunks",
                    "ner",
                    "entity_linker",
                    "textcat",
                    "entity_ruler",
                    "sentencizer",
                    "merge_entities",
                ]
                self.parser = SpacyInstance(
                    model=spacy_model, disable=disable, ptb_pos=True, n_jobs=1
                )
        else:
            self.parser = None

    def parse_data(self, data: Union[PathLike, PosixPath], out_dir: Union[str, PathLike]):
        if out_dir:
            Path(out_dir).mkdir(parents=True, exist_ok=True)
        parse_func = parse_docs_bist if self.parser_name == "bist" else parse_docs
        parse_func(self.parser, data, out_dir=out_dir)
        return out_dir

    def run(self, doc: str = None, parsed_doc: CoreNLPDoc = None) -> SentimentDoc:
        """Run SentimentInference on a single document.

        Returns:
            The sentiment annotated document, which contains the detected events per sentence.
        """
        if not parsed_doc:
            if not self.parser:
                raise RuntimeError("Parser not initialized (try parse=True at init)")
            parsed_doc = self.parser.parse([doc])[0]

        sentiment_doc = None
        for sentence in parsed_doc.sentences:
            events = []
            scores = []
            for aspect_row in self.aspect_lex:
                _, asp_events = self._extract_event(aspect_row, sentence)
                for asp_event in asp_events:
                    events.append(asp_event)
                    scores += [term.score for term in asp_event if term.type == TermType.ASPECT]

            if events:
                if not sentiment_doc:
                    sentiment_doc = SentimentDoc(parsed_doc.doc_text)
                sentiment_doc.sentences.append(
                    SentimentSentence(
                        sentence[0]["start"],
                        sentence[-1]["start"] + sentence[-1]["len"] - 1,
                        events,
                    )
                )
        return sentiment_doc

    def run_multiple(
        self,
        data: Union[str, PathLike] = None,
        parsed_data: Union[str, PathLike] = None,
        out_dir: Union[str, PathLike] = INFERENCE_OUT,
    ):
        if not parsed_data:
            if not self.parser:
                raise RuntimeError("Parser not initialized (try parse=True at init)")
            parsed_dir = Path(out_dir) / "parsed" / Path(data).stem
            parsed_data = self.parse_data(data, out_dir=parsed_dir)

        sentiment_docs = {}
        for f, parsed_doc in tqdm(_load_parsed_docs_from_dir(out_dir)):
            sentiment_doc = self.run(parsed_doc=parsed_doc)
            sentiment_docs[f] = sentiment_doc
        return sentiment_docs

    def _extract_intensifier_terms(self, toks, sentiment_index, polarity, sentence):
        """Extract intensifier events from sentence."""
        count = 0
        terms = []
        for intens_i, intens in [(i, x) for i, x in enumerate(toks) if x in self.intensifier_lex]:
            if math.fabs(sentiment_index - intens_i) == 1:
                score = self.intensifier_lex[intens].score
                terms.append(
                    Term(
                        intens,
                        TermType.INTENSIFIER,
                        polarity,
                        score,
                        sentence[intens_i]["start"],
                        sentence[intens_i]["len"],
                    )
                )
                count += abs(score + float(INTENSIFIER_FACTOR))
        return count if count != 0 else 1, terms

    def _extract_neg_terms(self, toks: list, op_i: int, sentence: list) -> tuple:
        """Extract negation terms from sentence.

        Args:
            toks: Sentence text broken down to tokens (words).
            op_i: Index of opinion term in sentence.
            sentence: parsed sentence

        Returns:
            List of negation terms and its aggregated sign (positive or negative).
        """
        sign = 1
        terms = []
        gov_op_i = sentence[op_i]["gov"]
        dep_op_indices = [sentence.index(x) for x in sentence if x["gov"] == op_i]
        for neg_i, negation in [(i, x) for i, x in enumerate(toks) if x in self.negation_lex]:
            position = self.negation_lex[negation].position
            dist = op_i - neg_i
            before = position == "before" and (dist == 1 or neg_i in dep_op_indices)
            after = position == "after" and (dist == -1 or neg_i == gov_op_i)
            both = position == "both" and dist in (1, -1)
            if before or after or both:
                terms.append(
                    Term(
                        negation,
                        TermType.NEGATION,
                        Polarity.NEG,
                        self.negation_lex[negation].score,
                        sentence[toks.index(negation)]["start"],
                        sentence[toks.index(negation)]["len"],
                    )
                )
                sign *= self.negation_lex[negation].score
        return terms, sign

    def _extract_event(self, aspect_row: LexiconElement, parsed_sentence: list) -> tuple:
        """Extract opinion and aspect terms from sentence."""
        event = []
        sent_aspect_pair = None
        real_aspect_indices = _consolidate_aspects(aspect_row.term, parsed_sentence)
        aspect_key = aspect_row.term[0]
        for aspect_index_range in real_aspect_indices:
            for word_index in aspect_index_range:
                sent_aspect_pair, event = self._detect_opinion_aspect_events(
                    word_index, parsed_sentence, aspect_key, aspect_index_range
                )
                if sent_aspect_pair:
                    break
        return sent_aspect_pair, event

    @staticmethod
    def _modify_for_multiple_word(cur_tkn, parsed_sentence, index_range):
        """Modify multiple-word aspect tkn length and start index.

        Args:
            index_range: The index range of the multi-word aspect.
        Returns:
            The modified aspect token.
        """
        if len(index_range) >= 2:
            cur_tkn["start"] = parsed_sentence[index_range[0]]["start"]
            cur_tkn["len"] = len(parsed_sentence[index_range[0]]["text"])
            for i in index_range[1:]:
                cur_tkn["len"] = int(cur_tkn["len"]) + len(parsed_sentence[i]["text"]) + 1
        return cur_tkn

    def _detect_opinion_aspect_events(self, aspect_index, parsed_sent, aspect_key, index_range):
        """Extract opinion-aspect events from sentence.

        Args:
            aspect_index: index of aspect in sentence.
            parsed_sent: current sentence parse tree.
            aspect_key: main aspect term serves as key in aspect dict.
            index_range: The index range of the multi word aspect.

        Returns:
            List of aspect sentiment pair, and list of events extracted.
        """
        all_pairs, events = [], []
        sentence_text_list = [x["text"] for x in parsed_sent]
        sentence_text = " ".join(sentence_text_list)
        for tok_i, tok in enumerate(parsed_sent):
            aspect_op_pair = []
            terms = []
            gov_i = tok["gov"]
            gov = parsed_sent[gov_i]
            gov_text = gov["text"]
            tok_text = tok["text"]

            # 1st order rules
            # Is cur_tkn an aspect and gov an opinion?
            if tok_i == aspect_index:
                if gov_text.lower() in self.opinion_lex:
                    aspect_op_pair.append(
                        (self._modify_for_multiple_word(tok, parsed_sent, index_range), gov)
                    )

            # Is gov an aspect and cur_tkn an opinion?
            if gov_i == aspect_index and tok_text.lower() in self.opinion_lex:
                aspect_op_pair.append(
                    (self._modify_for_multiple_word(gov, parsed_sent, index_range), tok)
                )

            # If not found, try 2nd order rules
            if not aspect_op_pair and tok_i == aspect_index:
                # 2nd order rule #1
                for op_t in parsed_sent:
                    if op_t["gov"] == gov_i and op_t["text"].lower() in self.opinion_lex:
                        aspect_op_pair.append(
                            (self._modify_for_multiple_word(tok, parsed_sent, index_range), op_t)
                        )

                # 2nd order rule #2
                gov_gov = parsed_sent[parsed_sent[gov_i]["gov"]]
                if gov_gov["text"].lower() in self.opinion_lex:
                    aspect_op_pair.append(
                        (self._modify_for_multiple_word(tok, parsed_sent, index_range), gov_gov)
                    )

            # if aspect_tok found
            for aspect, opinion in aspect_op_pair:
                op_tok_i = parsed_sent.index(opinion)
                score = self.opinion_lex[opinion["text"].lower()].score
                neg_terms, sign = self._extract_neg_terms(sentence_text_list, op_tok_i, parsed_sent)
                polarity = Polarity.POS if score * sign > 0 else Polarity.NEG
                intensifier_score, intensifier_terms = self._extract_intensifier_terms(
                    sentence_text_list, op_tok_i, polarity, parsed_sent
                )
                over_all_score = score * sign * intensifier_score
                terms.append(
                    Term(
                        aspect_key,
                        TermType.ASPECT,
                        polarity,
                        over_all_score,
                        aspect["start"],
                        aspect["len"],
                    )
                )
                terms.append(
                    Term(
                        opinion["text"],
                        TermType.OPINION,
                        polarity,
                        over_all_score,
                        opinion["start"],
                        opinion["len"],
                    )
                )
                if len(neg_terms) > 0:
                    terms = terms + neg_terms
                if len(intensifier_terms) > 0:
                    terms = terms + intensifier_terms
                all_pairs.append(
                    [aspect_key, opinion["text"], over_all_score, polarity, sentence_text]
                )
                events.append(terms)
        return all_pairs, events
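A hedged usage sketch for the class above; the lexicon file names are placeholders for the aspect and opinion lexicons produced by the TrainSentiment module mentioned in the docstring.

inference = SentimentInference("generated_aspect_lex.csv",            # placeholder path
                               "generated_opinion_lex_reranked.csv",  # placeholder path
                               parse=True, parser="spacy")
sentiment_doc = inference.run(doc="The food was delicious but the service was slow.")
# sentiment_doc is a SentimentDoc whose sentences carry the detected aspect/opinion events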
Code example #23
class IntentExtractionApi(AbstractApi):
    model_dir = str(LIBRARY_OUT / "intent-pretrained")
    pretrained_model_info = path.join(model_dir, "model_info.dat")
    pretrained_model = path.join(model_dir, "model.h5")

    def __init__(self, prompt=False):
        self.model = None
        self.model_type = None
        self.word_vocab = None
        self.tags_vocab = None
        self.char_vocab = None
        self.intent_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=["tagger", "ner", "parser", "vectors", "textcat"])

    def process_text(self, text):
        input_text = " ".join(text.strip().split())
        return self.nlp.tokenize(input_text)

    @staticmethod
    def _prompt():
        response = input("\nTo download '{}', please enter YES: ".format(
            "intent_extraction"))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == "y"):
            print("Downloading {}...".format("ner"))
            responded_yes = True
        else:
            print("Download declined. Response received {} != YES|Y. ".format(
                res))
            responded_yes = False
        return responded_yes

    @staticmethod
    def _download_pretrained_model(prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        model_info_exists = path.isfile(
            IntentExtractionApi.pretrained_model_info)
        model_exists = path.isfile(IntentExtractionApi.pretrained_model)
        if not model_exists or not model_info_exists:
            print(
                "The pre-trained models to be downloaded for the intent extraction dataset "
                "are licensed under Apache 2.0. By downloading, you accept the terms "
                "and conditions provided by the license")
            makedirs(IntentExtractionApi.model_dir, exist_ok=True)
            if prompt is True:
                agreed = IntentExtractionApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/intent/",
                "model_info.dat",
                IntentExtractionApi.pretrained_model_info,
            )
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/intent/",
                "model.h5",
                IntentExtractionApi.pretrained_model,
            )
            print("Done.")

    @staticmethod
    def display_results(text_str, predictions, intent_type):
        ret = {
            "annotation_set": [],
            "doc_text": " ".join([t for t in text_str])
        }
        spans = []
        available_tags = set()
        for s, e, tag in bio_to_spans(text_str, predictions):
            spans.append({"start": s, "end": e, "type": tag})
            available_tags.add(tag)
        ret["annotation_set"] = list(available_tags)
        ret["spans"] = spans
        ret["title"] = intent_type
        return {"doc": ret, "type": "high_level"}

    def vectorize(self, doc, vocab, char_vocab=None):
        words = np.asarray([
            vocab[w.lower()] if w.lower() in vocab else 1 for w in doc
        ]).reshape(1, -1)
        if char_vocab is not None:
            sentence_chars = []
            for w in doc:
                word_chars = []
                for c in w:
                    if c in char_vocab:
                        _cid = char_vocab[c]
                    else:
                        _cid = 1
                    word_chars.append(_cid)
                sentence_chars.append(word_chars)
            sentence_chars = np.expand_dims(pad_sentences(
                sentence_chars, self.model.word_length),
                                            axis=0)
            return [words, sentence_chars]
        return words

    def inference(self, doc):
        text_arr = self.process_text(doc)
        intent_type = None
        if self.model_type == "mtl":
            doc_vec = self.vectorize(text_arr, self.word_vocab,
                                     self.char_vocab)
            intent, tags = self.model.predict(doc_vec, batch_size=1)
            intent = int(intent.argmax(1).flatten())
            intent_type = self.intent_vocab.get(intent, None)
            print("Detected intent type: {}".format(intent_type))
        else:
            doc_vec = self.vectorize(text_arr, self.word_vocab, None)
            tags = self.model.predict(doc_vec, batch_size=1)
        tags = tags.argmax(2).flatten()
        tag_str = [self.tags_vocab.get(n, None) for n in tags]
        for t, n in zip(text_arr, tag_str):
            print("{}\t{}\t".format(t, n))
        return self.display_results(text_arr, tag_str, intent_type)

    def load_model(self):
        with open(IntentExtractionApi.pretrained_model_info, "rb") as fp:
            model_info = pickle.load(fp)
        self.model_type = model_info["type"]
        self.word_vocab = model_info["word_vocab"]
        self.tags_vocab = {v: k for k, v in model_info["tags_vocab"].items()}
        if self.model_type == "mtl":
            self.char_vocab = model_info["char_vocab"]
            self.intent_vocab = {
                v: k
                for k, v in model_info["intent_vocab"].items()
            }
            model = MultiTaskIntentModel()
        else:
            model = Seq2SeqIntentModel()
        model.load(self.pretrained_model)
        self.model = model
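As with NerApi, a minimal usage sketch relying only on the methods shown above:

api = IntentExtractionApi(prompt=False)
api.load_model()
result = api.inference("set an alarm for seven in the morning")
# result follows display_results: {"doc": {...}, "type": "high_level"}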
Code example #24
class IntentExtractionApi(AbstractApi):
    model_dir = path.join(LIBRARY_STORAGE_PATH, 'intent-pretrained')
    pretrained_model_info = path.join(model_dir, 'model_info.dat')
    pretrained_model = path.join(model_dir, 'model.h5')

    def __init__(self, prompt=True):
        self.model = None
        self.model_type = None
        self.word_vocab = None
        self.tags_vocab = None
        self.char_vocab = None
        self.intent_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

    def process_text(self, text):
        input_text = ' '.join(text.strip().split())
        return self.nlp.tokenize(input_text)

    @staticmethod
    def _prompt():
        response = input('\nTo download \'{}\', please enter YES: '.format(
            'intent_extraction'))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == 'y'):
            print('Downloading {}...'.format('intent_extraction'))
            responded_yes = True
        else:
            print('Download declined. Response received {} != YES|Y. '.format(
                res))
            responded_yes = False
        return responded_yes

    @staticmethod
    def _download_pretrained_model(prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        model_info_exists = path.isfile(
            IntentExtractionApi.pretrained_model_info)
        model_exists = path.isfile(IntentExtractionApi.pretrained_model)
        if not model_exists or not model_info_exists:
            print(
                'The pre-trained models to be downloaded for the intent extraction dataset '
                'are licensed under Apache 2.0. By downloading, you accept the terms '
                'and conditions provided by the license')
            makedirs(IntentExtractionApi.model_dir, exist_ok=True)
            if prompt is True:
                agreed = IntentExtractionApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/intent/',
                'model_info.dat', IntentExtractionApi.pretrained_model_info)
            download_unlicensed_file(
                'http://nervana-modelzoo.s3.amazonaws.com/NLP/intent/',
                'model.h5', IntentExtractionApi.pretrained_model)
            print('Done.')

    @staticmethod
    def display_results(text_str, predictions, intent_type):
        ret = {
            'annotation_set': [],
            'doc_text': ' '.join([t for t in text_str])
        }
        spans = []
        available_tags = set()
        for s, e, tag in bio_to_spans(text_str, predictions):
            spans.append({'start': s, 'end': e, 'type': tag})
            available_tags.add(tag)
        ret['annotation_set'] = list(available_tags)
        ret['spans'] = spans
        ret['title'] = intent_type
        return {'doc': ret, 'type': 'high_level'}

    def vectorize(self, doc, vocab, char_vocab=None):
        words = np.asarray([vocab[w.lower()] if w.lower() in vocab else 1 for w in doc])\
            .reshape(1, -1)
        if char_vocab is not None:
            sentence_chars = []
            for w in doc:
                word_chars = []
                for c in w:
                    if c in char_vocab:
                        _cid = char_vocab[c]
                    else:
                        _cid = 1
                    word_chars.append(_cid)
                sentence_chars.append(word_chars)
            sentence_chars = np.expand_dims(pad_sentences(
                sentence_chars, self.model.word_length),
                                            axis=0)
            return [words, sentence_chars]
        return words

    def inference(self, doc):
        text_arr = self.process_text(doc)
        intent_type = None
        if self.model_type == 'mtl':
            doc_vec = self.vectorize(text_arr, self.word_vocab,
                                     self.char_vocab)
            intent, tags = self.model.predict(doc_vec, batch_size=1)
            intent = int(intent.argmax(1).flatten())
            intent_type = self.intent_vocab.get(intent, None)
            print('Detected intent type: {}'.format(intent_type))
        else:
            doc_vec = self.vectorize(text_arr, self.word_vocab, None)
            tags = self.model.predict(doc_vec, batch_size=1)
        tags = tags.argmax(2).flatten()
        tag_str = [self.tags_vocab.get(n, None) for n in tags]
        for t, n in zip(text_arr, tag_str):
            print('{}\t{}\t'.format(t, n))
        return self.display_results(text_arr, tag_str, intent_type)

    def load_model(self):
        with open(IntentExtractionApi.pretrained_model_info, 'rb') as fp:
            model_info = pickle.load(fp)
        self.model_type = model_info['type']
        self.word_vocab = model_info['word_vocab']
        self.tags_vocab = {v: k for k, v in model_info['tags_vocab'].items()}
        if self.model_type == 'mtl':
            self.char_vocab = model_info['char_vocab']
            self.intent_vocab = {
                v: k
                for k, v in model_info['intent_vocab'].items()
            }
            model = MultiTaskIntentModel()
        else:
            model = Seq2SeqIntentModel()
        model.load(self.pretrained_model)
        self.model = model
Code example #25
        'chunker or \'nlp_arch\' for NLP Architect NP Extractor')

    args = arg_parser.parse_args()
    if args.corpus.endswith('gz'):
        corpus_file = gzip.open(args.corpus,
                                'rt',
                                encoding='utf8',
                                errors='ignore')
    else:
        corpus_file = open(args.corpus, 'r', encoding='utf8', errors='ignore')

    with open(args.marked_corpus, 'w', encoding='utf8') as marked_corpus_file:
        # load spacy parser
        logger.info('loading spacy')
        if 'nlp_arch' in args.chunker:
            nlp = SpacyInstance(model='en_core_web_sm',
                                disable=['textcat', 'ner', 'parser']).parser
            nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect word'
                ' chunker model is licensed under Apache 2.0')
            _path_to_model = path.join(cur_dir, chunker_model_file)
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
            _path_to_params = path.join(cur_dir, chunker_model_dat_file)
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
            logger.info('Done.')
            nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                         last=True)
        else:
            nlp = SpacyInstance(model='en_core_web_sm',
Code example #26
class NerApi(AbstractApi):
    """
    NER model API
    """

    model_dir = str(LIBRARY_OUT / "ner-pretrained")
    pretrained_model = path.join(model_dir, "model_v4.h5")
    pretrained_model_info = path.join(model_dir, "model_info_v4.dat")

    def __init__(self, prompt=True):
        self.model = None
        self.model_info = None
        self.word_vocab = None
        self.y_vocab = None
        self.char_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(
            disable=["tagger", "ner", "parser", "vectors", "textcat"])

    @staticmethod
    def _prompt():
        response = input(
            "\nTo download '{}', please enter YES: ".format("ner"))
        res = response.lower().strip()
        if res == "yes" or (len(res) == 1 and res == "y"):
            print("Downloading {}...".format("ner"))
            responded_yes = True
        else:
            print("Download declined. Response received {} != YES|Y. ".format(
                res))
            responded_yes = False
        return responded_yes

    def _download_pretrained_model(self, prompt=True):
        """Downloads the pre-trained BIST model if non-existent."""
        model_exists = path.isfile(self.pretrained_model)
        model_info_exists = path.isfile(self.pretrained_model_info)
        if not model_exists or not model_info_exists:
            print(
                "The pre-trained models to be downloaded for the NER dataset "
                "are licensed under Apache 2.0. By downloading, you accept the terms "
                "and conditions provided by the license")
            makedirs(self.model_dir, exist_ok=True)
            if prompt is True:
                agreed = NerApi._prompt()
                if agreed is False:
                    sys.exit(0)
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/ner/",
                "model_v4.h5",
                self.pretrained_model,
            )
            download_unlicensed_file(
                "https://s3-us-west-2.amazonaws.com/nlp-architect-data"
                "/models/ner/",
                "model_info_v4.dat",
                self.pretrained_model_info,
            )
            print("Done.")

    def load_model(self):
        self.model = NERCRF()
        self.model.load(self.pretrained_model)
        with open(self.pretrained_model_info, "rb") as fp:
            model_info = pickle.load(fp)
        self.word_vocab = model_info["word_vocab"]
        self.y_vocab = {v: k for k, v in model_info["y_vocab"].items()}
        self.char_vocab = model_info["char_vocab"]

    @staticmethod
    def pretty_print(text, tags):
        spans = []
        for s, e, tag in bio_to_spans(text, tags):
            spans.append({"start": s, "end": e, "type": tag})
        ents = dict((obj["type"].lower(), obj) for obj in spans).keys()
        ret = {
            "doc_text": " ".join(text),
            "annotation_set": list(ents),
            "spans": spans,
            "title": "None",
        }
        print({"doc": ret, "type": "high_level"})
        return {"doc": ret, "type": "high_level"}

    def process_text(self, text):
        input_text = " ".join(text.strip().split())
        return self.nlp.tokenize(input_text)

    def vectorize(self, doc, vocab, char_vocab):
        words = np.asarray([
            vocab[w.lower()] if w.lower() in vocab else 1 for w in doc
        ]).reshape(1, -1)
        sentence_chars = []
        for w in doc:
            word_chars = []
            for c in w:
                if c in char_vocab:
                    _cid = char_vocab[c]
                else:
                    _cid = 1
                word_chars.append(_cid)
            sentence_chars.append(word_chars)
        sentence_chars = np.expand_dims(pad_sentences(sentence_chars,
                                                      self.model.word_length),
                                        axis=0)
        return words, sentence_chars

    def inference(self, doc):
        text_arr = self.process_text(doc)
        doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
        seq_len = np.array([len(text_arr)]).reshape(-1, 1)
        # pylint: disable=no-member
        inputs = list(doc_vec) + [seq_len]
        doc_ner = self.model.predict(inputs, batch_size=1).argmax(2).flatten()
        tags = [self.y_vocab.get(n, None) for n in doc_ner]
        return self.pretty_print(text_arr, tags)
Code example #27
File: inference.py Project: neuroph12/intel_nlp
                              '{}.params'.format(str(args.model_name)))
    validate_existing_filepath(model_path)
    validate_existing_filepath(settings_path)

    # load model and parameters
    model = SequenceChunker()
    model.load(model_path)
    word_length = model.max_word_len
    with open(settings_path, 'rb') as fp:
        model_params = pickle.load(fp)
        word_vocab = model_params['word_vocab']
        chunk_vocab = model_params['chunk_vocab']
        char_vocab = model_params.get('char_vocab', None)

    # parse documents and get tokens
    nlp = SpacyInstance(
        disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
    with open(args.input_file) as fp:
        document_texts = [nlp.tokenize(t.strip()) for t in fp.readlines()]

    # vectorize input tokens and run inference
    doc_vecs = vectorize(document_texts, word_vocab, char_vocab)
    document_annotations = []
    for vec in doc_vecs:
        doc_chunks = model.predict(vec, batch_size=args.b)
        chunk_a = [
            chunk_vocab.id_to_word(l) for l in doc_chunks.argmax(2).flatten()
        ]
        document_annotations.append(chunk_a)

    # print document text and annotations
    build_annotation(document_texts, document_annotations)
Code example #28
File: wiki_online.py Project: neuroph12/intel_nlp
class WikiOnline(object):
    def __init__(self):
        import pywikibot
        self.spacy = SpacyInstance()
        self.pywikibot = pywikibot
        self.cache = dict()
        self.site = pywikibot.Site(
            'en', 'wikipedia')  # The site we want to run our bot on

    def get_pages(self, phrase):
        if phrase in self.cache:
            return self.cache[phrase]

        ret_pages = set()
        word_clean = phrase.replace('-', ' ')
        word_lower = word_clean.lower()
        word_upper = word_clean.upper()
        word_title = word_clean.title()
        words_set = {phrase, word_clean, word_lower, word_upper, word_title}
        for appr in words_set:
            try:
                page_result = self.get_page_redirect(appr)
                if page_result.pageid != 0:
                    full_page = self.get_wiki_page_with_items(
                        phrase, page_result)
                    ret_pages.add(WikipediaSearchPageResult(appr, full_page))
            except Exception as e:
                logger.error(e)

        self.cache[phrase] = ret_pages
        return ret_pages

    # pylint: disable=protected-access
    def get_wiki_page_with_items(self, phrase, page):
        item = self.get_wiki_page_item(page)
        pageid = page.pageid
        aliases = self.get_aliases(item)
        description = self.get_description(item)
        text = page.text
        page_title = page._link._title

        relations = WikipediaPageExtractedRelations()
        relations.is_disambiguation = self.is_disambiguation_page(item)
        relations.is_part_name = self.is_name_description(
            text, item, relations.is_disambiguation)
        relations.aliases = aliases
        relations.be_comp, relations.be_comp_norm = self.extract_be_comp(text)
        relations.extract_relations_from_text_v0(text)

        ret_page = WikipediaPage(phrase, None, page_title, None, 0, pageid,
                                 description, relations)

        logger.debug('Page: {}. Extracted successfully'.format(ret_page))

        return ret_page

    def get_wiki_page_item(self, page):
        if page is not None:
            try:
                item = self.pywikibot.ItemPage.fromPage(
                    page)  # this can be used for any page object
                item.get()  # need to call it to access any data.
                return item
            except (self.pywikibot.NoPage, AttributeError, TypeError,
                    NameError):
                pass
        return None

    def get_page_redirect(self, word):
        page = self.pywikibot.Page(self.site, word)
        if page.pageid != 0 and page.isRedirectPage():
            return page.getRedirectTarget()
        return page

    @staticmethod
    def get_aliases(item):
        if item is not None and item.aliases is not None:
            if 'en' in item.aliases:
                aliases = item.aliases['en']
                return aliases

        return None

    @staticmethod
    def get_description(item):
        description = {}
        if item is not None:
            item_desc = item.get()
            if 'descriptions' in item_desc and 'en' in item_desc[
                    'descriptions']:
                description['descriptions'] = dict([
                    ('en', item_desc['descriptions']['en'])
                ])

        return description

    @staticmethod
    def is_disambiguation_page(item):
        if item is not None:
            dic = item.get()
            if dic is not None and 'descriptions' in dic:
                desc = dic['descriptions']
                if desc is not None and 'en' in desc:
                    return desc['en'].lower() in DISAMBIGUATE_PAGE

        return False

    @staticmethod
    def is_name_description(text, item, is_disambiguation):
        if item is not None:
            if is_disambiguation:
                if WikipediaPageExtractedRelations.is_name_part(text):
                    return True
            else:
                dic = item.get()
                if dic is not None and 'descriptions' in dic:
                    desc = dic['descriptions']
                    if desc is not None and 'en' in desc:
                        if [
                                s for s in NAME_DESCRIPTIONS
                                if s in desc['en'].lower()
                        ]:
                            return True
        return False

    # pylint: disable=no-else-return
    def extract_be_comp(self, text):
        # str.find returns -1 when the bold title marker is missing, so bail
        # out early instead of raising ValueError / UnboundLocalError
        first_sentence_start_index = text.find("'''")
        if first_sentence_start_index == -1:
            return None, None
        last_temp_index = text.find('\n', first_sentence_start_index)
        if last_temp_index == -1:
            last_temp_index = len(text)

        first_paragraph = text[first_sentence_start_index:last_temp_index]
        if WikiOnline.extract_be_a_index(
                first_paragraph) is None and last_temp_index != len(text):
            return self.extract_be_comp(text[last_temp_index:])
        elif last_temp_index == len(text):
            return None, None

        first_paragraph_clean = re.sub(r'\([^)]*\)', '', first_paragraph)
        first_paragraph_clean = re.sub(r'<[^>]*>', '', first_paragraph_clean)
        first_paragraph_clean = re.sub(r'{[^}]*}', '', first_paragraph_clean)
        first_paragraph_clean = re.sub(r'\[\[[^]]*\]\]', '',
                                       first_paragraph_clean)
        first_paragraph_clean = re.sub(r'[\']', '', first_paragraph_clean)
        first_paragraph_clean = re.sub(r'&nbsp;', ' ', first_paragraph_clean)

        return self.extract_be_comp_relations(first_paragraph_clean)

    # pylint: disable=not-callable
    def extract_be_comp_relations(self, first_paragraph):
        be_comp = set()
        be_comp_norm = set()
        if first_paragraph:
            doc = self.spacy.parser(first_paragraph)
            for token in doc:
                target = token.text
                target_lemma = token.lemma_
                relation = token.dep_
                governor = token.head.text
                governor_lemma = token.head.lemma_
                if relation == 'acl':
                    break
                if relation == 'punct' and target == '.':
                    break
                elif relation == 'cop':
                    be_comp.add(governor)
                    be_comp_norm.add(governor_lemma)
                elif relation == 'nsubj':
                    be_comp.add(target)
                    be_comp_norm.add(target_lemma)
                elif relation == 'dep':
                    be_comp.add(governor)
                    be_comp_norm.add(governor_lemma)
                elif relation == 'compound':
                    be_comp.add(target + ' ' + governor)
                    be_comp_norm.add(target_lemma + ' ' + governor_lemma)
                elif relation == 'amod':
                    be_comp.add(target + ' ' + governor)
                    be_comp_norm.add(target_lemma + ' ' + governor_lemma)
                elif relation in ['conj', 'appos']:
                    be_comp.add(target)
                    be_comp_norm.add(target_lemma)

        return be_comp, be_comp_norm

    @staticmethod
    def extract_be_a_index(sentence):
        # return the position of the first matching copular pattern,
        # or None if the sentence contains none of them
        for pattern in ('is a', 'are a', 'was a', 'were a', 'be a',
                        'is the', 'are the', 'was the', 'were the', 'be the'):
            if pattern in sentence:
                return sentence.index(pattern)
        return None
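
A minimal usage sketch for the WikiOnline class above, assuming pywikibot is installed and configured for anonymous read access to English Wikipedia; the query phrase is only an illustration:

wiki = WikiOnline()
# returns a set of WikipediaSearchPageResult objects; repeated calls
# for the same phrase are served from the in-memory cache
results = wiki.get_pages('Barack Obama')
for result in results:
    print(result)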
Code Example #29
File: np_scorer.py Project: yehuangcn/nlp-architect
class NPScorer(object):
    def __init__(self, parser=None):
        if parser is None:
            self.nlp = SpacyInstance(
                disable=["ner", "parser", "vectors", "textcat"]).parser
        else:
            self.nlp = parser

        self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"), first=True)
        _path_to_model = path.join(chunker_local_path, chunker_model_file)
        if not path.exists(chunker_local_path):
            makedirs(chunker_local_path)
        if not path.exists(_path_to_model):
            logger.info(
                "The pre-trained model to be downloaded for NLP Architect word"
                " chunker model is licensed under Apache 2.0")
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
        _path_to_params = path.join(chunker_local_path, chunker_model_dat_file)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
        self.nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                          last=True)

    def score_documents(self,
                        texts: list,
                        limit=-1,
                        return_all=False,
                        min_tf=5):
        documents = []
        assert len(texts) > 0, "texts should contain at least 1 document"
        assert min_tf > 0, "min_tf should be at least 1"
        with tqdm(total=len(texts),
                  desc="documents scoring progress",
                  unit="docs") as pbar:
            for doc in self.nlp.pipe(texts, n_threads=-1):
                if len(doc) > 0:
                    documents.append(doc)
                pbar.update(1)

        corpus = []
        for doc in documents:
            spans = get_noun_phrases(doc)
            if len(spans) > 0:
                corpus.append((doc, spans))

        if len(corpus) < 1:
            return []

        documents, doc_phrases = list(zip(*corpus))
        scorer = TextSpanScoring(documents=documents,
                                 spans=doc_phrases,
                                 min_tf=min_tf)
        tfidf_scored_list = scorer.get_tfidf_scores()
        if len(tfidf_scored_list) < 1:
            return []
        cvalue_scored_list = scorer.get_cvalue_scores()
        freq_scored_list = scorer.get_freq_scores()

        if limit > 0:
            tf = {tuple(k[0]): k[1] for k in tfidf_scored_list}
            cv = {tuple(k[0]): k[1] for k in cvalue_scored_list}
            fr = {tuple(k[0]): k[1] for k in freq_scored_list}
            tfidf_scored_list_limit = []
            cvalue_scored_list_limit = []
            freq_scored_list_limit = []
            for phrase in list(zip(*tfidf_scored_list))[0][:limit]:
                tfidf_scored_list_limit.append((phrase, tf[tuple(phrase)]))
                cvalue_scored_list_limit.append((phrase, cv[tuple(phrase)]))
                freq_scored_list_limit.append((phrase, fr[tuple(phrase)]))
            tfidf_scored_list = tfidf_scored_list_limit
            cvalue_scored_list = cvalue_scored_list_limit
            freq_scored_list = freq_scored_list_limit

        tfidf_scored_list = scorer.normalize_l2(tfidf_scored_list)
        cvalue_scored_list = scorer.normalize_l2(cvalue_scored_list)
        freq_scored_list = scorer.normalize_minmax(freq_scored_list,
                                                   invert=True)
        tfidf_scored_list = scorer.normalize_minmax(tfidf_scored_list)
        cvalue_scored_list = scorer.normalize_minmax(cvalue_scored_list)
        if return_all:
            tf = {tuple(k[0]): k[1] for k in tfidf_scored_list}
            cv = {tuple(k[0]): k[1] for k in cvalue_scored_list}
            fr = {tuple(k[0]): k[1] for k in freq_scored_list}
            final_list = []
            for phrases in tf.keys():
                final_list.append(([p for p in phrases], tf[phrases],
                                   cv[phrases], fr[phrases]))
            return final_list
        merged_list = scorer.interpolate_scores(
            [tfidf_scored_list, cvalue_scored_list], [0.5, 0.5])
        merged_list = scorer.multiply_scores([merged_list, freq_scored_list])
        merged_list = scorer.normalize_minmax(merged_list)
        final_list = []
        for phrases, score in merged_list:
            if any([len(p) > 1 for p in phrases]):
                final_list.append(([p for p in phrases], score))
        return final_list
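
A minimal usage sketch for NPScorer, assuming a spaCy English model is available and the pre-trained chunker model can be downloaded on first use; the input sentences and the min_tf value are only illustrative:

scorer = NPScorer()
texts = [
    "Noun phrase extraction finds candidate terms in raw text.",
    "Noun phrase scoring combines tf-idf, c-value and frequency signals.",
]
# each returned entry pairs a group of phrase strings with a merged, normalized score
for phrases, score in scorer.score_documents(texts, limit=10, min_tf=1):
    print(phrases, score)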
Code Example #30
                              "{}.params".format(str(args.model_name)))
    validate_existing_filepath(model_path)
    validate_existing_filepath(settings_path)

    # load model and parameters
    model = SequenceChunker()
    model.load(model_path)
    word_length = model.max_word_len
    with open(settings_path, "rb") as fp:
        model_params = pickle.load(fp)
        word_vocab = model_params["word_vocab"]
        chunk_vocab = model_params["chunk_vocab"]
        char_vocab = model_params.get("char_vocab", None)

    # parse documents and get tokens
    nlp = SpacyInstance(
        disable=["tagger", "ner", "parser", "vectors", "textcat"])
    with open(args.input_file) as fp:
        document_texts = [nlp.tokenize(t.strip()) for t in fp.readlines()]

    # vectorize input tokens and run inference
    doc_vecs = vectorize(document_texts, word_vocab, char_vocab)
    document_annotations = []
    for vec in doc_vecs:
        doc_chunks = model.predict(vec, batch_size=args.b)
        chunk_a = [
            chunk_vocab.id_to_word(l) for l in doc_chunks.argmax(2).flatten()
        ]
        document_annotations.append(chunk_a)

    # print document text and annotations
    build_annotation(document_texts, document_annotations)
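
The snippet above uses SpacyInstance purely as a tokenizer before vectorization. A minimal sketch of that step in isolation, assuming SpacyInstance is importable from nlp_architect.utils.text and using a made-up sample sentence:

from nlp_architect.utils.text import SpacyInstance

nlp = SpacyInstance(
    disable=["tagger", "ner", "parser", "vectors", "textcat"])
# tokenize() returns a plain list of token strings for one input text
tokens = nlp.tokenize("The quick brown fox jumps over the lazy dog.")
print(tokens)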
Code Example #31
File: wiki_online.py Project: yyzreal/nlp-architect
class WikiOnline(object):
    def __init__(self):
        try:
            import pywikibot
        except (AttributeError, ImportError):
            logger.error(
                "pywikibot is not installed, please install nlp_architect with [all] package. "
                + "for example: pip install nlp_architect[all]")
            sys.exit()
        self.spacy = SpacyInstance()
        self.pywikibot = pywikibot
        self.cache = dict()
        self.site = pywikibot.Site(
            "en", "wikipedia")  # The site we want to run our bot on

    def get_pages(self, phrase):
        if phrase in self.cache:
            return self.cache[phrase]

        ret_pages = set()
        word_clean = phrase.replace("-", " ")
        word_lower = word_clean.lower()
        word_upper = word_clean.upper()
        word_title = word_clean.title()
        words_set = {phrase, word_clean, word_lower, word_upper, word_title}
        for appr in words_set:
            try:
                page_result = self.get_page_redirect(appr)
                if page_result.pageid != 0:
                    full_page = self.get_wiki_page_with_items(
                        phrase, page_result)
                    ret_pages.add(WikipediaSearchPageResult(appr, full_page))
            except Exception as e:
                logger.error(e)

        self.cache[phrase] = ret_pages
        return ret_pages

    # pylint: disable=protected-access
    def get_wiki_page_with_items(self, phrase, page):
        item = self.get_wiki_page_item(page)
        pageid = page.pageid
        aliases = self.get_aliases(item)
        description = self.get_description(item)
        text = page.text
        page_title = page._link._title

        relations = WikipediaPageExtractedRelations()
        relations.is_disambiguation = self.is_disambiguation_page(item)
        relations.is_part_name = self.is_name_description(
            text, item, relations.is_disambiguation)
        relations.aliases = aliases
        relations.be_comp, relations.be_comp_norm = self.extract_be_comp(text)
        relations.extract_relations_from_text_v0(text)

        ret_page = WikipediaPage(phrase, None, page_title, None, 0, pageid,
                                 description, relations)

        logger.debug("Page: {}. Extracted successfully".format(ret_page))

        return ret_page

    def get_wiki_page_item(self, page):
        if page is not None:
            try:
                item = self.pywikibot.ItemPage.fromPage(
                    page)  # this can be used for any page object
                item.get()  # need to call it to access any data.
                return item
            except (self.pywikibot.NoPage, AttributeError, TypeError,
                    NameError):
                pass
        return None

    def get_page_redirect(self, word):
        page = self.pywikibot.Page(self.site, word)
        if page.pageid != 0 and page.isRedirectPage():
            return page.getRedirectTarget()
        return page

    @staticmethod
    def get_aliases(item):
        if item is not None and item.aliases is not None:
            if "en" in item.aliases:
                aliases = item.aliases["en"]
                return aliases

        return None

    @staticmethod
    def get_description(item):
        description = {}
        if item is not None:
            item_desc = item.get()
            if "desctiptions" in item_desc and "en" in item_desc[
                    "descriptions"]:
                dict([("age", 25)])
                description["descriptions"] = dict([
                    ("en", item_desc["descriptions"]["en"])
                ])

        return description

    @staticmethod
    def is_disambiguation_page(item):
        if item is not None:
            dic = item.get()
            if dic is not None and "descriptions" in dic:
                desc = dic["descriptions"]
                if desc is not None and "en" in desc:
                    return desc["en"].lower() in DISAMBIGUATE_PAGE

        return False

    @staticmethod
    def is_name_description(text, item, is_disambiguation):
        if item is not None:
            if is_disambiguation:
                if WikipediaPageExtractedRelations.is_name_part(text):
                    return True
            else:
                dic = item.get()
                if dic is not None and "descriptions" in dic:
                    desc = dic["descriptions"]
                    if desc is not None and "en" in desc:
                        if [
                                s for s in NAME_DESCRIPTIONS
                                if s in desc["en"].lower()
                        ]:
                            return True
        return False

    # pylint: disable=no-else-return
    def extract_be_comp(self, text):
        # str.find returns -1 when the bold title marker is missing, so bail
        # out early instead of raising ValueError / UnboundLocalError
        first_sentence_start_index = text.find("'''")
        if first_sentence_start_index == -1:
            return None, None
        last_temp_index = text.find("\n", first_sentence_start_index)
        if last_temp_index == -1:
            last_temp_index = len(text)

        first_paragraph = text[first_sentence_start_index:last_temp_index]
        if WikiOnline.extract_be_a_index(
                first_paragraph) is None and last_temp_index != len(text):
            return self.extract_be_comp(text[last_temp_index:])
        elif last_temp_index == len(text):
            return None, None

        first_paragraph_clean = re.sub(r"\([^)]*\)", "", first_paragraph)
        first_paragraph_clean = re.sub(r"<[^>]*>", "", first_paragraph_clean)
        first_paragraph_clean = re.sub(r"{[^}]*}", "", first_paragraph_clean)
        first_paragraph_clean = re.sub(r"\[\[[^]]*\]\]", "",
                                       first_paragraph_clean)
        first_paragraph_clean = re.sub(r"[\']", "", first_paragraph_clean)
        first_paragraph_clean = re.sub(r"&nbsp;", " ", first_paragraph_clean)

        return self.extract_be_comp_relations(first_paragraph_clean)

    # pylint: disable=not-callable
    def extract_be_comp_relations(self, first_paragraph):
        be_comp = set()
        be_comp_norm = set()
        if first_paragraph:
            doc = self.spacy.parser(first_paragraph)
            for token in doc:
                target = token.text
                target_lemma = token.lemma_
                relation = token.dep_
                governor = token.head.text
                governor_lemma = token.head.lemma_
                if relation == "acl":
                    break
                if relation == "punct" and target == ".":
                    break
                elif relation == "cop":
                    be_comp.add(governor)
                    be_comp_norm.add(governor_lemma)
                elif relation == "nsubj":
                    be_comp.add(target)
                    be_comp_norm.add(target_lemma)
                elif relation == "dep":
                    be_comp.add(governor)
                    be_comp_norm.add(governor_lemma)
                elif relation == "compound":
                    be_comp.add(target + " " + governor)
                    be_comp_norm.add(target_lemma + " " + governor_lemma)
                elif relation == "amod":
                    be_comp.add(target + " " + governor)
                    be_comp_norm.add(target_lemma + " " + governor_lemma)
                elif relation in ["conj", "appos"]:
                    be_comp.add(target)
                    be_comp_norm.add(target_lemma)

        return be_comp, be_comp_norm

    @staticmethod
    def extract_be_a_index(sentence):
        # return the position of the first matching copular pattern,
        # or None if the sentence contains none of them
        for pattern in ("is a", "are a", "was a", "were a", "be a",
                        "is the", "are the", "was the", "were the", "be the"):
            if pattern in sentence:
                return sentence.index(pattern)
        return None