def test_backpointer():
    nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
    doc = nlp(EN_DOC2)
    ent = doc.ents[0]
    assert ent.sent is doc.sentences[0]
    assert list(doc.iter_words())[0].sent is doc.sentences[0]
    assert list(doc.iter_tokens())[-1].sent is doc.sentences[-1]
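As a usage note (not part of the test suite), a minimal sketch of what the .sent backpointer gives you in practice, assuming an English pipeline with NER:

import stanza

nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
doc = nlp("Barack Obama was born in Hawaii. He moved to Chicago later.")
for ent in doc.ents:
    # every entity Span keeps a reference to its enclosing Sentence
    print(ent.text, '->', ent.sent.text)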
Example #2
    def __init__(self, port=9001):
        self.nlp = stanza.Pipeline('en')  # initialize English neural pipeline
        self.client = CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'parse'],
            timeout=60000,
            memory='4G',
            endpoint=f'http://localhost:{port}')
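For reference, a hedged sketch of how such a client is typically used (assuming a local CoreNLP installation; the context manager starts and stops the Java server):

from stanza.server import CoreNLPClient

text = "Stanford University is located in California."
with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos'],
                   timeout=60000, memory='4G') as client:
    ann = client.annotate(text)  # protobuf Document
    for sentence in ann.sentence:
        for token in sentence.token:
            print(token.word, token.pos)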
def test_readonly():
    Document.add_property('some_property', 123)
    nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
    doc = nlp(EN_DOC)
    assert doc.some_property == 123
    with pytest.raises(ValueError):
        doc.some_property = 456
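Beyond a constant default, add_property also accepts optional getter/setter callables; a small sketch under that assumption, defining a derived read-only value:

from stanza.models.common.doc import Document

# derived, read-only property computed from the raw document text
Document.add_property('char_count', default=0, getter=lambda self: len(self.text))

doc = nlp(EN_DOC)
print(doc.char_count)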
def load_conll_data(file_path):
    instances = []
    words = []
    labels = []
    with open(file_path, 'r') as fp:
        for line in fp:
            line = line.strip()
            if len(line) == 0:
                if len(words) != 0:
                    instance = dict(words=words, labels=labels)
                    instances.append(instance)
                words = []
                labels = []
            else:
                columns = line.split()
                words.append(columns[0])
                labels.append(columns[-1])

    if len(words) != 0:
        instance = dict(words=words, labels=labels)
        instances.append(instance)

    nlp = stanza.Pipeline(lang='en', tokenize_pretokenized=True)

    for instance in tqdm(instances):
        doc = nlp([instance['words']])
        sentence = doc.sentences[0]
        dep_head = [''] * len(instance['words'])
        for i, word in enumerate(sentence.words):
            dep_head[i] = word.head - 1
        instance['heads'] = dep_head

    return instances
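A possible call, assuming a CoNLL-style file (the file name train.conll is a placeholder) with the token in the first column and the label in the last:

instances = load_conll_data('train.conll')
print(len(instances))
first = instances[0]
print(first['words'][:5])
print(first['labels'][:5])
print(first['heads'][:5])  # 0-based dependency heads, -1 for the root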
Example #5
def test_tokenize():
    nlp = stanza.Pipeline(processors='tokenize',
                          dir=TEST_MODELS_DIR,
                          lang='en')
    doc = nlp(EN_DOC)
    assert EN_DOC_GOLD_TOKENS == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
Example #6
def test_no_ssplit():
    nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'en',
                                  'tokenize_no_ssplit': True})

    doc = nlp(EN_DOC_NO_SSPLIT)
    assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences]
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
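With tokenize_no_ssplit=True, stanza is documented to skip sentence segmentation and treat blank lines as the only sentence boundaries; a minimal sketch of that behavior:

nlp = stanza.Pipeline(lang='en', processors='tokenize', dir=TEST_MODELS_DIR,
                      tokenize_no_ssplit=True)
doc = nlp("One sentence. Another on the same line.\n\nA second block.")
print(len(doc.sentences))  # 2: the split happens only at the blank line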
Example #7
def test_jieba():
    nlp = stanza.Pipeline(lang='zh', dir=TEST_MODELS_DIR, processors={'tokenize': 'jieba'}, package=None)
    doc = nlp(ZH_DOC)

    assert "JiebaTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert ZH_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
    def __load_pipeline(self):
        Globals = globals()
        Globals_keys = set(Globals.keys())
        pipeline_vars = set(['s_nlp','s_nlp_lang','s_nlp_path'])

        is_loaded = pipeline_vars.issubset(Globals_keys) and Globals['s_nlp'] is not None
        if is_loaded:
            same_processors = set(Globals['s_nlp'].processors.keys()) == set(['tokenize', 'depparse', 'pos', 'lemma'])
            same_gpu_use = not Globals['s_nlp'].use_gpu 
            same_lang = Globals['s_nlp_lang'] == self.lang
            same_path = Globals['s_nlp_path'] == self.stanza_path
            same_setup = same_lang and same_path and same_processors and same_gpu_use
        else:
            same_setup = False
        
        if not is_loaded or not same_setup:
            if self.__globalize_stanza:
                global s_nlp
                global s_nlp_lang
                global s_nlp_path
            s_nlp_lang = self.lang
            s_nlp_path = self.stanza_path
            s_nlp = stanza.Pipeline(
                lang = s_nlp_lang, dir = s_nlp_path,
                processors = "tokenize,lemma,pos,depparse")
        
        return s_nlp 
Example #9
def test_sudachipy_no_ssplit():
    nlp = stanza.Pipeline(lang='ja', dir=TEST_MODELS_DIR, processors={'tokenize': 'sudachipy'}, tokenize_no_ssplit=True, package=None)
    doc = nlp(JA_DOC)

    assert "SudachiPyTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert JA_DOC_GOLD_NOSSPLIT_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
    def _sent_split_corpus(self, arr_input_text):
        """ tokenize corpus given tokenizer by config file"""
        # arr_input_text = pd_input['essay'].values

        # num_over = 0
        # total_sent = 0

        import stanza  # stanford library for tokenizer
        tokenizer_stanza = stanza.Pipeline('en', processors='tokenize', use_gpu=True)

        num_sents = []
        sent_corpus = []  # tokenized to form of [doc, list of sentences]
        for cur_doc in arr_input_text:
            cur_doc = self._refine_text(cur_doc)  # cur_doc: single string
            
            # sent_list = [sent.string.strip() for sent in spacy_nlp(cur_doc).sents] # spacy style

            ## stanza version
            doc_stanza = tokenizer_stanza(cur_doc)
            sent_list = [sentence.text for sentence in doc_stanza.sentences]
           
            ## normal version
            # sent_list = self.sent_tokenzier(cur_doc)  # following exactly same way with previous works
            
            sent_corpus.append(sent_list)
            num_sents.append(len(sent_list))

        return sent_corpus, num_sents
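A hypothetical call, assuming an instance corpus_reader of the enclosing class (the class itself is not shown here):

essays = ["First essay. It has two sentences.", "Second essay in one sentence."]
sent_corpus, num_sents = corpus_reader._sent_split_corpus(essays)
print(num_sents)       # e.g. [2, 1]
print(sent_corpus[0])  # list of sentence strings for the first essay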
Example #11
def main(args):
    """Visualization of contexts, questions, and colored answer spans."""

    # Load dataset, and optionally shuffle.
    dataset = QADataset(args, args.path)
    samples = dataset.samples
    if args.shuffle:
        random.shuffle(samples)

    vis_samples = samples[:args.samples]

    print()
    print('-' * RULE_LENGTH)
    print()

    # Load the English pipeline once, outside the per-sample loop.
    stanza.download('en')
    en_nlp = stanza.Pipeline('en')

    # Visualize samples.
    for (qid, context, question, answer_start, answer_end) in vis_samples[:10]:
        cxt = _build_string(context)
        print(cxt)
        en_doc = en_nlp(cxt)

        for i, sent in enumerate(en_doc.sentences):
            print(f"[Sentence {i+1}]")
            for word in sent.words:
                print("{:12s}\t{:12s}\t{:6s}\t{:d}\t{:12s}".format(
                    word.text, word.lemma, word.pos, word.head, word.deprel))
            print("")  # blank line between sentences

        print("Mention text\tType\tStart-End")
        for ent in en_doc.ents:
            print("{}\t{}\t{}-{}".format(ent.text, ent.type, ent.start_char,
                                         ent.end_char))
Example #12
def stanza_gen(texts,
               lang,
               processors="tokenize,mwt,lemma,pos,depparse,ner",
               stanza_path=os.path.join(str(Path.home()), 'stanza_resources'),
               verbose=True,
               **kwargs):
    """
    texts (iter): an iterator of strings
    lang (str): language code
    stanza_path (path): the path for saving stanza resources

    Examples:
    >>> sg = stanza_gen(texts = ["dette er en test text"], lang = "da",\
        verbose=False)
    >>> type(sg)
    <class 'generator'>
    >>> sg_unpacked = list(sg)
    >>> type(sg_unpacked[0])
    <class 'pandas.core.frame.DataFrame'>
    """
    s_nlp = stanza.Pipeline(lang=lang,
                            processors=processors,
                            dir=stanza_path,
                            verbose=verbose,
                            **kwargs)
    for text in texts:
        doc = s_nlp(text)

        sent_ids = dict()
        sent_n = None

        def __get_ent(n_sent, sent, word):
            nonlocal sent_ids
            nonlocal sent_n
            if sent_n != n_sent:
                # rebuild the word-id -> entity-type map once per sentence
                sent_ids = {
                    w.id: ent.type
                    for ent in sent.ents for w in ent.words
                }
                sent_n = n_sent
            if word.id in sent_ids:
                return sent_ids[word.id]

        # extract from doc
        extr = (
            (
                n_sent,  # sentence number
                word.text,
                word.lemma,
                word.upos,
                word.xpos,  # pos-tags
                word.deprel,
                __get_ent(n_sent, sent, word))
            for n_sent, sent in enumerate(doc.sentences)
            for word in sent.words)
        cols = [
            "n_sent", "token", "lemma", "upos", "xpos", "dependency relation",
            "ner"
        ]
        yield pd.DataFrame(extr, columns=cols)
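Since stanza_gen yields one DataFrame per input text, the pieces can be concatenated afterwards; a usage sketch mirroring the doctest above (assuming pandas is imported as pd, as in the function body):

dfs = list(stanza_gen(texts=["dette er en test text"], lang="da", verbose=False))
combined = pd.concat(dfs, ignore_index=True)
print(combined[["n_sent", "token", "upos", "ner"]].head())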
Example #13
def apply_packaged_translation(pkg, input_text, translator, num_hypotheses=4):
    """Applies the translation in pkg to translate input_text.

    Args:
        pkg (Package): The package that provides the translation.
        input_text (str): The text to be translated.
        translator (ctranslate2.Translator): The CTranslate2 Translator
        num_hypotheses (int): The number of hypotheses to generate

    Returns:
        [Hypothesis]: A list of Hypothesis objects for translating input_text

    """

    info('apply_packaged_translation')
    sp_model_path = str(pkg.package_path / 'sentencepiece.model')
    sp_processor = spm.SentencePieceProcessor(model_file=sp_model_path)
    stanza_pipeline = stanza.Pipeline(lang=pkg.from_code,
                                      dir=str(pkg.package_path / 'stanza'),
                                      processors='tokenize',
                                      use_gpu=False,
                                      logging_level='WARNING')
    stanza_sbd = stanza_pipeline(input_text)
    sentences = [sentence.text for sentence in stanza_sbd.sentences]
    info('sentences', sentences)
    tokenized = [
        sp_processor.encode(sentence, out_type=str) for sentence in sentences
    ]
    info('tokenized', tokenized)
    BATCH_SIZE = 32
    assert (len(sentences) <= BATCH_SIZE)
    translated_batches = translator.translate_batch(
        tokenized,
        replace_unknowns=True,
        max_batch_size=BATCH_SIZE,
        beam_size=num_hypotheses,
        num_hypotheses=num_hypotheses,
        length_penalty=0.2)
    info('translated_batches', translated_batches)

    # Build hypotheses
    value_hypotheses = []
    for i in range(num_hypotheses):
        translated_tokens = []
        cumulative_score = 0
        for translated_batch in translated_batches:
            translated_tokens += translated_batch[i]['tokens']
            cumulative_score += translated_batch[i]['score']
        detokenized = ''.join(translated_tokens)
        detokenized = detokenized.replace('▁', ' ')
        value = detokenized
        if len(value) > 0 and value[0] == ' ':
            # Remove space at the beginning of the translation added
            # by the tokenizer.
            value = value[1:]
        hypothesis = Hypothesis(value, cumulative_score)
        value_hypotheses.append(hypothesis)
    info('value_hypotheses', value_hypotheses)
    return value_hypotheses
Example #14
def test_missing_requirements():
    """
    Try to build several pipelines with bad configs and check thrown exceptions against gold exceptions.
    :return: None
    """
    # list of (bad configs, list of gold ProcessorRequirementsExceptions that should be thrown) pairs
    bad_config_lists = [
        # missing tokenize
        (
            # input config
            {
                'processors': 'pos,depparse',
                'dir': TEST_MODELS_DIR,
                'lang': 'en'
            },
            # 2 expected exceptions
            [{
                'processor_type': 'POSProcessor',
                'processors_list': ['pos', 'depparse'],
                'provided_reqs': set([]),
                'requires': set(['tokenize'])
            }, {
                'processor_type': 'DepparseProcessor',
                'processors_list': ['pos', 'depparse'],
                'provided_reqs': set([]),
                'requires': set(['tokenize', 'pos', 'lemma'])
            }]),
        # no pos when lemma_pos is set to True; for English, mwt should not be included in the loaded processor list
        (
            # input config
            {
                'processors': 'tokenize,mwt,lemma',
                'dir': TEST_MODELS_DIR,
                'lang': 'en',
                'lemma_pos': True
            },
            # 1 expected exception
            [{
                'processor_type': 'LemmaProcessor',
                'processors_list': ['tokenize', 'lemma'],
                'provided_reqs': set(['tokenize', 'mwt']),
                'requires': set(['tokenize', 'pos'])
            }])
    ]
    # try to build each bad config, catch exceptions, check against gold
    pipeline_fails = 0
    for bad_config, gold_exceptions in bad_config_lists:
        try:
            stanza.Pipeline(**bad_config)
        except PipelineRequirementsException as e:
            pipeline_fails += 1
            assert isinstance(e, PipelineRequirementsException)
            assert len(e.processor_req_fails) == len(gold_exceptions)
            for processor_req_e, gold_exception in zip(e.processor_req_fails,
                                                       gold_exceptions):
                # compare the thrown ProcessorRequirementsExceptions against gold
                check_exception_vals(processor_req_e, gold_exception)
    # check pipeline building failed twice
    assert pipeline_fails == 2
Example #15
    def __load_stanza_pipeline(self, model_folder: str, use_gpu: bool):
        logging.debug("Starting loading the Stanza models into the Pipeline!")
        self.stanza_pipeline = stanza.Pipeline(lang='et',
                                               processors='tokenize,pos,lemma',
                                               dir=model_folder,
                                               use_gpu=use_gpu)
        logging.debug("Finished loading the Stanza models!")
Example #16
def __initialize():
    global NLP
    APP_DIR = os.environ.get('APP_DIR')
    try:
        NLP = stanza.Pipeline('es',
                              verbose=False,
                              use_gpu=False,
                              dir=f'{APP_DIR}stanfordnlp_resources')
    except (ResourcesFileNotFoundError, FileNotFoundError) as ex:
        logging.info("Stanza: Descargando modelo 'es'")
        stanza.download('es',
                        verbose=False,
                        model_dir=f'{APP_DIR}stanfordnlp_resources')
        NLP = stanza.Pipeline('es',
                              verbose=False,
                              use_gpu=False,
                              dir=f'{APP_DIR}stanfordnlp_resources')
Example #17
    def wrapper(self) -> list:
        nlp = stanza.Pipeline(
            lang=self._lang, processors=", ".join(self._processors), use_gpu=False,
        )

        object_stanza = nlp(" ".join(self._document))

        return json.loads(str(object_stanza))  # convert object stanza to object json.
def test_register_processor_variant():
    nlp = stanza.Pipeline(dir=TEST_MODELS_DIR,
                          lang='en',
                          processors={"tokenize": "lol"},
                          package=None)
    doc = nlp(EN_DOC)
    assert EN_DOC_LOL_TOKENS == '\n\n'.join(sent.tokens_string()
                                            for sent in doc.sentences)
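The "lol" tokenizer variant selected above is not a built-in; it has to be registered before the pipeline is constructed. A rough, hedged sketch of the registration pattern from stanza's processor-variant API (the test suite's actual fixture may differ):

from stanza.models.common.doc import Document
from stanza.pipeline.processor import ProcessorVariant, register_processor_variant

@register_processor_variant('tokenize', 'lol')
class LOLTokenizer(ProcessorVariant):
    """Toy tokenizer variant that replaces every whitespace-separated token with LOL."""
    def __init__(self, config):
        pass

    def process(self, text):
        sentence = [{'id': (i + 1,), 'text': 'LOL'} for i, _ in enumerate(text.split())]
        return Document([sentence], text)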
Example #19
def initStanzaPipeline(lang):
    downloadStanza(lang)
    global snlpInitialized
    global nlpStanza
    if not snlpInitialized:
        snlp = stanza.Pipeline(lang=lang)
        nlpStanza['snlp'] = StanzaLanguage(snlp)
        snlpInitialized = True
def stanza_pos_fct(sent_tok: list):
    # uses batches
    nlp_stanza = stanza.Pipeline(lang='en',
                                 processors='tokenize,pos',
                                 tokenize_pretokenized=True)
    pos_batch = [[(word.text, word.xpos) for word in s.words]
                 for s in nlp_stanza(sent_tok).sentences]
    return [item for sublist in pos_batch for item in sublist]
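A possible call, with the pretokenized input given as a list of token lists:

tags = stanza_pos_fct([["I", "like", "cats"], ["Dogs", "bark"]])
print(tags)  # flat list of (token, xpos) pairs, e.g. ('I', 'PRP'), ('like', 'VBP'), ...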
Example #21
def main():
    #nlp = stanza.Pipeline('en',
    #                      processors='tokenize,pos,lemma,ner')
    nlp = stanza.Pipeline('en',
                          processors='tokenize')

    doc = nlp('Uro ruined modern.  Fortunately, Wotc banned him')
    print(process_doc(doc, "him", "ruined"))
    def __init__(self):
        self.nlp = stanza.Pipeline("id", use_gpu=False)
        self.stemmer = StemmerFactory().create_stemmer()
        self.ner = get_entities
        # Set POS Tagger
        self.pos_tagger = nltk.tag.CRFTagger()
        self.pos_tagger.set_model_file(
            'pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger')
Example #23
    def __init__(self):
        super().__init__()

        # stanza.download('ja')
        self.nlp = stanza.Pipeline('ja')
        self.jmd = Jamdict()

        self._translate_jmd_cache = {}
Example #24
    def __init__(self, file, soup, test_csv):
        self.soup = soup
        self.document = test_csv
        self.s = re.split(r"_|\.", file)
        self.id = self.s[3]
        self.abstract = []
        self.nlp = stanza.Pipeline(lang='en', processors='tokenize')
        self.claims_start = 1500
Example #25
def main():
    nlp = stanza.Pipeline('en',
                          processors='tokenize,pos,lemma,depparse')

    doc = nlp('Unban Mox Opal! Unban Mox Opal!')
    #print(doc.sentences[0].dependencies)
    print(doc)
    print(process_doc(doc, "{}=source >obj=zzz {}=target"))
Example #26
def nlp(doc):
    """ Processes a text with spacy and stanza """

    snlp = stanza.Pipeline(lang="la")

    NLP = StanzaLanguage(snlp)

    return NLP(doc)
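A brief usage sketch: because StanzaLanguage wraps the stanza output as a spaCy Doc, the usual spaCy token attributes are available:

doc = nlp("Gallia est omnis divisa in partes tres")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.dep_)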
Example #27
def _load_stanza(
    stanza_setup: Dict[str, str] = {
        "lang": "en",
        "package": "genia",
        "processors": {"ner": "bionlp13cg"},
    },
    use_gpu: bool = True,
) -> stanza.Pipeline:
    # TODO: [RICO -> put use_gpu inside one config]
    print("loading stanza", stanza_setup)
    try:
        snlp = stanza.Pipeline(**stanza_setup, use_gpu=use_gpu)
    except Exception:
        stanza.download(**stanza_setup)
        snlp = stanza.Pipeline(**stanza_setup, use_gpu=use_gpu)

    return snlp
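A hedged usage sketch, assuming the genia/bionlp13cg biomedical models can be downloaded to the default location:

snlp = _load_stanza(use_gpu=False)
doc = snlp("Mutations in BRCA1 are associated with breast cancer.")
print([(ent.text, ent.type) for ent in doc.ents])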
Example #28
def test_zh_tokenizer_parens():
    """
    The original fix for newlines in Chinese text broke () in Chinese text
    """
    nlp = stanza.Pipeline(lang='zh',
                          processors="tokenize",
                          dir=TEST_MODELS_DIR)
    doc = nlp(ZH_PARENS_DOC)
def test_spacy_stanza_tokenizer_options():
    lang = "en"
    stanza.download(lang)
    snlp = stanza.Pipeline('en', processors={'tokenize': 'spacy'})
    nlp = StanzaLanguage(snlp)
    # whitespace tokens from spacy tokenizer are handled correctly
    doc = nlp(" Barack  Obama  was  born\n\nin Hawaii.")

    snlp = stanza.Pipeline('en', tokenize_pretokenized=True)
    nlp = StanzaLanguage(snlp)
    # pretokenized text is handled correctly (possibly with warnings because
    # the character offsets from stanza 1.0.0 are incorrect)
    doc = nlp(
        "Barack Obama was born in Hawaii.\nBarack Obama was born in Hawaii.")
    doc = nlp(
        " Barack  Obama  was  born\n\n in Hawaii.\nBarack Obama was born in Hawaii."
    )
Example #30
    def set_language(self, lang=None):
        self.lang = lang
        if lang is None: return

        if not exists_file(home_dir() + '/stanza_resources/' + lang):
            stanza.download(lang)

        self.nlp = stanza.Pipeline(lang=lang, logging_level='ERROR')