Example #1
def extract_ner_bio_sents(folder='test'):
    import os
    from tqdm import tqdm
    import stanza
    from spacy_stanza import StanzaLanguage
    snlp = stanza.Pipeline(lang="cs", logging_level='ERROR')
    nlp = StanzaLanguage(snlp)

    annt_path = os.path.join('original', folder, 'annotated')
    raw_path = os.path.join('original', folder, 'raw')
    annts = sorted(os.listdir(annt_path))
    annts = list(filter(lambda x: '.out' in x, annts))
    raws = sorted(os.listdir(raw_path))
    raws = list(filter(lambda x: '.txt' in x, raws))
    assert len(annts) == len(raws)

    results = []
    for a, r in tqdm(zip(annts, raws), total=len(raws)):
        aname = a.split('.')[0]
        rname = r.split('.')[0]
        assert aname == rname
        apath = os.path.join(annt_path, a)
        rpath = os.path.join(raw_path, r)
        results += merge_sents_and_ents(snlp, nlp, rpath, apath)
    return results
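
A hedged usage sketch; it assumes the merge_sents_and_ents helper from the same module and data under original/test/annotated and original/test/raw:

# Hypothetical call: pair each *.out annotation file with its *.txt raw file
# and collect whatever merge_sents_and_ents produces for every pair.
bio_sents = extract_ner_bio_sents(folder='test')
print(len(bio_sents))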
Example #2
def create_model(vectors_loc=None,
                 lang=None,
                 stz=True,
                 vectors_name='fasttext',
                 max_items=-1):
    if lang is None or (lang == 'sv' and not stz):
        nlp = Swedish()
    elif not stz:
        nlp = spacy.blank(lang)
    elif stz:
        stanza.download(lang)
        snlp = stanza.Pipeline(lang=lang)
        nlp = StanzaLanguage(snlp)

    with open(vectors_loc, 'rb') as file_:
        logger.info("Reading file '{}'".format(vectors_loc))
        header = file_.readline()
        nr_row, nr_dim = header.split()  # the first line is the number of tokens and dimensions
        counter = 0
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            if counter % 100 == 0:
                logger.info(counter)
            if counter == max_items:
                break
            counter = counter + 1
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
        nlp.vocab.vectors.name = vectors_name  # give vectors a name
    return nlp
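
A usage sketch for create_model; the .vec path and sizes are hypothetical, the file must start with a "<token count> <dimensions>" header as the loop above assumes, and the snippet's module-level imports (Swedish, numpy, logger) are taken as given:

# Hypothetical call: build a Swedish pipeline (stz=False takes the spaCy-only branch)
# and load the first 10,000 fastText vectors from a local .vec file.
nlp = create_model(vectors_loc='cc.sv.300.vec', lang='sv', stz=False, max_items=10000)
print(nlp.vocab.vectors.shape)  # second dimension matches nr_dim from the header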
Example #3
def lemmatisaze_document(doc):

    nlp = StanzaLanguage(snlp)
    doc = nlp(doc)

    filtered_tokens = [token.lemma_ for token in doc]
    doc = ' '.join(filtered_tokens)
    return doc
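
lemmatisaze_document relies on a module-level snlp pipeline; a minimal setup sketch (the Russian model and the sample sentence are assumptions, mirroring Example #20 below):

import stanza
from spacy_stanza import StanzaLanguage

snlp = stanza.Pipeline(lang='ru')              # module-level pipeline the function expects
print(lemmatisaze_document('Мама мыла раму'))  # space-joined lemmas of a sample sentence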
Example #4
def nlp(doc):
    """ Processes a text with spacy and stanza """

    snlp = stanza.Pipeline(lang="la")

    NLP = StanzaLanguage(snlp)

    return NLP(doc)
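
A hypothetical call; the Latin stanza model has to be downloaded first (stanza.download('la')):

doc = nlp("Gallia est omnis divisa in partes tres.")  # hypothetical input
print([(token.text, token.lemma_) for token in doc])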
Example #5
def initStanzaPipeline(lang):
    downloadStanza(lang)
    global snlpInitialized
    global nlpStanza
    if not snlpInitialized:
        snlp = stanza.Pipeline(lang=lang)
        nlpStanza['snlp'] = StanzaLanguage(snlp)
        snlpInitialized = True
Example #6
def load_model(model = "en", group = "stanford"):
    if group == "stanford": 
        # lang_cls = get_lang_class("stanza_en")
        stanza.download('en')
        snlp = stanza.Pipeline(lang="en", use_gpu=True)
        nlp = StanzaLanguage(snlp)
    elif group is None:
        nlp = spacy.load(model)
    return nlp
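
A usage sketch; the "stanford" branch wraps the English stanza pipeline, so entities appear through the usual spaCy doc.ents interface (the sample sentence is hypothetical):

nlp = load_model(group="stanford")
doc = nlp("Stanford University is located in California.")
print([(ent.text, ent.label_) for ent in doc.ents])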
Example #7
 def stanza_model():
     from sagas.nlu.stanza_helper import get_nlp
     from spacy_stanza import StanzaLanguage
     if enable_pretoken:
         snlp = get_nlp(
             lang) if lang not in pretokenized_langs else get_nlp(
                 lang, pretokenized=True)
     else:
         snlp = get_nlp(lang)
     return StanzaLanguage(snlp)
Example #8
 def __init__(self, anonymization: Anonymization, model: str):
     import stanza
     from spacy_stanza import StanzaLanguage
     
     stanza.download('en', processors='tokenize,pos,lemma,depparse,ner')  # will take a while - running this once is enough
     stanza.download('ru', processors='tokenize,pos,lemma,depparse,ner')  # will take a while - running this once is enough
     
     self.anonymization = anonymization
     self.snlp = stanza.Pipeline(lang=model, processors='tokenize,ner')
     self.processor = StanzaLanguage(self.snlp)
Example #9
 def tokenize(self, text):
     # might be better to keep the Token object to serve string matching
     if self.sp_nlp is None:
         # self.sp_nlp = spacy.load("en_core_web_sm")
         snlp = stanza.Pipeline(lang="en", use_gpu=True)
         self.sp_nlp = StanzaLanguage(snlp)
     tokens = self.sp_nlp(text)
     if self.lemmatize:
         return [tok.lemma_.lower() for tok in tokens]
     else:
         return [tok.text.lower() for tok in tokens]
Example #10
    def tokenize_and_spacy(self, text, lang="en"):
        """
        Keep meta information from spacy, used for matching
        """
        if BERTokenizer.sp_nlp is None:
            snlp = stanza.Pipeline(lang=lang,
                                   use_gpu=True,
                                   tokenize_pretokenized=True)
            BERTokenizer.sp_nlp = StanzaLanguage(snlp)

        tokens = self.tokenizer.encode(text).tokens[1:-1]
        return self.sp_nlp([tokens])
Example #11
def extract_nes(jsonregests,
                check_if_saved=True,
                clean=lambda x: x,
                save_path="resources/ENTITIES.json",
                method="spacy",
                entity_types={}):
    """NLP processing for entity extraction
    
    1. Processes regest texts with spacy/stanza
    2. extracts named entities that are associated with places
        (either places themselves or, e.g., nobles X of Y, where Y is a place)
    3. writes the found entities to disk
    """
    if check_if_saved:
        if os.path.exists(save_path):
            with open(save_path, "r") as f:
                dat = json.load(f)
            #_maybe_convert_ent_dict_to_actual_format(dat)
            _clean_nes(dat, clean)
            return dat

    if method == "spacy":
        logging.info("loading spacy de_core_news_lg")
        nlp = spacy.load('de_core_news_lg')

        logging.info("spacy model loaded")

    elif method == "stanza":
        logging.info("loading stanza pipeline")
        import stanza
        from spacy_stanza import StanzaLanguage

        snlp = stanza.Pipeline(lang="de")
        nlp = StanzaLanguage(snlp)
        logging.info("stanza pipeline loaded")

    out = {}
    for i, jr in enumerate(jsonregests):
        key = jr["uri"]
        out[key] = _extract_nes(jr["regestentext_clean"],
                                nlp,
                                entity_types=entity_types)
        #print(jr["regestentext_clean"])
        if i % 1000 == 0:
            logging.info("{}/{} regests spacy processed".format(
                i, len(jsonregests)))
    _clean_nes(out, clean)

    with open(save_path, "w") as f:
        f.write(json.dumps(out))

    return out
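
A hedged call; jsonregests is assumed to be a list of dicts carrying 'uri' and 'regestentext_clean' keys, as the loop above requires:

# Hypothetical invocation: use the stanza German pipeline and cache results
# under a custom path instead of the default ENTITIES.json.
entities = extract_nes(jsonregests,
                       method="stanza",
                       save_path="resources/ENTITIES_stanza.json")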
Example #12
    def __init__(self, models=None):  # noqa ANN201
        if not models:
            models = {"en": "en"}
        logger.debug(f"Loading Stanza models: {models.values()}")

        self.nlp = {
            lang_code: StanzaLanguage(
                stanza.Pipeline(
                    model_name,
                    processors="tokenize,pos,lemma,ner",
                ))
            for lang_code, model_name in models.items()
        }
Example #13
 def tokenize_and_lemmatize(self, text, lang="en"):
     """
     This will be used for matching 
     1) remove cls and sep
     2) lemmatize 
     """
     if BERTokenizer.sp_nlp is None:
         snlp = stanza.Pipeline(lang=lang,
                                use_gpu=True,
                                tokenize_pretokenized=True)
         BERTokenizer.sp_nlp = StanzaLanguage(snlp)
     encodes = self._encode(text)
     tokens = encodes.tokens[1:-1]
     norm_tokens = [t.lemma_ for t in self.sp_nlp([tokens])]
     return norm_tokens
Example #14
def build_df(file, V=None, vocab=None):
    processor_dict = {
        'tokenize': 'default',
        'pos': 'default',
        'ner': 'conll03',
        'lemma': 'default'
    }
    snlp = stanza.Pipeline(lang='en',
                           tokenize_pretokenized=True,
                           processors=processor_dict)
    nlp = StanzaLanguage(snlp)
    if vocab is not None:
        if isinstance(vocab, str):
            vocab = read_vectors(vocab)
    else:
        print('ERROR: please provide a vector file or a spaCy vocab')
        sys.exit(-1)

    E = []
    F = []
    indices = [[], [], [], []]
    for sent_id, sent_str in tqdm(read_lines(file)):
        sent = nlp(sent_str)
        persons = [ent for ent in sent.ents if ent.label_ == 'PER']
        orgs = [ent for ent in sent.ents if ent.label_ == 'ORG']
        for p, o in itertools.product(persons, orgs):
            features = extract_features(p, o, sent)
            F.append(features)

            embedding = np.hstack([get_vector(p, vocab), get_vector(o, vocab)])
            E.append(embedding)

            indices[0].append(sent_id)
            indices[1].append(p.text)
            indices[2].append(o.text)
            indices[3].append(f'( {sent.text} )')
    X, V = features2vectors(F, V)

    df = pd.concat([pd.DataFrame(E), pd.DataFrame(X)], axis=1)
    df.index = pd.MultiIndex.from_arrays(indices,
                                         names=('sent_id', 'person', 'org',
                                                'sent'))

    return df, V, vocab
Example #15
def load_nlp():
    import stanza
    from spacy_stanza import StanzaLanguage
    nlp = stanza.Pipeline(lang='uk')
    snlp = StanzaLanguage(nlp)
    return nlp, snlp
Example #16
 def tokenize(self, text):
     if self.sp_nlp is None:
         snlp = stanza.Pipeline(lang="zh", use_gpu=False)
         self.sp_nlp = StanzaLanguage(snlp)
     tokens = self.sp_nlp(text)
     return [token.lemma_ for token in tokens]
Example #17
from transformers import MarianMTModel, MarianTokenizer

from sklearn.decomposition import PCA

REUTERS_DIRECTORY = 'data/reuters'
LANGUAGE_DIRECTORIES = {
    'en': os.path.join(REUTERS_DIRECTORY, "rcv1"),
    'es': os.path.join(REUTERS_DIRECTORY,
                       "RCV2_Multilingual_Corpus/spanish-latam"),
    'ru': os.path.join(REUTERS_DIRECTORY, "RCV2_Multilingual_Corpus/russian")
}
LANGUAGE_MODELS = {
    'en': lazy(lambda: spacy.load("en_core_web_lg")),
    'es': lazy(lambda: spacy.load("es_core_news_lg")),
    'ru': lazy(lambda: StanzaLanguage(stanza.Pipeline(lang="ru")))
}


def get_mt_tokenizer_and_model(model_name, device):
    return MarianTokenizer.from_pretrained(
        model_name), MarianMTModel.from_pretrained(model_name).to(device)


def load_stop_words(lang):
    stop_words = set()
    with open('data/stopwords/%s.txt' % lang, 'r') as f:
        for word in f.readlines():
            stop_words.add(word.strip())
    return stop_words
Example #18
import stanza
from spacy_stanza import StanzaLanguage

models = {
    "craft":
    [('anatem', "ANATOMY"), ('bc5cdr', "CHEMICAL>DISEASE"),
     ('bc4chemd', "CHEMICAL"),
     ('bionlp13cg',
      "AMINO_ACID>ANATOMICAL_SYSTEM>CANCER>CELL>CELLULAR_COMPONENT>DEVELOPING_ANATOMICAL_STRUCTURE>GENE_OR_GENE_PRODUCT>IMMATERIAL_ANATOMICAL_ENTITY>MULTI-TISSUE_STRUCTURE>ORGAN>ORGANISM>ORGANISM_SUBDIVISION>ORGANISM_SUBSTANCE>PATHOLOGICAL_FORMATION>SIMPLE_CHEMICAL>TISSUE"
      ), ('jnlpba', "PROTEIN>DNA>RNA>CELL_LINE>CELL_TYPE"),
     ('linnaeus', "SPECIES"), ('ncbi_disease', "DISEASE"),
     ('s800', "SPECIES")],
    "mimic":
    [("i2b2", "PROBLEM>TEST>TREATMENT"),
     ("radiology",
      "ANATOMY>OBSERVATION>ANATOMY_MODIFIER>OBSERVATION_MODIFIER>UNCERTAINTY")]
}

for ud in models.keys():
    for model in models[ud]:
        stanzaModel = stanza.Pipeline(
            lang="en",  # package=ud,
            processors={
                "tokenize": "spacy",
                "ner": model[0]
            })

        nerModel = StanzaLanguage(stanzaModel)
        nerModel.to_disk("model/" + model[0])
Example #19
 def __init__(self):
     snlp = stanza.Pipeline(lang="es")
     self.nlp = StanzaLanguage(snlp)
Example #20
hackathon_order = pd.read_csv(
    'C:/Users/user/Jupyter/Hakaton/hackathon_order_fix2.csv', escapechar='\\')
#print(hackathon_order['comment'])

from wiki_ru_wordnet import WikiWordnet
wikiwordnet = WikiWordnet()

from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()

from spacy_stanza import StanzaLanguage
import stanza

stanza.download('ru')
stanza_nlp = stanza.Pipeline('ru')
snlp = stanza.Pipeline(lang="ru")
nlp = StanzaLanguage(snlp)


def remove_special_characters(text):
    text = re.sub(r'[^а-яА-ЯёЁ\s]', '', text, flags=re.I | re.A)
    return text


def lemmatisaze_document(doc):

    nlp = StanzaLanguage(snlp)
    doc = nlp(doc)

    filtered_tokens = [token.lemma_ for token in doc]
    doc = ' '.join(filtered_tokens)
    return doc
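
With the module-level pipeline above, the two helpers chain into a simple normalisation step (the input review is a made-up example):

raw = 'Отличный сервис, всё понравилось!!!'
clean = remove_special_characters(raw)   # keep only Cyrillic letters and whitespace
print(lemmatisaze_document(clean))       # space-joined lemmas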
Example #21
def init_parser(
    parser: str = "spacy",
    model_or_lang: str = "en",
    *,
    is_tokenized: bool = False,
    disable_sbd: bool = False,
    parser_opts: Optional[Dict] = None,
    **kwargs,
) -> Language:
    """Initialise a spacy-wrapped parser given a language or model and some options.
    :param parser: which parser to use. Parsers other than 'spacy' need to be installed separately. Valid options are
           'spacy', 'stanfordnlp', 'stanza', 'udpipe'. Note that the spacy-* wrappers of those libraries need to be
           installed, e.g. spacy-stanza. Defaults to 'spacy'
    :param model_or_lang: language model to use (must be installed). Defaults to an English model
    :param is_tokenized: indicates whether your text has already been tokenized (space-separated). For stanza and
           stanfordnlp, this will also cause sentence segmentation *only* to be done by splitting on new lines.
           See the documentation: https://stanfordnlp.github.io/stanfordnlp/tokenize.html
           See the documentation: https://stanfordnlp.github.io/stanza/tokenize.html
    :param disable_sbd: disables spaCy automatic sentence boundary detection (only works for spaCy)
    :param parser_opts: will be passed to the core pipeline. For spacy and udpipe, it will be passed to their
           `.load()` initialisations, for stanfordnlp and stanza, `parser_opts` is passed to their `.Pipeline()`
           initialisations
    :param kwargs: options to be passed to the ConllFormatter initialisation
    :return: an initialised Language object; the parser
    """
    parser_opts = {} if parser_opts is None else parser_opts

    if parser == "spacy":
        nlp = spacy.load(model_or_lang, **parser_opts)
        if is_tokenized:
            nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name="prevent-sbd", before="parser")
    elif parser == "stanfordnlp":
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        snlp = stanfordnlp.Pipeline(lang=model_or_lang,
                                    tokenize_pretokenized=is_tokenized,
                                    **parser_opts)
        nlp = StanfordNLPLanguage(snlp)
    elif parser == "stanza":
        import stanza
        from spacy_stanza import StanzaLanguage

        snlp = stanza.Pipeline(lang=model_or_lang,
                               tokenize_pretokenized=is_tokenized,
                               **parser_opts)
        nlp = StanzaLanguage(snlp)
    elif parser == "udpipe":
        import spacy_udpipe

        nlp = spacy_udpipe.load(model_or_lang, **parser_opts)
    else:
        raise ValueError(
            "Unexpected value for 'parser'. Options are: 'spacy', 'stanfordnlp', 'stanza', 'udpipe'"
        )

    conllformatter = ConllFormatter(nlp, **kwargs)
    nlp.add_pipe(conllformatter, last=True)

    return nlp
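
A usage sketch for the stanza branch; it assumes spacy_conll's ConllFormatter exposes the doc._.conll_str extension and that the English stanza model is already downloaded:

nlp = init_parser("stanza", "en")
doc = nlp("I like cookies. What about you?")  # hypothetical input
print(doc._.conll_str)                        # CoNLL-U rendering added by the ConllFormatter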
Example #22
# coding=utf-8
import pke
import stanza
import json
from analizer import KeywordExtractor
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from spacy_stanza import StanzaLanguage
import re

snlp = stanza.Pipeline(lang='ru', processors='tokenize,pos,lemma')
spacy_pipelines = StanzaLanguage(snlp)

ps_ru = SnowballStemmer("russian")
ps_eng = SnowballStemmer("english")

DATASET = 'mlong'
DATASET_FILE = "./cyberleninka_" + DATASET + ".txt"

with open(DATASET_FILE, encoding='utf-8') as fp:
    datafiles = json.load(fp)

datafiles = [d for d in datafiles if d is not None]
print(len(datafiles))
START_ELE = 0
datafiles = datafiles[START_ELE:]
count = 0

true_positive = 0
Example #23
models = {
    SupportedLanguages.English:
    spacy.load(spacy_models[SupportedLanguages.English]),
    SupportedLanguages.German:
    spacy.load(spacy_models[SupportedLanguages.German]),
    SupportedLanguages.Danish:
    spacy.load(spacy_models[SupportedLanguages.Danish]),
    SupportedLanguages.Dutch:
    spacy.load(spacy_models[SupportedLanguages.Dutch]),
    SupportedLanguages.Italian:
    spacy.load(spacy_models[SupportedLanguages.Italian]),
    SupportedLanguages.Portuguese:
    spacy.load(spacy_models[SupportedLanguages.Portuguese]),
    # SupportedLanguages.Russian: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="ru")),
    SupportedLanguages.Serbian:
    StanzaLanguage(stanza.Pipeline(lang="sr")),
    # SupportedLanguages.Bulgarian: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="bg")),
    SupportedLanguages.Slovene:
    StanzaLanguage(stanza.Pipeline(lang="sl"))
    # SupportedLanguages.Hungarian: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="hu")),
    # SupportedLanguages.Estonian: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="et")),
    # SupportedLanguages.Basque: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="eu"))
    # SupportedLanguages.Irish: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="ga")),
}

logger.info('loaded vocabulary\n')


def lemmatizer(doc, spacy_model):
    if isinstance(doc, float):
        return spacy_model.make_doc("")