def extract_ner_bio_sents(folder='test'):
    import os
    from tqdm import tqdm
    import stanza
    from spacy_stanza import StanzaLanguage

    snlp = stanza.Pipeline(lang="cs", logging_level='ERROR')
    nlp = StanzaLanguage(snlp)
    annt_path = os.path.join('original', folder, 'annotated')
    raw_path = os.path.join('original', folder, 'raw')
    annts = sorted(os.listdir(annt_path))
    annts = list(filter(lambda x: '.out' in x, annts))
    raws = sorted(os.listdir(raw_path))
    raws = list(filter(lambda x: '.txt' in x, raws))
    assert len(annts) == len(raws)
    results = []
    for a, r in tqdm(zip(annts, raws), total=len(raws)):
        aname = a.split('.')[0]
        rname = r.split('.')[0]
        assert aname == rname
        apath = os.path.join(annt_path, a)
        rpath = os.path.join(raw_path, r)
        results += merge_sents_and_ents(snlp, nlp, rpath, apath)
    return results
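# Hedged usage sketch for extract_ner_bio_sents above: assumes the
# original/<folder>/annotated and original/<folder>/raw directories exist
# with matching *.out / *.txt pairs, and that merge_sents_and_ents is
# defined elsewhere in the same module.
if __name__ == '__main__':
    bio_sents = extract_ner_bio_sents(folder='test')
    print('merged sentences:', len(bio_sents))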
def create_model(vectors_loc=None, lang=None, stz=True,
                 vectors_name='fasttext', max_items=-1):
    # Assumes module-level imports: spacy, stanza, numpy as np, a configured
    # `logger`, `from spacy.lang.sv import Swedish` and
    # `from spacy_stanza import StanzaLanguage`.
    # Note the evaluation order: this branch is taken when lang is None,
    # or when lang == 'sv' and stz is falsy.
    if lang is None or (lang == 'sv' and not stz):
        nlp = Swedish()
    elif not stz:
        nlp = spacy.blank(lang)
    elif stz:
        stanza.download(lang)
        snlp = stanza.Pipeline(lang=lang)
        nlp = StanzaLanguage(snlp)
    with open(vectors_loc, 'rb') as file_:
        logger.info("Reading file '{}'".format(vectors_loc))
        header = file_.readline()
        # The first line holds the number of tokens and the dimensionality.
        nr_row, nr_dim = header.split()
        counter = 0
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            if counter % 100 == 0:
                logger.info(counter)
            if counter == max_items:
                break
            counter += 1
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vector to the vocab
    nlp.vocab.vectors.name = vectors_name  # give the vectors a name
    return nlp
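# Hedged usage sketch for create_model: 'cc.sv.300.vec' is a hypothetical
# fastText vectors file with the usual "<count> <dim>" header line;
# max_items caps how many vectors are loaded, for a quick smoke test.
if __name__ == '__main__':
    nlp = create_model(vectors_loc='cc.sv.300.vec', lang='sv', stz=False,
                       max_items=1000)
    print(nlp.vocab.vectors.shape)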
def lemmatisaze_document(doc):
    # Relies on a module-level stanza pipeline `snlp` (see the sketch below).
    nlp = StanzaLanguage(snlp)
    doc = nlp(doc)
    filtered_tokens = [token.lemma_ for token in doc]
    doc = ' '.join(filtered_tokens)
    return doc
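# Hedged usage sketch for lemmatisaze_document: the module-level `snlp`
# pipeline must exist first; the language and sample text are assumptions.
import stanza
from spacy_stanza import StanzaLanguage  # used inside lemmatisaze_document
snlp = stanza.Pipeline(lang='en')
print(lemmatisaze_document('The cats were running quickly.'))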
def nlp(doc):
    """Processes a text with spacy and stanza."""
    snlp = stanza.Pipeline(lang="la")
    NLP = StanzaLanguage(snlp)
    return NLP(doc)
def initStanzaPipeline(lang):
    downloadStanza(lang)
    global snlpInitialized
    global nlpStanza
    if not snlpInitialized:
        snlp = stanza.Pipeline(lang=lang)
        nlpStanza['snlp'] = StanzaLanguage(snlp)
        snlpInitialized = True
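# Hedged usage sketch for initStanzaPipeline: assumes the module defines the
# two globals referenced above plus a downloadStanza helper; 'en' is an
# illustrative language code.
snlpInitialized = False
nlpStanza = {}
initStanzaPipeline('en')
doc = nlpStanza['snlp']('Stanza pipelines plug into spaCy.')
print([token.pos_ for token in doc])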
def load_model(model="en", group="stanford"):
    if group == "stanford":
        # lang_cls = get_lang_class("stanza_en")
        stanza.download('en')
        snlp = stanza.Pipeline(lang="en", use_gpu=True)
        nlp = StanzaLanguage(snlp)
    elif group is None:
        nlp = spacy.load(model)
    return nlp
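# Hedged usage sketch for load_model: the stanza-backed pipeline is a
# drop-in spaCy Language, so entity access works the same way; passing
# group=None instead would require an installed spaCy model such as
# 'en_core_web_sm'.
nlp = load_model(group="stanford")
doc = nlp("Barack Obama was born in Hawaii.")
print([(ent.text, ent.label_) for ent in doc.ents])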
def stanza_model():
    # `lang`, `enable_pretoken` and `pretokenized_langs` come from the
    # enclosing scope.
    from sagas.nlu.stanza_helper import get_nlp
    from spacy_stanza import StanzaLanguage

    if enable_pretoken:
        snlp = get_nlp(lang) if lang not in pretokenized_langs \
            else get_nlp(lang, pretokenized=True)
    else:
        snlp = get_nlp(lang)
    return StanzaLanguage(snlp)
def __init__(self, anonymization: Anonymization, model: str):
    import stanza
    from spacy_stanza import StanzaLanguage

    # Will take a while - running it once is enough.
    stanza.download('en', processors='tokenize,pos,lemma,depparse,ner')
    # Will take a while - running it once is enough.
    stanza.download('ru', processors='tokenize,pos,lemma,depparse,ner')
    self.anonymization = anonymization
    self.snlp = stanza.Pipeline(lang=model, processors='tokenize,ner')
    self.processor = StanzaLanguage(self.snlp)
def tokenize(self, text):
    # might be better to keep the Token object to serve string matching
    if self.sp_nlp is None:
        # self.sp_nlp = spacy.load("en_core_web_sm")
        snlp = stanza.Pipeline(lang="en", use_gpu=True)
        self.sp_nlp = StanzaLanguage(snlp)
    tokens = self.sp_nlp(text)
    if self.lemmatize:
        return [tok.lemma_.lower() for tok in tokens]
    else:
        return [tok.text.lower() for tok in tokens]
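# Hedged usage sketch: `tokenize` above is a method, so it needs a host
# object exposing `sp_nlp` and `lemmatize`; this stub class is an
# assumption, as are module-level stanza / StanzaLanguage imports.
class _TokenizerStub:
    sp_nlp = None
    lemmatize = True

print(tokenize(_TokenizerStub(), "Dogs were barking loudly."))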
def tokenize_and_spacy(self, text, lang="en"):
    """Keep meta information from spacy, used for matching"""
    if BERTokenizer.sp_nlp is None:
        snlp = stanza.Pipeline(lang=lang, use_gpu=True,
                               tokenize_pretokenized=True)
        BERTokenizer.sp_nlp = StanzaLanguage(snlp)
    tokens = self.tokenizer.encode(text).tokens[1:-1]
    return self.sp_nlp([tokens])
def extract_nes(jsonregests, check_if_saved=True, clean=lambda x: x,
                save_path="resources/ENTITIES.json", method="spacy",
                entity_types={}):
    """NLP processing for entity extraction

    1. Processes regest texts with spacy/stanza
    2. Extracts named entities that are associated with places (either
       places themselves or, e.g., nobles X of Y, where Y is a place)
    3. Writes the found entities to disk
    """
    if check_if_saved:
        if os.path.exists(save_path):
            with open(save_path, "r") as f:
                dat = json.load(f)
            # _maybe_convert_ent_dict_to_actual_format(dat)
            _clean_nes(dat, clean)
            return dat
    if method == "spacy":
        logging.info("loading spacy de_core_news_lg")
        nlp = spacy.load('de_core_news_lg')
        logging.info("spacy model loaded")
    elif method == "stanza":
        logging.info("loading stanza pipeline")
        import stanza
        from spacy_stanza import StanzaLanguage
        snlp = stanza.Pipeline(lang="de")
        nlp = StanzaLanguage(snlp)
        logging.info("stanza pipeline loaded")
    out = {}
    for i, jr in enumerate(jsonregests):
        key = jr["uri"]
        out[key] = _extract_nes(jr["regestentext_clean"], nlp,
                                entity_types=entity_types)
        # print(jr["regestentext_clean"])
        if i % 1000 == 0:
            logging.info("{}/{} regests spacy processed".format(
                i, len(jsonregests)))
    _clean_nes(out, clean)
    with open(save_path, "w") as f:
        f.write(json.dumps(out))
    return out
def __init__(self, models=None):  # noqa ANN201
    if not models:
        models = {"en": "en"}
    logger.debug(f"Loading Stanza models: {models.values()}")
    self.nlp = {
        lang_code: StanzaLanguage(
            stanza.Pipeline(
                model_name,
                processors="tokenize,pos,lemma,ner",
            ))
        for lang_code, model_name in models.items()
    }
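# Hedged usage sketch for the constructor above: the owning class name
# `StanzaModels` is an assumption, as is the language selection.
wrapper = StanzaModels(models={"en": "en"})
doc = wrapper.nlp["en"]("Angela Merkel visited Paris.")
print([(ent.text, ent.label_) for ent in doc.ents])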
def tokenize_and_lemmatize(self, text, lang="en"):
    """This will be used for matching:
    1) remove cls and sep
    2) lemmatize
    """
    if BERTokenizer.sp_nlp is None:
        snlp = stanza.Pipeline(lang=lang, use_gpu=True,
                               tokenize_pretokenized=True)
        BERTokenizer.sp_nlp = StanzaLanguage(snlp)
    encodes = self._encode(text)
    tokens = encodes.tokens[1:-1]
    norm_tokens = [t.lemma_ for t in self.sp_nlp([tokens])]
    return norm_tokens
def build_df(file, V=None, vocab=None):
    processor_dict = {
        'tokenize': 'default',
        'pos': 'default',
        'ner': 'conll03',
        'lemma': 'default'
    }
    snlp = stanza.Pipeline(lang='en', tokenize_pretokenized=True,
                           processors=processor_dict)
    nlp = StanzaLanguage(snlp)
    if vocab is not None:
        if type(vocab) == str:
            vocab = read_vectors(vocab)
        else:
            print(f'ERROR: please send with vector file or SpaCy vocab')
            sys.exit(-1)
    E = []
    F = []
    indices = [[], [], [], []]
    for sent_id, sent_str in tqdm(read_lines(file)):
        sent = nlp(sent_str)
        persons = [ent for ent in sent.ents if ent.label_ == 'PER']
        orgs = [ent for ent in sent.ents if ent.label_ == 'ORG']
        for p, o in itertools.product(persons, orgs):
            features = extract_features(p, o, sent)
            F.append(features)
            embedding = np.hstack([get_vector(p, vocab),
                                   get_vector(o, vocab)])
            E.append(embedding)
            indices[0].append(sent_id)
            indices[1].append(p.text)
            indices[2].append(o.text)
            indices[3].append(f'( {sent.text} )')
    X, V = features2vectors(F, V)
    df = pd.concat([pd.DataFrame(E), pd.DataFrame(X)], axis=1)
    df.index = pd.MultiIndex.from_arrays(
        indices, names=('sent_id', 'person', 'org', 'sent'))
    return df, V, vocab
def load_nlp():
    import stanza
    from spacy_stanza import StanzaLanguage

    snlp = stanza.Pipeline(lang='uk')  # raw stanza pipeline
    nlp = StanzaLanguage(snlp)         # spaCy-compatible wrapper
    return snlp, nlp
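# Hedged usage sketch for load_nlp: the Ukrainian sample sentence
# ("This is a test sentence.") is illustrative.
snlp, nlp = load_nlp()
doc = nlp("Це тестове речення.")
print([token.lemma_ for token in doc])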
def tokenize(self, text):
    if self.sp_nlp is None:
        snlp = stanza.Pipeline(lang="zh", use_gpu=False)
        self.sp_nlp = StanzaLanguage(snlp)
    tokens = self.sp_nlp(text)
    return [token.lemma_ for token in tokens]
# Assumes module-level imports of os, spacy, stanza and StanzaLanguage, and
# a `lazy` deferred-loading helper defined elsewhere in the project.
from transformers import MarianMTModel, MarianTokenizer
from sklearn.decomposition import PCA

REUTERS_DIRECTORY = 'data/reuters'
LANGUAGE_DIRECTORIES = {
    'en': os.path.join(REUTERS_DIRECTORY, "rcv1"),
    'es': os.path.join(REUTERS_DIRECTORY,
                       "RCV2_Multilingual_Corpus/spanish-latam"),
    'ru': os.path.join(REUTERS_DIRECTORY, "RCV2_Multilingual_Corpus/russian")
}
LANGUAGE_MODELS = {
    'en': lazy(lambda: spacy.load("en_core_web_lg")),
    'es': lazy(lambda: spacy.load("es_core_news_lg")),
    'ru': lazy(lambda: StanzaLanguage(stanza.Pipeline(lang="ru")))
}


def get_mt_tokenizer_and_model(model_name, device):
    return (MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name).to(device))


def load_stop_words(lang):
    stop_words = set()
    with open('data/stopwords/%s.txt' % lang, 'r') as f:
        for word in f.readlines():
            stop_words.add(word.strip())
    return stop_words
import stanza
from spacy_stanza import StanzaLanguage

models = {
    "craft": [
        ('anatem', "ANATOMY"),
        ('bc5cdr', "CHEMICAL>DISEASE"),
        ('bc4chemd', "CHEMICAL"),
        ('bionlp13cg',
         "AMINO_ACID>ANATOMICAL_SYSTEM>CANCER>CELL>CELLULAR_COMPONENT>"
         "DEVELOPING_ANATOMICAL_STRUCTURE>GENE_OR_GENE_PRODUCT>"
         "IMMATERIAL_ANATOMICAL_ENTITY>MULTI-TISSUE_STRUCTURE>ORGAN>"
         "ORGANISM>ORGANISM_SUBDIVISION>ORGANISM_SUBSTANCE>"
         "PATHOLOGICAL_FORMATION>SIMPLE_CHEMICAL>TISSUE"),
        ('jnlpba', "PROTEIN>DNA>RNA>CELL_LINE>CELL_TYPE"),
        ('linnaeus', "SPECIES"),
        ('ncbi_disease', "DISEASE"),
        ('s800', "SPECIES")
    ],
    "mimic": [
        ("i2b2", "PROBLEM>TEST>TREATMENT"),
        ("radiology",
         "ANATOMY>OBSERVATION>ANATOMY_MODIFIER>OBSERVATION_MODIFIER>"
         "UNCERTAINTY")
    ]
}

for ud in models.keys():
    for model in models[ud]:
        stanzaModel = stanza.Pipeline(
            lang="en",
            # package=ud,
            processors={
                "tokenize": "spacy",
                "ner": model[0]
            })
        nerModel = StanzaLanguage(stanzaModel)
        nerModel.to_disk("model/" + model[0])
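# Hedged usage sketch: each StanzaLanguage built in the loop above is a
# regular spaCy Language, so it can tag text directly before serialising;
# the disease-mention sentence is a made-up example.
demo = StanzaLanguage(stanza.Pipeline(lang="en",
                                      processors={"tokenize": "spacy",
                                                  "ner": "ncbi_disease"}))
doc = demo("The patient was diagnosed with type 2 diabetes.")
print([(ent.text, ent.label_) for ent in doc.ents])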
def __init__(self):
    snlp = stanza.Pipeline(lang="es")
    self.nlp = StanzaLanguage(snlp)
# Assumed pandas read_csv call: the original snippet began mid-statement,
# and the escapechar argument plus the later use of
# hackathon_order['comment'] imply this form. Assumes `import pandas as pd`
# and `import re` at module level.
hackathon_order = pd.read_csv(
    'C:/Users/user/Jupyter/Hakaton/hackathon_order_fix2.csv',
    escapechar='\\')
# print(hackathon_order['comment'])

from wiki_ru_wordnet import WikiWordnet
wikiwordnet = WikiWordnet()

from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()

from spacy_stanza import StanzaLanguage
import stanza

stanza.download('ru')
stanza_nlp = stanza.Pipeline('ru')
snlp = stanza.Pipeline(lang="ru")
nlp = StanzaLanguage(snlp)


def remove_special_characters(text):
    # Keep only Cyrillic letters and whitespace.
    text = re.sub(r'[^а-яА-ЯёЁ\s]', '', text, flags=re.I | re.A)
    return text


def lemmatisaze_document(doc):
    nlp = StanzaLanguage(snlp)
    doc = nlp(doc)
    filtered_tokens = [token.lemma_ for token in doc]
    doc = ' '.join(filtered_tokens)
    return doc
def init_parser(
    parser: str = "spacy",
    model_or_lang: str = "en",
    *,
    is_tokenized: bool = False,
    disable_sbd: bool = False,
    parser_opts: Optional[Dict] = None,
    **kwargs,
) -> Language:
    """Initialise a spaCy-wrapped parser given a language or model and some options.

    :param parser: which parser to use. Parsers other than 'spacy' need to be installed separately. Valid options
           are 'spacy', 'stanfordnlp', 'stanza', 'udpipe'. Note that the spacy-* wrappers of those libraries need
           to be installed, e.g. spacy-stanza. Defaults to 'spacy'
    :param model_or_lang: language model to use (must be installed). Defaults to an English model
    :param is_tokenized: indicates whether your text has already been tokenized (space-separated). For stanza and
           stanfordnlp, this will also cause sentence segmentation *only* to be done by splitting on new lines.
           See the documentation: https://stanfordnlp.github.io/stanfordnlp/tokenize.html
           See the documentation: https://stanfordnlp.github.io/stanza/tokenize.html
    :param disable_sbd: disables spaCy automatic sentence boundary detection (only works for spaCy)
    :param parser_opts: will be passed to the core pipeline. For spacy and udpipe, it will be passed to their
           `.load()` initialisations, for stanfordnlp and stanza `pipeline_opts` is passed to their `.Pipeline()`
           initialisations
    :param kwargs: options to be passed to the ConllFormatter initialisation
    :return: an initialised Language object; the parser
    """
    parser_opts = {} if parser_opts is None else parser_opts

    if parser == "spacy":
        nlp = spacy.load(model_or_lang, **parser_opts)
        if is_tokenized:
            nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name="prevent-sbd", before="parser")
    elif parser == "stanfordnlp":
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        snlp = stanfordnlp.Pipeline(lang=model_or_lang,
                                    tokenize_pretokenized=is_tokenized,
                                    **parser_opts)
        nlp = StanfordNLPLanguage(snlp)
    elif parser == "stanza":
        import stanza
        from spacy_stanza import StanzaLanguage

        snlp = stanza.Pipeline(lang=model_or_lang,
                               tokenize_pretokenized=is_tokenized,
                               **parser_opts)
        nlp = StanzaLanguage(snlp)
    elif parser == "udpipe":
        import spacy_udpipe

        nlp = spacy_udpipe.load(model_or_lang, **parser_opts)
    else:
        raise ValueError(
            "Unexpected value for 'parser'. Options are: 'spacy', 'stanfordnlp', 'stanza', 'udpipe'"
        )

    conllformatter = ConllFormatter(nlp, **kwargs)
    nlp.add_pipe(conllformatter, last=True)

    return nlp
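# Hedged usage sketch for init_parser: spacy_conll's ConllFormatter
# registers a `conll_str` extension on Doc, so the stanza parse can be
# dumped in CoNLL-U form.
nlp = init_parser("stanza", "en")
doc = nlp("She sells seashells by the seashore.")
print(doc._.conll_str)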
# coding=utf-8
import pke
import stanza
import json
from analizer import KeywordExtractor
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from spacy_stanza import StanzaLanguage
import re

snlp = stanza.Pipeline(lang='ru', processors='tokenize,pos,lemma')
spacy_pipelines = StanzaLanguage(snlp)
ps_ru = SnowballStemmer("russian")
ps_eng = SnowballStemmer("english")

DATASET = 'mlong'
DATASET_FILE = "./cyberleninka_" + DATASET + ".txt"
with open(DATASET_FILE, encoding='utf-8') as fp:
    datafiles = json.load(fp)
datafiles = [d for d in datafiles if d is not None]
print(len(datafiles))

START_ELE = 0
datafiles = datafiles[START_ELE:]
count = 0
true_positive = 0
models = {
    SupportedLanguages.English:
        spacy.load(spacy_models[SupportedLanguages.English]),
    SupportedLanguages.German:
        spacy.load(spacy_models[SupportedLanguages.German]),
    SupportedLanguages.Danish:
        spacy.load(spacy_models[SupportedLanguages.Danish]),
    SupportedLanguages.Dutch:
        spacy.load(spacy_models[SupportedLanguages.Dutch]),
    SupportedLanguages.Italian:
        spacy.load(spacy_models[SupportedLanguages.Italian]),
    SupportedLanguages.Portuguese:
        spacy.load(spacy_models[SupportedLanguages.Portuguese]),
    # SupportedLanguages.Russian: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="ru")),
    SupportedLanguages.Serbian: StanzaLanguage(stanza.Pipeline(lang="sr")),
    # SupportedLanguages.Bulgarian: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="bg")),
    SupportedLanguages.Slovene: StanzaLanguage(stanza.Pipeline(lang="sl"))
    # SupportedLanguages.Hungarian: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="hu")),
    # SupportedLanguages.Estonian: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="et")),
    # SupportedLanguages.Basque: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="eu")),
    # SupportedLanguages.Irish: StanfordNLPLanguage(stanfordnlp.Pipeline(lang="ga")),
}
logger.info('loaded vocabulary\n')


def lemmatizer(doc, spacy_model):
    # A float here is typically a pandas NaN; return an empty Doc for it.
    if type(doc) == float:
        return spacy_model.make_doc("")