def create_udpipe_pipeline(lang: str) -> UDPipeLanguage:
    # Load the UDPipe model for `lang`, downloading it first if it is missing.
    try:
        pipeline = spacy_udpipe.load(lang)
    except Exception:
        spacy_udpipe.download(lang)
        pipeline = spacy_udpipe.load(lang)
    if pipeline is None:
        del pipeline
        raise ValueError('The `{0}` language cannot be loaded for UDPipe!'.format(lang))
    return pipeline
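A minimal usage sketch for the helper above, assuming `spacy_udpipe` is installed; the example sentence is illustrative only:

import spacy_udpipe  # required by create_udpipe_pipeline

# The first call may download the "en" model, then returns a spaCy-compatible pipeline.
nlp = create_udpipe_pipeline("en")
doc = nlp("UDPipe provides tokenization, tagging, lemmatization and parsing.")
print([(token.text, token.pos_, token.dep_) for token in doc])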
def __init__(self, lang='EN'):
    self.lang = lang
    self.tokenize_functions = {
        "EN": self._tokenizeDataEN,
        "CS": self._tokenizeDataCS
    }
    self.is_trained = False  # todo
    self.nlp = spacy_udpipe.load("en") if lang == 'EN' else spacy_udpipe.load("cs")
    self.default_stop_words = False
    self.stop_words = []
    self._vectorizer = None
def test_pipe(lang: str) -> None:
    nlp = load(lang=lang)
    text = "spacy-udpipe still does not support multiprocess execution."
    doc = nlp(text)
    del nlp
    nlp = load(lang=lang)
    texts = [text for _ in range(2)]
    docs = list(nlp.pipe(texts, n_process=-1))
    assert len(docs) == len(texts)
    assert docs[0].to_json() == doc.to_json()
    assert docs[-1].to_json() == doc.to_json()
def get_pos_ud_head(training_data, lang='en', document=None):
    try:
        ud_model = spacy_udpipe.load(lang)
        print(lang, ' model is used.')
    except Exception:
        spacy_udpipe.download(lang)
        print('downloaded model: ', lang)
        ud_model = spacy_udpipe.load(lang)
    sent_pos = []
    sent_ud = []
    sent_head = []
    sent_tok = []
    # collect POS tag, dependency relation, head index and token text per sentence
    for line in training_data:
        temp_pos = []
        temp_ud = []
        temp_head = []
        temp_tok = []
        tag_sent = ud_model(line)
        for i, token in enumerate(tag_sent):
            temp_pos.append(token.pos_)
            temp_ud.append(token.dep_)
            temp_tok.append(token.text)
            # 1-based head index relative to the start of the sentence
            # (alternative convention, not used here: head = 0 when token.head == token, i.e. the root)
            head = token.head.i - tag_sent[0].i + 1
            temp_head.append(head)
        sent_pos.append(temp_pos)
        sent_ud.append(temp_ud)
        sent_head.append(temp_head)
        sent_tok.append(temp_tok)
    return sent_pos, sent_ud, sent_head, sent_tok
def test_serialization(lang: str) -> None:
    with tempfile.TemporaryDirectory() as tdir:
        nlp = load(lang=lang)
        nlp.to_disk(tdir)
        udpipe_model = UDPipeModel(lang=lang)
        nlp = spacy.load(tdir, udpipe_model=udpipe_model)
def main():
    docs_dir = '/home/zal/Devel/OperaSpNLP/docs'
    output_dir = '/home/zal/Devel/OperaSpNLP/output/udpipe/ner'
    file_ext = 'txt'
    nlp = spacy_udpipe.load('es-ancora')
    entity_dict = defaultdict(list)
    for file_path in glob(osp.join(docs_dir, f'*.{file_ext}')):
        doc_text = unidecode(open(file_path, 'r').read())
        doc = nlp(doc_text)
        for ent in doc.ents:
            entity_dict[ent.label_].append(ent.text)
    print(f'Total labels: {len(entity_dict.keys())}')
    print(f'{entity_dict.keys()}')
    for ner_label in entity_dict.keys():
        print(f'===================================={ner_label}: {len(entity_dict[ner_label])}')
        ner_label_count_tuple = count_and_tuple(entity_dict[ner_label])
        print_count_tuple(ner_label_count_tuple)
        save_path = osp.join(output_dir, f'{ner_label}.out')
        save_count_tuple(ner_label_count_tuple, save_path)
def _default_tagger(self):
    try:
        import spacy_udpipe
    except ImportError:
        raise ImportError(
            'You are missing the POS tagger; try `pip install spacy_udpipe`')
    spacy_udpipe.download('en')
    return spacy_udpipe.load('en')
def _init(lang='cs', download_model=None):
    # available Czech models: 'cs', 'cs-pdt', 'cs-cac', 'cs-fictree', 'cs-cltt'
    if download_model:
        spacy_udpipe.download(download_model)
    nlp = spacy_udpipe.load(lang)
    syllables = SpacySyllables(nlp, lang=lang)
    nlp.add_pipe(syllables)
    return nlp
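A hedged usage sketch for the helper above; it assumes the token extensions (`._.syllables`, `._.syllables_count`) that spacy_syllables documents, and the Czech example sentence is illustrative only:

nlp = _init(lang='cs', download_model='cs')
doc = nlp("Toto je jednoduchá věta.")
for token in doc:
    print(token.text, token._.syllables, token._.syllables_count)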
def test_morph_exception() -> None:
    assert spacy.__version__ <= SPACY_VERSION
    lang = RO
    text = "Ce mai faci?"
    download(lang=lang)
    try:
        nlp = load(lang=lang)
        assert nlp._meta["lang"] == f"udpipe_{lang}"
        doc = nlp(text)
    except ValueError:
        nlp = load(lang=lang, ignore_tag_map=True)
        assert nlp._meta["lang"] == f"udpipe_{lang}"
        doc = nlp(text)
    assert doc
def test_feats() -> None:
    lang = RU
    text = "Я люблю машинное обучение."
    download(lang=lang)
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"
    doc = nlp(text)
    assert doc[2]._.feats == "Case=Acc|Degree=Pos|Gender=Neut|Number=Sing"
def __init__(self):
    with resources.path("src.resources", "it-sentiment_lexicon.lmf.xml") as bad_words:
        self.__bad_words = {
            word.rstrip().lower()
            for word in open(bad_words, 'r', encoding='utf8')
            if word.rstrip().lower() != ''
        }
    self.__stemmer = SnowballStemmer('italian')
    self.__nlp = spacy_udpipe.load("it-postwita")
def test_serialization(lang: str) -> None:
    with tempfile.TemporaryDirectory() as tdir:
        nlp = load(lang=lang)
        doc = nlp("A simple sentence.")
        nlp.to_disk(tdir)
        del nlp
        nlp = spacy.load(tdir)
        same_doc = nlp("A simple sentence.")
    assert doc.to_json() == same_doc.to_json()
def __init__(self, lang_or_model, nlp_str):
    if nlp_str == "stanza":
        self.nlp = stanza.Pipeline(
            lang_or_model,
            processors='tokenize,pos,lemma,depparse',
            use_gpu=True,
            pos_batch_size=2000,
            depparse_batch_size=2000)
    elif nlp_str == "udpipe":
        self.nlp = spacy_udpipe.load(lang_or_model)
        self.tagmap = self.nlp.vocab.morphology.tag_map
def main(path_to_input, out_dir=''):
    """
    Generate plot for an input file or multiple files in a folder.
    """
    nlp = spacy_udpipe.load('nb')
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    if os.path.isdir(path_to_input):  # folder as input
        for rel_f in sorted(os.listdir(path_to_input)):
            file_name, ext = os.path.splitext(rel_f)
            out_f = os.path.join(out_dir, file_name + ".png")
            if ext == ".uio":
                path_to_file = os.path.join(path_to_input, rel_f)
                pedigree = rel_to_linkage(path_to_file, nlp, out_dir)
                linkage_f = os.path.join(out_dir, file_name + '.ped')
                names = ",".join(list(pedigree.id_mapping.keys()))
                #linkage_to_plot(linkage_f, out_f, names)  # TO DO: finish
    else:  # file as input
        path, file_name = os.path.split(path_to_input)
        file_name, ext = os.path.splitext(file_name)
        linkage_f = os.path.join(out_dir, file_name + '.ped')
        out_f = os.path.join(out_dir, file_name + ".png")
        if path_to_input.endswith('.uio'):  # TO DO: load pedigree obj from .pkl
            pedigree = rel_to_linkage(path_to_input, nlp, out_dir)
            names = ",".join(list(pedigree.id_mapping.keys()))
            linkage_to_plot(linkage_f, out_f, names)
        elif file_name == 'example1_gold':  # get a gold standard plot
            id_mapping = {
                'pasient': 1, 'mor': 2, 'far': 5, 'farmor': 6, 'farfar': 7,
                'farbror': 8, 'søster': 9, 'fetter': 10, 'fetter2': 11,
                'tante': 12, 'bror': 13, 'barn': 14, 'partner': 15, 'barn2': 16
            }
            names = ",".join(list(id_mapping.keys()))
            linkage_to_plot(linkage_f, out_f, names)
        elif path_to_input.endswith('.ped'):
            print('in gen')
            linkage_to_plot(linkage_f, out_f)  # TO DO: parse additional arg with names?
        else:
            raise ValueError('Input file(s) must have the extension .ped or .uio')
def get_doc(language="ar", size=1):
    PATH = "/Users/abdulrahimqaddoumi/Desktop/" + language
    clean_texts = generate_clean_text(PATH)
    spacy_udpipe.download(language)
    nlp = spacy_udpipe.load(language)
    text_length = len(clean_texts)
    hundredth = text_length // 100
    start_time = time.time()
    for i in range(size):
        print(i * hundredth, (1 + i) * hundredth)
        doc = nlp(clean_texts[:100000])
    print("--- %s seconds ---" % (time.time() - start_time))
    return doc
def get_udpipe_parser(lang='ru'):
    global udpipe_nlp
    if udpipe_nlp.get(lang, None) is None:
        try:
            spacy_udpipe.download(lang)  # download the model for `lang`
            udpipe_nlp[lang] = spacy_udpipe.load(lang)
            # nlp.add_pipe(nlp.create_pipe('sentencizer'))
        except Exception:
            print(f'error loading udpipe model for {lang}')
    # parser_nlp.add_pipe(parser_nlp.create_pipe('sentencizer'))
    return udpipe_nlp[lang]
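The function above relies on a module-level cache that is not shown; a minimal sketch of the assumed setup and a call (the Russian example sentence is illustrative only):

import spacy_udpipe

udpipe_nlp = {}  # per-language cache consulted by get_udpipe_parser

nlp = get_udpipe_parser('ru')
doc = nlp('Мама мыла раму.')
print([(t.text, t.lemma_) for t in doc])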
def compute_gram_diversity(sentences, lang="en", system_name="", freq_voc=None):
    '''
    Compute lexical diversity metrics over the lemmas of the given sentences.
    :param sentences: the sentences from the translation system output
    :param lang: language code used to load the UDPipe tokenizer
    :param system_name: name of the translation system (used when extracting lemmas)
    :param freq_voc: optional frequency vocabulary passed to the lemma extraction
    :returns: a tuple of scores (Simpson diversity, inverse Simpson diversity, Shannon diversity)
    '''
    nlpD = spacy_udpipe.load(lang).tokenizer
    nlpD.max_length = 300000000
    lemmas = get_lemmas(sentences, nlpD, system_name, freq_voc)
    return (compute_simpDiv(lemmas), compute_invSimpDiv(lemmas), compute_shannonDiv(lemmas))
def test_spacy_udpipe(lang: str) -> None:
    nlp = load(lang=lang)
    text = "Attention aux articles contractés!"
    doc = nlp(text=text)
    assert [t.orth_ for t in doc] == ["Attention", "à", "les", "articles", "contractés", "!"]
    pos = [{"INTJ", "NOUN"}, {"ADP"}, {"DET"}, {"NOUN"}, {"VERB", "ADJ"}, {"PUNCT"}]
    for i, t in enumerate(doc):
        assert t.pos_ in pos[i]
    assert [t.head.i for t in doc] == [0, 3, 3, 0, 3, 0]
    dep = [{"ROOT", "root"}, {"case"}, {"det"}, {"nmod", "obl", "obl:arg"}, {"acl", "amod"}, {"punct"}]
    for i, t in enumerate(doc):
        assert t.dep_ in dep[i]
def main():
    docs_dir = '/home/zal/Devel/OperaSpNLP/docs'
    output_dir = '/home/zal/Devel/OperaSpNLP/output/udpipe/pos'
    file_ext = 'txt'
    nlp = spacy_udpipe.load('es-ancora')
    neuralcoref.add_to_pipe(nlp)
    pos_dict = defaultdict(list)
    for file_path in glob(osp.join(docs_dir, f'*.{file_ext}')):
        doc_text = unidecode(open(file_path, 'r').read())
        doc = nlp(doc_text)
        for token in doc:
            pos_dict[token.pos_].append(token.text)
    for pos_tag in pos_dict.keys():
        print(f'{pos_tag}: {len(pos_dict[pos_tag])}')
        save_path = osp.join(output_dir, f'{pos_tag}.out')
        with open(save_path, 'w') as fout:
            fout.write('\n'.join([word for word in pos_dict[pos_tag]]))
import spacy_udpipe
import pandas as pd
from pprint import pprint
import pickle

spacy_udpipe.download("en")  # download English model
nlp = spacy_udpipe.load("en")

#text = "Wikipedia is a free online encyclopedia, created and edited by volunteers around the world. Das ist ein zweiter Satz."
#doc = nlp(text)
#sentences = [sent.string.strip() for sent in doc.sents]
#print(sentences)

# with open('../bert_final/reviews_as_raw_text.txt') as fopen:
#     reviews = fopen.read().split('\n')[:-1]
# #print(reviews)
# df = pd.DataFrame({"review_text": reviews})
# #print(df)
#
# testliste = []
# testliste2 = []
# for index, review in enumerate(df["review_text"]):
#     rev_doc = nlp(review)
#     testliste2 = [sent.string.strip() for sent in rev_doc.sents]
#     #print(testliste2)
#     testliste.append(testliste2)
# pprint(testliste)
# with open('outfile', 'wb') as fp:
#     pickle.dump(testliste, fp)
import os
import re

import stanza
import spacy_udpipe

EXTERNAL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'external_data')

nlp_udpipe = spacy_udpipe.load(lang="hy")
nlp_stanza = stanza.Pipeline(use_gpu=False, lang='hy',
                             processors='tokenize, mwt, pos, lemma, depparse')


def lemmatizer(text: str):
    doc = nlp_stanza(text)
    return [word.lemma for sentence in doc.sentences for word in sentence.words]


def pos_tagger(text: str):
    doc = nlp_stanza(text)
    return [word.pos for sentence in doc.sentences for word in sentence.words]


def word_tokenize(text: str, remove_punctuation=False):
    text = remove_punct(text) if remove_punctuation else text
    doc = nlp_udpipe(text)
    return [word.text for word in doc]
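A short usage sketch for the helpers above; the Armenian example string ("hello world") is illustrative only, and `remove_punct` is assumed to be defined elsewhere in the module:

print(word_tokenize("Բարեւ աշխարհ"))   # tokens from the UDPipe pipeline
print(lemmatizer("Բարեւ աշխարհ"))      # lemmas from the Stanza pipeline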
def init_parser(
    parser: str = "spacy",
    model_or_lang: str = "en",
    *,
    is_tokenized: bool = False,
    disable_sbd: bool = False,
    parser_opts: Optional[Dict] = None,
    **kwargs,
) -> Language:
    """Initialise a spacy-wrapped parser given a language or model and some options.
    :param parser: which parser to use. Parsers other than 'spacy' need to be installed separately. Valid options are
           'spacy', 'stanfordnlp', 'stanza', 'udpipe'. Note that the spacy-* wrappers of those libraries need to be
           installed, e.g. spacy-stanza. Defaults to 'spacy'
    :param model_or_lang: language model to use (must be installed). Defaults to an English model
    :param is_tokenized: indicates whether your text has already been tokenized (space-separated). For stanza and
           stanfordnlp, this will also cause sentence segmentation *only* to be done by splitting on new lines.
           See the documentation: https://stanfordnlp.github.io/stanfordnlp/tokenize.html
           See the documentation: https://stanfordnlp.github.io/stanza/tokenize.html
    :param disable_sbd: disables spaCy automatic sentence boundary detection (only works for spaCy)
    :param parser_opts: will be passed to the core pipeline. For spacy and udpipe, it will be passed to their
           `.load()` initialisations, for stanfordnlp and stanza `parser_opts` is passed to their `.Pipeline()`
           initialisations
    :param kwargs: options to be passed to the ConllFormatter initialisation
    :return: an initialised Language object; the parser
    """
    parser_opts = {} if parser_opts is None else parser_opts

    if parser == "spacy":
        nlp = spacy.load(model_or_lang, **parser_opts)
        if is_tokenized:
            nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name="prevent-sbd", before="parser")
    elif parser == "stanfordnlp":
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        snlp = stanfordnlp.Pipeline(lang=model_or_lang, tokenize_pretokenized=is_tokenized, **parser_opts)
        nlp = StanfordNLPLanguage(snlp)
    elif parser == "stanza":
        import stanza
        from spacy_stanza import StanzaLanguage

        snlp = stanza.Pipeline(lang=model_or_lang, tokenize_pretokenized=is_tokenized, **parser_opts)
        nlp = StanzaLanguage(snlp)
    elif parser == "udpipe":
        import spacy_udpipe

        nlp = spacy_udpipe.load(model_or_lang, **parser_opts)
    else:
        raise ValueError("Unexpected value for 'parser'. Options are: 'spacy', 'stanfordnlp', 'stanza', 'udpipe'")

    conllformatter = ConllFormatter(nlp, **kwargs)
    nlp.add_pipe(conllformatter, last=True)

    return nlp
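A possible call for this older spacy_conll-style initialiser; it assumes the ConllFormatter registers the `conll_str` document extension, as described in spacy_conll's documentation:

nlp = init_parser(parser="udpipe", model_or_lang="en")
doc = nlp("A simple sentence.")
print(doc._.conll_str)  # CoNLL-U representation of the parsed document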
znaki=[",","/",".","'",'"',"&","!","?",":",";","«","»"] for word in spl_text: for z in znaki: if z in word: word = word.replace(z, "") pre_text.append(word) print(pre_text) !pip install spacy-udpipe !pip install pymorphy2 import spacy_udpipe spacy_udpipe.download("ru") nlp = spacy_udpipe.load("ru") text = "на столе стоит протекшая банка" doc = nlp(text) for token in doc: print(token.text, token.lemma_, token.pos_, token.dep_) d = nlp(txt) yo_true = 0 yo_missed = 0 tags = {'ADJS': 'ADJ', 'ADJF': 'ADJ', 'PRTF': 'VERB', 'PRTS': 'VERB', 'NOUN': 'NOUN'} ts = [] for i in pre_text: if 'ё' in str(i):
@author: chenfish
"""

# MDD

import spacy_udpipe
from nltk.tokenize import word_tokenize

# mockup, list of sents
training_data = [
    'Churkin said, that the UN Security, Council meeting on Crimea was useful.'
]

# load the UD model of English
ud_model = spacy_udpipe.load("en")

sent_pos = []
sent_ud = []
sent_head = []

# get pos and ud tag
for line in training_data:
    temp_pos = []
    temp_ud = []
    temp_head = []
    tag_sent = ud_model(line)
    for i, token in enumerate(tag_sent):
print("Stanza model initialization ends") print("SpaCy model initialization starts") spacy_en = spacy.load("en_core_web_sm") spacy_zh = spacy.load("zh_core_web_sm") spacy_es = spacy.load("es_core_news_sm") spacy_ja = spacy.load("ja_core_news_sm") spacy_de = spacy.load("de_core_news_sm") spacy_fr = spacy.load("fr_core_news_sm") spacy_it = spacy.load("it_core_news_sm") spacy_nl = spacy.load("nl_core_news_sm") spacy_pt = spacy.load("pt_core_news_sm") print("SpaCy model initialization ends") print("UDpipe model initialization starts") udpipe_en = spacy_udpipe.load("en") udpipe_zh = spacy_udpipe.load("zh") udpipe_es = spacy_udpipe.load("es") udpipe_ja = spacy_udpipe.load("ja") udpipe_de = spacy_udpipe.load("de") udpipe_fr = spacy_udpipe.load("fr") udpipe_it = spacy_udpipe.load("it") udpipe_nl = spacy_udpipe.load("nl") udpipe_pt = spacy_udpipe.load("pt") udpipe_ar = spacy_udpipe.load("ar") udpipe_ru = spacy_udpipe.load("ru") print("UDpipe model initialization ends") model_lang_map["spacy"] = { "eng": spacy_en, "cmn": spacy_zh,
def __init__(self, language='fi-tdt'):
    self.name = f'UDPipe-{language}'
    self.nlp = spacy_udpipe.load(language)
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from tqdm import trange
import spacy_udpipe
import collections
import json

nlp = spacy_udpipe.load('en')
# tokenizer = Tokenizer(nlp.vocab)


def load_json(js_path, nlp, output):
    with open(js_path) as fi:
        with open(output, 'w') as fo:
            line = fi.readline().strip()
            while line:
                mrp = json.loads(line)
                if 'input' in mrp:
                    tokens, lemmas, pos = [], [], []
                    for t in nlp(mrp['input']):
                        tokens.append(t.text)
                        lemmas.append(t.lemma_)
                        pos.append(t.pos_)
                    companions = []
                    # length = 0
                    init = 0
                    for idx, token in enumerate(tokens):
                        begin = mrp['input'][init:].find(token)
                        # print(mrp['input'][init+begin:init+begin+len(token)])
def __init__(self, language='ru', add_postags=True, vocabulary=None):
    self.pipeline = spacy_udpipe.load(language)
    self.add_postags = add_postags
    self.vocabulary = vocabulary
def init_parser(
    model_or_lang: str,
    parser: str,
    *,
    is_tokenized: bool = False,
    disable_sbd: bool = False,
    exclude_spacy_components: Optional[List[str]] = None,
    parser_opts: Optional[Dict] = None,
    **kwargs,
) -> Language:
    """Initialise a spacy-wrapped parser given a language or model and some options.
    :param model_or_lang: language model to use (must be installed for spaCy but will be automatically downloaded for
           stanza and UDPipe)
    :param parser: which parser to use. Parsers other than 'spacy' need to be installed separately. Valid options are
           'spacy', 'stanza', 'udpipe'. Note that the spacy-* wrappers of those libraries need to be installed, e.g.
           spacy-stanza
    :param is_tokenized: indicates whether your text has already been tokenized (space-separated). When using 'spacy',
           this option also disables sentence segmentation completely. For stanza, sentence segmentation will *only*
           be done by splitting on new lines. See the stanza documentation for more:
           https://stanfordnlp.github.io/stanza/tokenize.html#start-with-pretokenized-text
           This option does not affect UDPipe.
    :param disable_sbd: disables automatic sentence boundary detection in spaCy and stanza. For stanza, make sure that
           your input is in the correct format, that is: sentences must be separated by two new lines. If you want to
           disable both tokenization and sentence segmentation in stanza, do not enable this option but instead only
           use `is_tokenized` and make sure your sentences are separated by only one new line.
           See the stanza documentation for more:
           https://stanfordnlp.github.io/stanza/tokenize.html#tokenization-without-sentence-segmentation
           This option does not affect UDPipe.
    :param exclude_spacy_components: spaCy components to exclude from the pipeline, which can greatly improve
           processing speed. Only works when using spaCy as a parser.
    :param parser_opts: will be passed to the core pipeline. For spacy, it will be passed to its `.load()`
           initialisation, for stanza `parser_opts` is passed to its `.load_pipeline()` initialisation. UDPipe does
           not have any keyword arguments
    :param kwargs: options to be passed to the ConllFormatter initialisation
    :return: an initialised Language object; the parser
    """
    parser_opts = {} if parser_opts is None else parser_opts

    if parser == "spacy":
        exclude = ["senter", "sentencizer"] if disable_sbd or is_tokenized else []
        exclude = exclude + exclude_spacy_components if exclude_spacy_components is not None else exclude
        nlp = spacy.load(model_or_lang, exclude=exclude, **parser_opts)
        if is_tokenized:
            nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
        if disable_sbd or is_tokenized:
            try:
                nlp.add_pipe("disable_sbd", before="parser")
            except ValueError:
                nlp.add_pipe("disable_sbd", first=True)
    elif parser == "stanza":
        import spacy_stanza  # noqa: F811
        import stanza

        verbose = parser_opts.pop("verbose", False)
        stanza.download(model_or_lang, verbose=verbose)
        nlp = spacy_stanza.load_pipeline(
            model_or_lang,
            verbose=verbose,
            tokenize_no_ssplit=disable_sbd,
            tokenize_pretokenized=is_tokenized,
            **parser_opts,
        )
    elif parser == "udpipe":
        import spacy_udpipe  # noqa: F811

        spacy_udpipe.download(model_or_lang)
        nlp = spacy_udpipe.load(model_or_lang)
    else:
        raise ValueError("Unexpected value for 'parser'. Options are: 'spacy', 'stanza', 'udpipe'")

    nlp.add_pipe("conll_formatter", config=kwargs, last=True)

    return nlp
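A usage sketch for this spaCy 3-era variant (note the swapped argument order relative to the older version above), again assuming the `conll_str` document extension registered by the 'conll_formatter' component:

nlp = init_parser("en", "udpipe")
doc = nlp("A simple sentence.")
print(doc._.conll_str)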
def _get_model(self, iso):
    import spacy_udpipe
    nlp = spacy_udpipe.load(iso)
    return nlp