import pytest
from spacy.lang.xx import MultiLanguage


def test_doc_noun_chunks_not_implemented():
    """Test that a language without a noun_chunks iterator raises NotImplementedError."""
    text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
    nlp = MultiLanguage()
    doc = nlp(text)
    with pytest.raises(NotImplementedError):
        _ = list(doc.noun_chunks)  # noqa: F841
import spacy
from spacy.lang.xx import MultiLanguage
from spacy.language import Language


def getModel(language: str, language_models: dict = LANGUAGE_MODELS) -> Language:
    """Load the spaCy model registered for `language`, falling back to MultiLanguage."""
    if language not in language_models:
        print("Language not supported. Falling back to MultiLanguage.")
        return MultiLanguage()
    return spacy.load(language_models[language], disable=["parser"])
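# Usage sketch (not from the original module): assumes LANGUAGE_MODELS is a dict
# mapping ISO codes to installed spaCy packages, e.g. {"en": "en_core_web_sm"},
# and that "zz" is not one of its keys.
nlp = getModel("en")                  # loads the registered package with the parser disabled
doc = nlp("getModel returns a ready spaCy pipeline.")
print([token.text for token in doc])
fallback = getModel("zz")             # unknown code -> prints a notice, returns MultiLanguage()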
from spacy.lang.ar import Arabic
from spacy.lang.da import Danish
from spacy.lang.el import Greek
from spacy.lang.en import English
from spacy.lang.xx import MultiLanguage


def predict_spans(model, text: str, language: str = "en"):
    """Return the character offsets in `text` that the model predicts as toxic."""
    if language == "en":
        nlp = English()
    elif language == "el":
        nlp = Greek()
    elif language == "da":
        nlp = Danish()
    elif language == "ar":
        nlp = Arabic()
    else:
        nlp = MultiLanguage()
    tokenizer = nlp.tokenizer
    tokens = tokenizer(text)

    sentences = []
    tokenised_text = []
    cleaned_tokens = []
    cleaned_index = 0
    # Drop whitespace-only tokens from the model input, but keep every token
    # paired with the index it maps to in the cleaned sequence.
    for token in tokens:
        if not token.text.isspace():
            tokenised_text.append(token.text)
            indexed_token = IndexedToken(token, cleaned_index)
            cleaned_tokens.append(indexed_token)
            cleaned_index += 1
        else:
            indexed_token = IndexedToken(token, token.i)
            cleaned_tokens.append(indexed_token)
    sentences.append(tokenised_text)

    predictions, raw_outputs = model.predict(sentences, split_on_space=False)

    span_predictions = []
    sentence_prediction = predictions[0]
    for cleaned_token in cleaned_tokens:
        if cleaned_token.clean_index >= len(sentence_prediction):
            break
        if cleaned_token.token.text.isspace():
            continue
        word_prediction = sentence_prediction[cleaned_token.clean_index]
        toxicness = word_prediction[cleaned_token.token.text]
        if toxicness == "TOXIC":
            location = cleaned_token.token.idx
            if len(span_predictions) > 0:
                last_index = span_predictions[-1]
                # Bridge the single-space gap between two adjacent toxic tokens.
                if location == last_index + 2:
                    span_predictions.append(location - 1)
            length = len(cleaned_token.token.text)
            for i in range(length):
                span_predictions.append(location + i)
    return span_predictions
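# Usage sketch (illustrative, not from the original file): `model` is assumed to
# be a simpletransformers NERModel fine-tuned with per-word "TOXIC" labels, since
# predict_spans calls model.predict(sentences, split_on_space=False). The
# checkpoint path below is hypothetical.
from simpletransformers.ner import NERModel

model = NERModel("bert", "outputs/toxic-span-model", use_cuda=False)
spans = predict_spans(model, "you are a stupid example", language="en")
print(spans)  # character offsets flagged as toxic, e.g. [10, 11, 12, 13, 14, 15]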
from typing import List


def spacy_tokenize(sents: List[str], lang: str) -> List[List[str]]:
    import spacy
    try:
        nlp = spacy.load(lang.split('-')[0])
    except OSError:
        try:
            cls = spacy.util.get_lang_class(lang.split('-')[0])
            nlp = cls()
        except ImportError:
            utils.Logging.warn(f"spaCy does not support language \"{lang}\", falling back to default model")
            from spacy.lang.xx import MultiLanguage
            nlp = MultiLanguage()
    # Tokenizing may require additional dependencies; run the pipeline once up
    # front, otherwise it sometimes crashes for no apparent reason.
    nlp('a')
    tok_sents = []
    for sent in sents:
        sent = sent.strip()
        tokens = [token.text for token in nlp.make_doc(sent)] if sent != '' else []
        tok_sents.append(tokens)
    return tok_sents
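# Illustrative call (not part of the original helper). The region suffix "-US"
# is stripped before lookup; if no "en" model package is installed, the function
# falls back to the blank English tokenizer, so the output is roughly:
print(spacy_tokenize(["Hello, world!", "   "], "en-US"))
# -> [['Hello', ',', 'world', '!'], []]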
def predict_tokens(self, text: str, language: str = "en"):
    toxic_spans = contiguous_ranges(predict_spans(self.model, text))
    if language == "en":
        nlp = English()
    else:
        nlp = MultiLanguage()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    tokens = tokenizer(text)
    output_tokens = []
    for token in tokens:
        is_toxic = False
        for toxic_span in toxic_spans:
            if toxic_span[0] <= token.idx <= toxic_span[1]:
                is_toxic = True
                break
        predicted_token = PredictedToken(token.text, is_toxic)
        output_tokens.append(predicted_token)
    return output_tokens
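# Sketch of the assumed behaviour of the project-local helper contiguous_ranges:
# it appears to collapse the flat offset list from predict_spans into inclusive
# (start, end) pairs, which is what the toxic_span[0] <= token.idx <= toxic_span[1]
# check above expects. Illustrative re-implementation, not the project's code.
def contiguous_ranges(offsets):
    ranges = []
    for offset in sorted(offsets):
        if ranges and offset == ranges[-1][1] + 1:
            ranges[-1][1] = offset           # extend the current run
        else:
            ranges.append([offset, offset])  # start a new run
    return [tuple(r) for r in ranges]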
from collections import defaultdict, Counter
import csv
import re

import numpy as np
import pandas as pd
import pymorphy2
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from spacy.lang.xx import MultiLanguage

nlp = MultiLanguage()
morph = pymorphy2.MorphAnalyzer()
tokenizer = nlp.Defaults.create_tokenizer(nlp)
stop_words = set(stopwords.words('russian'))

POSITIVE = "Переваги:\xa0"   # "Advantages:" marker on the scraped page (keep non-breaking space)
NEGATIVE = "Недоліки:\xa0"   # "Disadvantages:" marker
FEEDBACK_FILE = "scrape_feedbacks.csv"
vocabulary_pr = {}


def get_data(url, counter, cls):
    url = url.format(counter)
    resp = requests.get(url)
    page = re.compile(r"page=(\d+)/").search(resp.url)
    if page:
        curr_page = int(page.groups()[0])
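# Not part of the scraped module above: a minimal sketch of how the objects it
# sets up (tokenizer, stop_words, morph) are typically combined to normalise a
# review before TF-IDF vectorisation. The function name is hypothetical.
def normalise_review(text: str) -> list:
    tokens = [t.text.lower() for t in tokenizer(text) if t.is_alpha]
    lemmas = [morph.parse(tok)[0].normal_form for tok in tokens]
    return [lemma for lemma in lemmas if lemma not in stop_words]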
# python -c "import spacy; print (spacy.__version__)" # python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))" # python -c "import os; import en_core_web_sm; print(os.path.dirname(en_core_web_sm.__file__))" import os import spacy NLP = spacy.load('en') # load model with shortcut link "en" NLP = spacy.load('en_core_web_sm') # load model package "en_core_web_sm" # NLP = spacy.load('/home/wangdi498/testing/en_core_web_sm') # load package from a directory import en_core_web_sm NLP = en_core_web_sm.load() # standard import: from spacy.lang.xx import MultiLanguage nlp = MultiLanguage() # lazy import: from spacy.util import get_lang_class nlp = get_lang_class('xx') def Tokenization(): print("\nThe outcomes of Tokenization are:") nlp = spacy.load('en_core_web_sm') doc = nlp(u"Apple isn't looking at buying U.S.A. startup for $1 billion.") for token in doc: print('\t', token.text) def Tagging():
from spacy.lang.xx import MultiLanguage


def split_spacy_m(text):
    nlp_m = MultiLanguage()
    nlp_m.add_pipe(nlp_m.create_pipe('sentencizer'))
    return prepare_spacy(text, nlp_m)