Example #1
import pytest

from spacy.lang.xx import MultiLanguage


def test_doc_noun_chunks_not_implemented():
    """Test that a language without a noun_chunks iterator raises NotImplementedError."""
    text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
    nlp = MultiLanguage()
    doc = nlp(text)
    with pytest.raises(NotImplementedError):
        _ = list(doc.noun_chunks)  # noqa: F841
Example #2
import spacy
from spacy.language import Language
from spacy.lang.xx import MultiLanguage


def getModel(language: str,
             language_models: dict = LANGUAGE_MODELS) -> Language:
    # LANGUAGE_MODELS maps language codes to installed spaCy model names (defined elsewhere).
    if language not in language_models:
        print("Language not supported. Running on MultiLanguage.")
        return MultiLanguage()

    return spacy.load(language_models[language], disable=["parser"])
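A minimal usage sketch; the LANGUAGE_MODELS mapping below is a hypothetical stand-in for the project's real dict:

# Hypothetical mapping, for illustration only.
LANGUAGE_MODELS = {
    "en": "en_core_web_sm",
    "de": "de_core_news_sm",
}

nlp = getModel("en")       # loads en_core_web_sm with the parser disabled
fallback = getModel("cs")  # not in the mapping -> falls back to MultiLanguage()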
Example #3
from spacy.lang.ar import Arabic
from spacy.lang.da import Danish
from spacy.lang.el import Greek
from spacy.lang.en import English
from spacy.lang.xx import MultiLanguage


def predict_spans(model, text: str, language: str = "en"):
    if language == "en":
        nlp = English()

    elif language == "el":
        nlp = Greek()

    elif language == "da":
        nlp = Danish()

    elif language == "ar":
        nlp = Arabic()

    else:
        nlp = MultiLanguage()

    tokenizer = nlp.tokenizer
    tokens = tokenizer(text)
    sentences = []
    tokenised_text = []
    cleaned_tokens = []
    cleaned_index = 0
    # Keep every token, but only non-whitespace tokens advance the "clean"
    # index that lines up with the model's per-token predictions.
    for token in tokens:
        if not token.text.isspace():
            tokenised_text.append(token.text)
            indexed_token = IndexedToken(token, cleaned_index)
            cleaned_tokens.append(indexed_token)
            cleaned_index += 1
        else:
            indexed_token = IndexedToken(token, token.i)
            cleaned_tokens.append(indexed_token)

    sentences.append(tokenised_text)

    predictions, raw_outputs = model.predict(sentences, split_on_space=False)
    span_predictions = []
    sentence_prediction = predictions[0]

    for cleaned_token in cleaned_tokens:

        # The model may emit fewer predictions than there are tokens.
        if cleaned_token.clean_index >= len(sentence_prediction):
            break

        # Whitespace tokens were never sent to the model; skip them.
        if cleaned_token.token.text.isspace():
            continue

        word_prediction = sentence_prediction[cleaned_token.clean_index]
        toxicness = word_prediction[cleaned_token.token.text]
        if toxicness == "TOXIC":
            location = cleaned_token.token.idx
            # If this token starts two characters after the last marked offset,
            # also mark the single gap character (usually a space) so adjacent
            # toxic words form one contiguous span.
            if len(span_predictions) > 0:
                last_index = span_predictions[-1]
                if location == last_index + 2:
                    span_predictions.append(location - 1)
            # Mark every character offset covered by this token.
            length = len(cleaned_token.token.text)
            for i in range(length):
                span_predictions.append(location + i)
    return span_predictions
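IndexedToken is not included in the snippet; here is a minimal sketch consistent with how it is used above (a hypothetical reconstruction, not the project's actual definition):

from typing import NamedTuple

from spacy.tokens import Token


class IndexedToken(NamedTuple):
    # A spaCy token paired with its index into the whitespace-stripped token list.
    token: Token
    clean_index: int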
Example #4
from typing import List


def spacy_tokenize(sents: List[str], lang: str) -> List[List[str]]:
    import spacy
    try:
        nlp = spacy.load(lang.split('-')[0])
    except OSError:
        try:
            cls = spacy.util.get_lang_class(lang.split('-')[0])
            nlp = cls()
        except ImportError:
            utils.Logging.warn(f"spaCy does not support language \"{lang}\", falling back to default model")
            from spacy.lang.xx import MultiLanguage
            nlp = MultiLanguage()
        # tokenizing may require additional dependencies
    nlp('a')  # warm-up call: the first invocation can otherwise crash sporadically
    tok_sents = []
    for sent in sents:
        sent = sent.strip()
        tokens = [token.text for token in nlp.make_doc(sent)] if sent != '' else []
        tok_sents.append(tokens)
    return tok_sents
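A quick usage sketch (assumes an English model such as en_core_web_sm is installed, or that the blank-language fallback kicks in):

print(spacy_tokenize(["Hello there, world!", ""], "en"))
# expected: [['Hello', 'there', ',', 'world', '!'], []]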
Example #5
    def predict_tokens(self, text: str, language: str = "en"):
        # Character offsets flagged as toxic, merged into contiguous (start, end) ranges.
        toxic_spans = contiguous_ranges(predict_spans(self.model, text))

        if language == "en":
            nlp = English()

        else:
            nlp = MultiLanguage()

        tokenizer = nlp.Defaults.create_tokenizer(nlp)  # spaCy v2-style tokenizer construction
        tokens = tokenizer(text)
        output_tokens = []
        for token in tokens:
            is_toxic = False
            for toxic_span in toxic_spans:
                if toxic_span[0] <= token.idx <= toxic_span[1]:
                    is_toxic = True
                    break

            predicted_token = PredictedToken(token.text, is_toxic)
            output_tokens.append(predicted_token)

        return output_tokens
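contiguous_ranges is not included in the snippet; a minimal sketch matching its use above (hypothetical reconstruction, collapsing sorted character offsets into inclusive (start, end) ranges):

def contiguous_ranges(offsets):
    # e.g. [3, 4, 5, 9] -> [(3, 5), (9, 9)]
    ranges = []
    for offset in sorted(offsets):
        if ranges and offset == ranges[-1][1] + 1:
            ranges[-1] = (ranges[-1][0], offset)  # extend the current range
        else:
            ranges.append((offset, offset))       # start a new range
    return ranges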
Example #6
from collections import defaultdict, Counter
import pymorphy2
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score
from spacy.lang.xx import MultiLanguage
import csv
import pandas as pd
from sklearn.model_selection import train_test_split

nlp = MultiLanguage()
morph = pymorphy2.MorphAnalyzer()
tokenizer = nlp.Defaults.create_tokenizer(nlp)  # reuse nlp instead of building a second MultiLanguage
stop_words = set(stopwords.words('russian'))
POSITIVE = "Переваги:\xa0"  # Ukrainian for "Advantages:", followed by a non-breaking space
NEGATIVE = "Недоліки:\xa0"  # Ukrainian for "Disadvantages:", followed by a non-breaking space
FEEDBACK_FILE = "scrape_feedbacks.csv"
vocabulary_pr = {}


def get_data(url, counter, cls):
    url = url.format(counter)
    resp = requests.get(url)
    page = re.compile(r"page=(\d+)/").search(resp.url)
    if page:
        curr_page = int(page.groups()[0])
Example #7
# python -c "import spacy; print (spacy.__version__)"
# python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))"
# python -c "import os; import en_core_web_sm; print(os.path.dirname(en_core_web_sm.__file__))"

import os
import spacy
NLP = spacy.load('en')  # load model with shortcut link "en"
NLP = spacy.load('en_core_web_sm')  # load model package "en_core_web_sm"
# NLP = spacy.load('/home/wangdi498/testing/en_core_web_sm')	# load package from a directory

import en_core_web_sm
NLP = en_core_web_sm.load()

# standard import:
from spacy.lang.xx import MultiLanguage
nlp = MultiLanguage()

# lazy import:
from spacy.util import get_lang_class
nlp = get_lang_class('xx')()  # get_lang_class returns the Language subclass; instantiate it


def Tokenization():
    print("\nThe outcomes of Tokenization are:")
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u"Apple isn't looking at buying U.S.A. startup for $1 billion.")
    for token in doc:
        print('\t', token.text)


def Tagging():
Example #8
from spacy.lang.xx import MultiLanguage


def split_spacy_m(text):
    # Language-agnostic pipeline with just a sentencizer, for sentence splitting.
    nlp_m = MultiLanguage()
    nlp_m.add_pipe(nlp_m.create_pipe('sentencizer'))  # spaCy v2-style pipe creation
    return prepare_spacy(text, nlp_m)
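prepare_spacy is defined elsewhere in the source project; a minimal sketch of a compatible helper, assuming it returns the sentence texts (hypothetical reconstruction):

def prepare_spacy(text, nlp):
    # Run the pipeline and return one string per detected sentence.
    doc = nlp(text)
    return [sent.text for sent in doc.sents]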