Example #1
import pytest

from spacy.lang.xx import MultiLanguage


def test_doc_noun_chunks_not_implemented():
    """Test that a language without a noun_chunks iterator raises NotImplementedError."""
    text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
    nlp = MultiLanguage()
    doc = nlp(text)
    with pytest.raises(NotImplementedError):
        _ = list(doc.noun_chunks)  # noqa: F841
Example #2
import spacy
from spacy.language import Language
from spacy.lang.xx import MultiLanguage


def getModel(language: str,
             language_models: dict = LANGUAGE_MODELS) -> Language:
    # LANGUAGE_MODELS maps language codes to installed spaCy model names (defined elsewhere).
    if language not in language_models:
        print("Language not supported. Running on MultiLanguage.")
        return MultiLanguage()

    return spacy.load(language_models[language], disable=["parser"])
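A minimal usage sketch; the LANGUAGE_MODELS mapping below is a hypothetical stand-in for the project's real dict:

# Hypothetical mapping, for illustration only.
LANGUAGE_MODELS = {
    "en": "en_core_web_sm",
    "de": "de_core_news_sm",
}

nlp = getModel("en")       # loads en_core_web_sm with the parser disabled
fallback = getModel("cs")  # not in the mapping -> falls back to MultiLanguage()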
Example #3
from spacy.lang.ar import Arabic
from spacy.lang.da import Danish
from spacy.lang.el import Greek
from spacy.lang.en import English
from spacy.lang.xx import MultiLanguage


def predict_spans(model, text: str, language: str = "en"):
    if language == "en":
        nlp = English()

    elif language == "el":
        nlp = Greek()

    elif language == "da":
        nlp = Danish()

    elif language == "ar":
        nlp = Arabic()

    else:
        nlp = MultiLanguage()

    tokenizer = nlp.tokenizer
    tokens = tokenizer(text)
    sentences = []
    tokenised_text = []
    cleaned_tokens = []
    cleaned_index = 0
    # Keep every token, but only non-whitespace tokens advance the "clean"
    # index that lines up with the model's per-token predictions.
    for token in tokens:
        if not token.text.isspace():
            tokenised_text.append(token.text)
            indexed_token = IndexedToken(token, cleaned_index)
            cleaned_tokens.append(indexed_token)
            cleaned_index += 1
        else:
            indexed_token = IndexedToken(token, token.i)
            cleaned_tokens.append(indexed_token)

    sentences.append(tokenised_text)

    predictions, raw_outputs = model.predict(sentences, split_on_space=False)
    span_predictions = []
    sentence_prediction = predictions[0]

    for cleaned_token in cleaned_tokens:

        # The model may emit fewer predictions than there are tokens.
        if cleaned_token.clean_index >= len(sentence_prediction):
            break

        # Whitespace tokens were never sent to the model; skip them.
        if cleaned_token.token.text.isspace():
            continue

        word_prediction = sentence_prediction[cleaned_token.clean_index]
        toxicness = word_prediction[cleaned_token.token.text]
        if toxicness == "TOXIC":
            location = cleaned_token.token.idx
            # If this token starts two characters after the last marked offset,
            # also mark the single gap character (usually a space) so adjacent
            # toxic words form one contiguous span.
            if len(span_predictions) > 0:
                last_index = span_predictions[-1]
                if location == last_index + 2:
                    span_predictions.append(location - 1)
            # Mark every character offset covered by this token.
            length = len(cleaned_token.token.text)
            for i in range(length):
                span_predictions.append(location + i)
    return span_predictions
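IndexedToken is not included in the snippet; here is a minimal sketch consistent with how it is used above (a hypothetical reconstruction, not the project's actual definition):

from typing import NamedTuple

from spacy.tokens import Token


class IndexedToken(NamedTuple):
    # A spaCy token paired with its index into the whitespace-stripped token list.
    token: Token
    clean_index: int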
Example #4
from typing import List


def spacy_tokenize(sents: List[str], lang: str) -> List[List[str]]:
    import spacy
    try:
        nlp = spacy.load(lang.split('-')[0])
    except OSError:
        try:
            cls = spacy.util.get_lang_class(lang.split('-')[0])
            nlp = cls()
        except ImportError:
            utils.Logging.warn(f"spaCy does not support language \"{lang}\", falling back to default model")
            from spacy.lang.xx import MultiLanguage
            nlp = MultiLanguage()
        # tokenizing may require additional dependencies
    nlp('a')  # warm-up call: the first invocation can otherwise crash sporadically
    tok_sents = []
    for sent in sents:
        sent = sent.strip()
        tokens = [token.text for token in nlp.make_doc(sent)] if sent != '' else []
        tok_sents.append(tokens)
    return tok_sents
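A quick usage sketch (assumes an English model such as en_core_web_sm is installed, or that the blank-language fallback kicks in):

print(spacy_tokenize(["Hello there, world!", ""], "en"))
# expected: [['Hello', 'there', ',', 'world', '!'], []]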
Example #5
    def predict_tokens(self, text: str, language: str = "en"):
        # Character offsets flagged as toxic, merged into contiguous (start, end) ranges.
        toxic_spans = contiguous_ranges(predict_spans(self.model, text))

        if language == "en":
            nlp = English()

        else:
            nlp = MultiLanguage()

        tokenizer = nlp.Defaults.create_tokenizer(nlp)  # spaCy v2-style tokenizer construction
        tokens = tokenizer(text)
        output_tokens = []
        for token in tokens:
            is_toxic = False
            for toxic_span in toxic_spans:
                if toxic_span[0] <= token.idx <= toxic_span[1]:
                    is_toxic = True
                    break

            predicted_token = PredictedToken(token.text, is_toxic)
            output_tokens.append(predicted_token)

        return output_tokens
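contiguous_ranges is not included in the snippet; a minimal sketch matching its use above (hypothetical reconstruction, collapsing sorted character offsets into inclusive (start, end) ranges):

def contiguous_ranges(offsets):
    # e.g. [3, 4, 5, 9] -> [(3, 5), (9, 9)]
    ranges = []
    for offset in sorted(offsets):
        if ranges and offset == ranges[-1][1] + 1:
            ranges[-1] = (ranges[-1][0], offset)  # extend the current range
        else:
            ranges.append((offset, offset))       # start a new range
    return ranges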
Example #6
from collections import defaultdict, Counter
import pymorphy2
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score
from spacy.lang.xx import MultiLanguage
import csv
import pandas as pd
from sklearn.model_selection import train_test_split

nlp = MultiLanguage()
morph = pymorphy2.MorphAnalyzer()
tokenizer = nlp.Defaults.create_tokenizer(nlp)  # reuse nlp instead of building a second MultiLanguage
stop_words = set(stopwords.words('russian'))
POSITIVE = "Переваги:\xa0"  # Ukrainian for "Advantages:", followed by a non-breaking space
NEGATIVE = "Недоліки:\xa0"  # Ukrainian for "Disadvantages:", followed by a non-breaking space
FEEDBACK_FILE = "scrape_feedbacks.csv"
vocabulary_pr = {}


def get_data(url, counter, cls):
    url = url.format(counter)
    resp = requests.get(url)
    page = re.compile(r"page=(\d+)/").search(resp.url)
    if page:
        curr_page = int(page.groups()[0])
Example #7
# python -c "import spacy; print (spacy.__version__)"
# python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))"
# python -c "import os; import en_core_web_sm; print(os.path.dirname(en_core_web_sm.__file__))"

import os
import spacy
NLP = spacy.load('en')  # load model with shortcut link "en"
NLP = spacy.load('en_core_web_sm')  # load model package "en_core_web_sm"
# NLP = spacy.load('/home/wangdi498/testing/en_core_web_sm')	# load package from a directory

import en_core_web_sm
NLP = en_core_web_sm.load()

# standard import:
from spacy.lang.xx import MultiLanguage
nlp = MultiLanguage()

# lazy import:
from spacy.util import get_lang_class
nlp = get_lang_class('xx')()  # get_lang_class returns the Language subclass; instantiate it


def Tokenization():
    print("\nThe outcomes of Tokenization are:")
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u"Apple isn't looking at buying U.S.A. startup for $1 billion.")
    for token in doc:
        print('\t', token.text)


def Tagging():
Example #8
from spacy.lang.xx import MultiLanguage


def split_spacy_m(text):
    # Language-agnostic pipeline with just a sentencizer, for sentence splitting.
    nlp_m = MultiLanguage()
    nlp_m.add_pipe(nlp_m.create_pipe('sentencizer'))  # spaCy v2-style pipe creation
    return prepare_spacy(text, nlp_m)
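prepare_spacy is defined elsewhere in the source project; a minimal sketch of a compatible helper, assuming it returns the sentence texts (hypothetical reconstruction):

def prepare_spacy(text, nlp):
    # Run the pipeline and return one string per detected sentence.
    doc = nlp(text)
    return [sent.text for sent in doc.sents]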