Example #1
    def setUpClass(self):
        self.nlp = spacy.load('de')
        iwnlp = spaCyIWNLP(
            lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json',
            use_plain_lemmatization=True,
            ignore_case=True)
        self.nlp.add_pipe(iwnlp)
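Once the component is added, every token exposes its IWNLP lemma candidates through the `_.iwnlp_lemmas` extension (a list of candidates, or None when IWNLP has no entry). A minimal standalone sketch of querying it; the sentence is illustrative and borrowed from a later example:

import spacy
from spacy_iwnlp import spaCyIWNLP

nlp = spacy.load('de')
nlp.add_pipe(spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json'))

doc = nlp('Wir mögen jene Fußballspiele mit jenen Verlängerungen.')
for token in doc:
    # _.iwnlp_lemmas is a list of lemma candidates, or None if IWNLP has no entry
    print(token.text, token._.iwnlp_lemmas)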
Example #2
def spacyTest(sentence):
    import webbrowser, os
    import spacy
    import pandas as pd
    from spacy_iwnlp import spaCyIWNLP
    nlp = spacy.load('de')
    iwnlp = spaCyIWNLP(lemmatizer_path='.lib/IWNLP.Lemmatizer_20170501.json')
    nlp.add_pipe(iwnlp)
    doc = nlp(sentence)

    nlptags = ['LEMMA', 'POS', 'TAG', 'DEP']
    words = [token.text for token in doc]
    results = [[token._.iwnlp_lemmas, token.pos_, token.tag_, token.dep_]
               for token in doc]

    df = pd.DataFrame(results, index=words, columns=nlptags)
    html = df.to_html()
    filename = "spacy_dep_parse.html"
    with open(filename, 'w') as f:
        f.write("<!DOCTYPE html>\n")
        f.write("<html>\n")
        f.write("<head>\n")
        f.write("<title>Spacy Dependency Parse</title>\n")
        f.write("</head>\n")
        f.write("<body>\n")
        f.write(html)
        f.write("\n")
        f.write("</body>\n")
        f.write("</html>\n")
    webbrowser.open('file://' + os.path.realpath(filename))
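A hypothetical invocation (assuming the lemmatizer JSON exists at the `.lib/` path used above); it builds the token table and opens the generated HTML file in the default browser:

spacyTest('Wir mögen jene Fußballspiele mit jenen Verlängerungen.')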
Example #3
    def __init__(self):
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)
        self.logger.debug('Loading Spacy model')
        self.nlp = spacy.load('de')
        self.nlp.add_pipe(
            spaCyIWNLP(
                lemmatizer_path='data/IWNLP/IWNLP.Lemmatizer_20170501.json'))
        self.nlp.add_pipe(spaCySentiWS(sentiws_path='data/sentiws/'))
        self.logger.debug('Spacy loaded')
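This pipeline attaches both IWNLP lemmas and SentiWS polarity scores. A sketch of reading them back, where `analyzer` is a hypothetical instance of the class this __init__ belongs to and `_.sentiws` is the token extension registered by spaCySentiWS:

# 'analyzer' is a hypothetical instance of the class the __init__ above belongs to
doc = analyzer.nlp('Die Verlängerung war schlecht.')
for token in doc:
    # _.iwnlp_lemmas: lemma candidates or None; _.sentiws: polarity score or None
    print(token.text, token._.iwnlp_lemmas, token._.sentiws)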
Example #4
    def __init__(self, language):
        if language not in ("en", "de"):
            raise ValueError("Language not supported")
        self.language = language

        config = configparser.ConfigParser()
        config.read("config.ini")
        self.threads = int(config.get("analysis", "threads"))

        if self.language == "en":
            # pip install https://blackstone-model.s3-eu-west-1.amazonaws.com/en_blackstone_proto-0.0.1.tar.gz
            # Use Blackstone model which has been trained on english legal texts (https://github.com/ICLRandD/Blackstone)
            self.nlp = textacy.load_spacy_lang("en_blackstone_proto", disable=("textcat"))
            if not ("sentence_segmenter" or "CompoundCases") in self.nlp.pipe_names:
                # Use a custom sentence segmenter for better tokenization
                sentence_segmenter = SentenceSegmenter(self.nlp.vocab, CONCEPT_PATTERNS)
                self.nlp.add_pipe(sentence_segmenter, before="parser")
                # https://github.com/ICLRandD/Blackstone#compound-case-reference-detection
                compound_pipe = CompoundCases(self.nlp)
                self.nlp.add_pipe(compound_pipe)
            else:
                print("Please only instantiate this class only once per language.")
            stanza.download("en", processors="tokenize, sentiment", logging_level="WARN")
            self.stanza_nlp = stanza.Pipeline(lang="en", processors="tokenize, sentiment",
                                              tokenize_pretokenized=True, logging_level="WARN")
        else:
            # python -m spacy download de_core_news_md
            self.nlp = textacy.load_spacy_lang("de_core_news_md", disable=("textcat"))
            # Textacy caches loaded pipeline components. So do not add them again if they are already present.
            if not ("sentence_segmenter" or "spacyiwnlp") in self.nlp.pipe_names:
                iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20181001.json', ignore_case=True)
                self.nlp.add_pipe(iwnlp)
                sentence_segmenter = SentenceSegmenter(self.nlp.vocab, CONCEPT_PATTERNS)
                self.nlp.add_pipe(sentence_segmenter, before="parser")
            else:
                print("Please only instantiate this class only once per language.")
            stanza.download("de", processors="tokenize, sentiment", logging_level="WARN")
            self.stanza_nlp = stanza.Pipeline(lang="de", processors="tokenize, sentiment",
                                              tokenize_pretokenized=True, logging_level="WARN")

        self.corpus = None
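The constructor also expects a config.ini with an [analysis] section providing a threads value. A hedged usage sketch; the class name `Analyzer` is hypothetical:

# config.ini expected next to the script (only the keys the constructor reads):
#   [analysis]
#   threads = 4

analyzer = Analyzer("de")  # 'Analyzer' is a hypothetical name for the class above
print(analyzer.language, analyzer.threads)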
Example #5
import sys
from itertools import zip_longest
from pathlib import Path
from http.server import *
from socketserver import *
from subprocess import *
import urllib.parse
import json

import spacy
from germalemma import GermaLemma
from spacy_iwnlp import spaCyIWNLP

# setup IWNLP
nlp = spacy.load("de", disable=["parser", "ner"])
iwnlp = spaCyIWNLP(lemmatizer_path="IWNLP.Lemmatizer_20181001.json")
nlp.add_pipe(iwnlp)

# setup GermaLemma
lemmatizer = GermaLemma()


def escape_text(text):
    return text.replace("\n", "\\n")


def unescape_text(text):
    return text.replace("\\n", "\n")


def replace_with_lemma(token_text, iwnlp_lemmas, pos):
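# NOTE: the example is truncated at the signature above. Since the script loads
# both spaCy-IWNLP and GermaLemma, a speculative sketch of such a helper could
# prefer the IWNLP candidate and fall back to GermaLemma. This body is an
# assumption, not the original code.
def replace_with_lemma_sketch(token_text, iwnlp_lemmas, pos):
    # assumption: take the first IWNLP candidate when one exists
    if iwnlp_lemmas:
        return iwnlp_lemmas[0]
    # assumption: otherwise fall back to GermaLemma, which only covers
    # noun/verb/adjective/adverb tags; find_lemma raises ValueError for the rest
    try:
        return lemmatizer.find_lemma(token_text, pos)
    except ValueError:
        return token_text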
Example #6
import pandas as pd
from string import punctuation
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('german')
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
import spacy
from spacy_iwnlp import spaCyIWNLP
nlp = spacy.load('de')
iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20181001.json')
nlp.add_pipe(iwnlp)
stemmer = SnowballStemmer('german')


def strip_punct(series):
    '''Strip punctuation from a series of strings
    Arguments: series - a series containing strings
    Return: new_series - a series of strings without punctuation'''
    new_series = series.str.replace(r'[^\w\s]+', '', regex=True)
    return new_series
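# Quick illustration (values are illustrative): Python's re module is
# Unicode-aware, so the [^\w\s]+ pattern strips punctuation while keeping
# umlauts and ß intact.
_punct_demo = pd.Series(['Schöne Grüße!!!', 'Fußball, bitte.'])
print(strip_punct(_punct_demo).tolist())  # ['Schöne Grüße', 'Fußball bitte']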


def strip_stopwords(series, stopwords=stopwords):
    '''Strip stopwords from a series of strings
    Arguments: series - a series containing strings, stopwords - a list of stopwords (default: German)
    Return: new_series - a series of strings without stopwords'''
    series = series.copy()
Example #7
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('german')
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
import spacy
from spacy_iwnlp import spaCyIWNLP
# You need to download the 'de' model in advance. This is done automatically
# if you run `make setup` via the Makefile found in the main folder of the repo.
nlp = spacy.load('de')
from pathlib import Path
path_here = Path(__file__).resolve().parent
iwnlp = spaCyIWNLP(lemmatizer_path=path_here /
                   "../data/IWNLP.Lemmatizer_20181001.json")
nlp.add_pipe(iwnlp)
stemmer = SnowballStemmer('german')

from utils import cleaning


def strip_stopwords(series, stopwords=stopwords):
    '''Strip stopwords from a series of strings
    Arguments: series - a series containing strings, stopwords - a list of stopwords (default: German)
    Return: new_series - a series of strings without stopwords'''
    series = series.copy()
    new_series = series.apply(lambda x: " ".join([
        word.lower() for word in x.split() if word.lower() not in (stopwords)
    ]) if x is not None else x)
    return new_series
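A short illustration of strip_stopwords with the German stopword list loaded above; the input values are illustrative. Note that the function also lowercases every remaining word and passes None entries through unchanged:

import pandas as pd

demo = pd.Series(['Das ist ein guter Tag', None])
print(strip_stopwords(demo).tolist())
# common stopwords such as 'das', 'ist' and 'ein' are dropped,
# the remaining words are lowercased, and None passes through unchanged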
Example #8
    def setUpClass(self):
        self.nlp = spacy.load('de')
        iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')
        self.nlp.add_pipe(iwnlp)
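A sketch of a test method that could follow this setup; the sentence and the None-or-list expectation are assumptions based on how the other examples use the extension, not taken from this test suite:

    def test_iwnlp_lemmas_extension(self):
        doc = self.nlp('Wir mögen Fußballspiele')
        for token in doc:
            # the component attaches a list of lemma candidates, or None
            self.assertTrue(token._.iwnlp_lemmas is None
                            or isinstance(token._.iwnlp_lemmas, list))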
Example #9
import spacy
from spacy_iwnlp import spaCyIWNLP

nlp = spacy.load('de')
iwnlp = spaCyIWNLP(lemmatizer_path='case_dict/IWNLP.Lemmatizer_20181001.json')
nlp.add_pipe(iwnlp)

#doc = nlp("Wir mögen jene Fußballspiele mit jenen Verlängerungen, welche bei diesem Wetter stattfinden.")

#for token in doc:
#    print('POS: {}\tIWNLP:{}'.format(token.pos_, token._.iwnlp_lemmas))


def lemmatize_adjective(token):
    lem = token._.iwnlp_lemmas
    #print("token:", token, "spacy: ", token.lemma_, "IWNLP: ", lem)
    #if token.text.endswith('ete') or token.text.endswith('eter') or token.text.endswith('eten') or token.text.endswith('etem') or token.text.endswith('etes'):
        #print("?????", token)

    if lem:
        lemmatized_adjective = lem[0]
    else:
        lemmatized_adjective = token.lemma_  # fallback strategy: use the spaCy lemmatizer

    if lemmatized_adjective.endswith('e'): #TODO is this necessary?
        lemmatized_adjective = lemmatized_adjective[:-1]

    if lemmatized_adjective.endswith('en'): # prevent sth like verbündeter -> verbündener
        #print('###########', lemmatized_adjective)
        if lemmatized_adjective not in token.text:
            lemmatized_adjective = lemmatized_adjective[:-1] + 't'