Example #1
import re

from nltk.stem.snowball import ItalianStemmer
from spacy.lang.it import Italian
from spacy.lang.it.stop_words import STOP_WORDS

# Assumption: the original module defines `it_stopwords` elsewhere; spaCy's
# Italian stop-word list is used here as a stand-in.
it_stopwords = STOP_WORDS
i = 0  # module-level counter read and updated by preprocess() via `global i`


def preprocess(text, NUM_DOCS, num_preprocessed, stemming):
    global i
    if i == 0:
        i = num_preprocessed
    i += 1
    result = []
    stemmer = ItalianStemmer()
    if i % 20 == 0:
        print(f"\t{i} out of {NUM_DOCS+num_preprocessed} documents preprocessed")
    nlp = Italian()
    t0 = text.split("Lingua processuale")[0].split("Sentenza")[-1]
    t1 = "".join(t0)
    t1 =  re.sub(r"’|'|«|»|\d{1,4}\/\d{1,4}\/(cee|ce)|\d+|---\|*|^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$", " ", t1, flags=re.IGNORECASE)
    # print(t1)
    doc = nlp(t1)
    for token in doc:
        if token.text.lower() not in it_stopwords and not (token.is_punct or token.is_space) and len(token) > 3:
            assert token.lang_ == "it"
            if stemming:
                result.append(stemmer.stem(word=token.text))
            else:
                result.append(token.lemma_.lower())
            if "'" in result[-1] or "’" in result[-1]:
                raise Exception(f"Detected_ {token.lemma_}")
    return result
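
The function above is presumably called once per raw document; a minimal usage sketch (the input texts here are illustrative, not from the source):

docs = ["Sentenza ... Lingua processuale: italiano", "Sentenza ... Lingua processuale: italiano"]
preprocessed = [
    preprocess(d, NUM_DOCS=len(docs), num_preprocessed=0, stemming=True)
    for d in docs
]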
Example #2
    def init_resources(self):
        self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
        self.stemmer = None
        stopwords_path = os.path.join(
            os.path.dirname(assistant_dialog_skill_analysis.__file__),
            "resources",
            self.language_code,
            "stopwords",
        )
        if self.language_code == "en":
            from spacy.lang.en import English

            self.tokenizer = Tokenizer(English().vocab)
            self.stemmer = SnowballStemmer(language="english")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "fr":
            from spacy.lang.fr import French

            self.tokenizer = Tokenizer(French().vocab)
            self.stemmer = SnowballStemmer(language="french")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "de":
            from spacy.lang.de import German

            self.tokenizer = Tokenizer(German().vocab)
            self.stemmer = SnowballStemmer(language="german")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "it":
            from spacy.lang.it import Italian

            self.tokenizer = Tokenizer(Italian().vocab)
            self.stemmer = SnowballStemmer(language="italian")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "cs":
            from spacy.lang.cs import Czech

            self.tokenizer = Tokenizer(Czech().vocab)
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "pt":
            from spacy.lang.pt import Portuguese

            self.tokenizer = Tokenizer(Portuguese().vocab)
            self.stemmer = SnowballStemmer(language="portuguese")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "es":
            from spacy.lang.es import Spanish

            self.tokenizer = Tokenizer(Spanish().vocab)
            self.stemmer = SnowballStemmer(language="spanish")
            self.stop_words = self.load_stop_words(stopwords_path)
        else:
            raise Exception("language code %s is not supported",
                            self.language_code)
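
For reference, the tokenizer/stemmer pair built in one of these branches can be exercised on its own; a minimal sketch for the Italian case (standalone, so load_stop_words() is skipped), under the assumption that spaCy's Tokenizer and NLTK's SnowballStemmer are the classes used above:

from nltk.stem.snowball import SnowballStemmer
from spacy.lang.it import Italian
from spacy.tokenizer import Tokenizer

tokenizer = Tokenizer(Italian().vocab)
stemmer = SnowballStemmer(language="italian")

doc = tokenizer("Le sentenze della Corte sono pubblicate online")
print([stemmer.stem(t.text.lower()) for t in doc])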
Example #3
    def get_tokenizers(self, lang):
        os.environ['TOKENIZERS_PARALLELISM'] = "True"
        if lang == 'de':
            spacy = German()
            bert = "deepset/gbert-base"
        elif lang == 'fr':
            spacy = French()
            bert = "camembert/camembert-base-ccnet"
        elif lang == 'it':
            spacy = Italian()
            bert = "dbmdz/bert-base-italian-cased"
        else:
            raise ValueError(
                f"Please choose one of the following languages: {self.languages}"
            )
        return spacy.tokenizer, AutoTokenizer.from_pretrained(bert)
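
The method pairs a spaCy tokenizer with the matching Hugging Face tokenizer; a standalone sketch of the Italian branch (outside the class, model name taken from the code above):

from spacy.lang.it import Italian
from transformers import AutoTokenizer

spacy_tok = Italian().tokenizer
bert_tok = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")

words = [t.text for t in spacy_tok("La Corte ha respinto il ricorso.")]
pieces = bert_tok.tokenize("La Corte ha respinto il ricorso.")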
Example #4
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        file = "\config_files\config_spacy_en.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        file = "\config_files\config_spacy_de.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        file = "\config_files\config_spacy_es.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        file = "\config_files\config_spacy_pt.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        file = "\config_files\config_spacy_fr.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        file = "\config_files\config_spacy_it.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        file = "\config_files\config_spacy_nl.yaml"
        configfile_path = os.getcwd() + file
    else:
        raise ValueError(f"Unsupported language code: {language}")

    return parser, STOP_WORDS, configfile_path
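
A minimal sketch of how lang_change() might be called, assuming the config_files directory exists under the current working directory:

parser, stop_words, config_path = lang_change('it')
doc = parser("Questa è una frase di prova.")
tokens = [t.text for t in doc if t.text.lower() not in stop_words]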
Example #5
        text = word_tokenize(open("data/converted/" + doc,
                                  "r",
                                  encoding='utf-8').read(),
                             language='italian')
        average += len(text)

print(average / i, i)

av1 = 0
av2 = 0
i = 0
for dic in os.listdir("data/.preprocessed"):
    year = pickle.load(open("data/.preprocessed/" + dic, "rb"))
    i += len(year)
    print(len(year))
    for doc in year:
        if dic[0] == 's':
            av1 += len(doc)
        else:
            av2 += len(doc)

print(av1 / i, av2 / i, i)

nlp = Italian()

doc = nlp(open("data/converted/61999CJ0001.txt", "r", encoding='utf-8').read())
tok = []
for t in doc:
    tok.append(t.lemma_.lower())

dictionary = Dictionary([tok])
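
The gensim Dictionary built from the lemmas can then map the document to a bag-of-words representation; a brief sketch:

bow = dictionary.doc2bow(tok)      # list of (token_id, count) pairs
print(len(dictionary), bow[:10])   # vocabulary size and the first few entries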
Example #6
def test_issue2179():
    """Test that spurious 'extra_labels' aren't created when initializing NER."""
    nlp = Italian()
    ner = nlp.create_pipe("ner")
    ner.add_label("CITIZENSHIP")
    nlp.add_pipe(ner)
    nlp.begin_training()
    nlp2 = Italian()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    nlp2.from_bytes(nlp.to_bytes())
    assert "extra_labels" not in nlp2.get_pipe("ner").cfg
    assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP", )
Example #7
def test_issue2482():
    """Test we can serialize and deserialize a blank NER or parser model."""
    nlp = Italian()
    nlp.add_pipe(nlp.create_pipe("ner"))
    b = nlp.to_bytes()
    Italian().from_bytes(b)
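
The same round trip also works through the file system with spaCy's to_disk/from_disk; a brief sketch using the same v2-style API as the test above:

import tempfile

nlp = Italian()
nlp.add_pipe(nlp.create_pipe("ner"))
with tempfile.TemporaryDirectory() as tmp:
    nlp.to_disk(tmp)
    nlp2 = Italian()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    nlp2.from_disk(tmp)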
Example #8
import random
import string

import xxhash
from datasketch import MinHash, MinHashLSH

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from spacy.lang.en import English
from spacy.lang.it import Italian

import whoosh.index as index
from tqdm import tqdm
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser, OrGroup

from semantic_sim import SimServer

tokenize_it = Italian().Defaults.create_tokenizer()
tokenize_en = English().Defaults.create_tokenizer()
wnl = WordNetLemmatizer()
punct = string.punctuation.replace('.', '').replace(',', '')


def to_shingles(doc, k=5):
    shingles = set()
    doc_string = doc.lower()
    if len(doc_string) <= k:
        doc_string = doc + 'no_txt_' + str(xxhash.xxh64(str(random.random())).hexdigest())
    for i in range(len(doc_string) - k + 1):
        h = doc_string[i:i+k]
        shingles.add(h.encode('utf8'))
    return list(shingles)
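
Given the MinHash and MinHashLSH imports above, the shingles are presumably fed into a locality-sensitive hashing index for near-duplicate detection; a minimal sketch of that pattern (threshold and num_perm are illustrative):

lsh = MinHashLSH(threshold=0.8, num_perm=128)

def minhash_of(doc):
    m = MinHash(num_perm=128)
    for shingle in to_shingles(doc):
        m.update(shingle)
    return m

lsh.insert("doc-1", minhash_of("Testo della prima sentenza."))
print(lsh.query(minhash_of("Testo della prima sentenza.")))  # -> ['doc-1']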
Example #9
import sys
import string
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from spacy.lang.it.stop_words import STOP_WORDS
from spacy.lang.it import Italian
from sklearn import svm
from sklearn import neural_network
from sklearn import metrics

punctuations = string.punctuation
nlp = spacy.load("it_core_news_sm")
stop_words = spacy.lang.it.stop_words.STOP_WORDS
parser = Italian()


# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning text; clean_text() is defined elsewhere in the original script
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
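
Given the imports above, these pieces are presumably wired into a scikit-learn Pipeline; a minimal sketch (the spacy_tokenizer helper below is illustrative, not from the source, and clean_text() must be defined for the cleaner step to run):

def spacy_tokenizer(sentence):
    # tokenize with the blank Italian pipeline, drop stop words and punctuation characters
    return [
        tok.text.lower() for tok in parser(sentence)
        if tok.text.lower() not in stop_words and tok.text not in punctuations
    ]

pipe = Pipeline([
    ("cleaner", predictors()),
    ("vectorizer", TfidfVectorizer(tokenizer=spacy_tokenizer)),
    ("classifier", svm.LinearSVC()),
])
# pipe.fit(train_texts, train_labels); pipe.predict(test_texts)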

Example #10
def test_issue2179():
    """Test that spurious 'extra_labels' aren't created when initializing NER."""
    nlp = Italian()
    ner = nlp.add_pipe("ner")
    ner.add_label("CITIZENSHIP")
    nlp.initialize()
    nlp2 = Italian()
    nlp2.add_pipe("ner")
    assert len(nlp2.get_pipe("ner").labels) == 0
    model = nlp2.get_pipe("ner").model
    model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves)
    nlp2.from_bytes(nlp.to_bytes())
    assert "extra_labels" not in nlp2.get_pipe("ner").cfg
    assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP", )
Example #11
        elif lang == "nl":
            return set(
                get_stop_words("nl") + stopwords.words('dutch') + STOP_LIST_NL)
    except Exception:
        print("warning: no stopwords were downloaded. check nltk corpora")
        print(format_exc())
        return set()


# load resources (English, French, Dutch and Italian come from spacy.lang.*;
# their imports are not shown in this truncated snippet)
_stop_words = load_stoplist()
print("Loading spacy model...")
_spacy = English()
_spacy_fr = French()
_spacy_nl = Dutch()
_spacy_it = Italian()


def get_stoplist():
    return _stop_words


def lemmatize(text, lowercase=True, lang="en"):
    """ Return lemmatized text """

    if lang == "en":
        tokens = _spacy(text)
    elif lang == "fr":
        tokens = _spacy_fr(text)
    elif lang == "nl":
        tokens = _spacy_nl(text)
Example #12
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.it import Italian
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}
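
A quick sketch of how the dictionary might be used to pick a blank pipeline and tokenize a sentence (the example sentence is illustrative):

nlp_it = language_dict['italian']
print([t.text for t in nlp_it("Questa è una frase di prova.")])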


class Tokenizer:
    def __init__(self,
                 language,
                 tokenizer_method='spacy',
                 remove_stopwords=True,
                 lowercase=True,