def tokenization(frase):
    """Tokenize an Italian question: drop stop words and numbers, lemmatize
    verbs and auxiliaries, and return the surviving tokens."""
    nlp = it_core_news_sm.load()
    stop_word = ['e', 'o', '', 'il', 'lo', 'la', 'un', 'uno', 'una', 'mia', 'mio',
                 'tuo', 'con', 'su', 'per', 'tra', 'fra', 'please', 'aiuto',
                 'urgente', 'help', ',', '.', '..', '...', '....', ':', ';', '!',
                 '(', ')', '-', ' ', '/', '"']
    frase = frase.lower()
    print("question = ", frase)
    sentence = nlp(frase)
    vec = []
    for tok in sentence:
        word = tok.text.lower()
        flag = True
        # drop stop words
        if word in stop_word:
            flag = False
        # drop numbers
        elif tok.pos == NUM:
            flag = False
        # lemmatize verbs and auxiliaries
        elif tok.pos == VERB or tok.pos == AUX:
            word = tok.lemma_
        if flag:
            vec.append(word)  # the (possibly lemmatized) token
    return vec
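# Usage sketch for tokenization(); the imports below are assumptions, since the
# excerpt above omits them (NUM, VERB and AUX live in spacy.symbols).
import it_core_news_sm
from spacy.symbols import NUM, VERB, AUX

print(tokenization("Ho perso la mia password, aiuto urgente!"))
# expected (model-dependent): ['avere', 'perdere', 'password']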
def spacy_analyze(fulltext, source_lang):
    """Use spaCy to analyze input text.

    Parameters:
        fulltext (string): text
        source_lang (string): language of the input text

    Returns:
        doc: the processed Doc object, or None on failure
    """
    doc = None
    if source_lang == 'fr':
        try:
            nlp = fr_core_news_sm.load(disable=['parser', 'ner'])
            doc = nlp(fulltext)
        except Exception:
            print(sys.exc_info()[0])
    elif source_lang == 'it':
        try:
            nlp = it_core_news_sm.load(disable=['parser', 'ner'])
            doc = nlp(fulltext)
        except Exception:
            print(sys.exc_info()[0])
    return doc
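# Example call (sketch): spacy_analyze returns a Doc, or None when the model
# fails to load or the language is unsupported.
doc = spacy_analyze("Questo è un esempio.", "it")
if doc is not None:
    print([(tok.text, tok.pos_) for tok in doc])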
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''Returns the spaCy nlp object corresponding to the language of a document.'''
    nlp = None
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
    return nlp  # None when the language is not supported
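# Example call (sketch); the supported-languages list is an assumption:
languages = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp = get_spacy_tokenizer("Italian", languages, bigmodel_required=False)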
def __init__(self):
    self.nlp = it_core_news_sm.load()
    emoji = Emoji(self.nlp)
    sentencizer = self.nlp.create_pipe("sentencizer")
    # Add components to the pipeline
    self.nlp.add_pipe(emoji, first=True)
    self.nlp.add_pipe(hashtag_pipe, first=True)
    self.nlp.add_pipe(sentencizer)
def get_dependency_tree(sentence, language="en"):
    if language == "it":
        nlp = it_core_news_sm.load()
    else:
        nlp = en_core_web_sm.load()
    doc = nlp(sentence)
    root = None
    for token in doc:
        if token.dep_ == "ROOT":
            root = token
    result = create_dictionary(root)
    return result
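# create_dictionary is not shown in the excerpt above; a hypothetical
# implementation that turns the root token into a nested dependency tree:
def create_dictionary(token):
    return {
        "text": token.text,
        "dep": token.dep_,
        "children": [create_dictionary(child) for child in token.children],
    }

tree = get_dependency_tree("Il gatto dorme sul divano.", language="it")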
def _nlp(spacy_module: str) -> Optional[NLP]:
    print("Loading spacy language model for '", spacy_module, "'")
    if spacy_module == 'en':
        nlp = en_core_web_sm.load()
    elif spacy_module == 'es':
        nlp = es_core_news_sm.load()
    elif spacy_module == 'de':
        nlp = de_core_news_sm.load()
    elif spacy_module == 'fr':
        nlp = fr_core_news_sm.load()
    elif spacy_module == 'it':
        nlp = it_core_news_sm.load()
    elif spacy_module == 'pt':
        nlp = pt_core_news_sm.load()
    else:
        raise ValueError(f'Unsupported language {spacy_module}')
    return nlp
def __init__(self, url):
    try:
        pattern = re.compile(
            r"^(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$"
        )
        if not pattern.match(url):
            print(f"{url} is not a valid url")
        self.url = url
        self.article = Article(self.url)
        self.article.download()
        self.article.parse()
        self.author = self.article.authors
        self.oneline = self.article.summary
        self.text = self.article.text.replace("\n", ".")
        # default to English when no language is detected for a cnn.com URL
        if self.article.meta_lang == 'en' or (
                self.article.meta_lang == '' and "cnn.com" in url):
            import en_core_web_sm
            self.model = en_core_web_sm.load()
        elif self.article.meta_lang == 'it':
            import it_core_news_sm
            self.model = it_core_news_sm.load()
        elif self.article.meta_lang == 'fr':
            import fr_core_news_sm
            self.model = fr_core_news_sm.load()
        elif self.article.meta_lang == 'es':
            import es_core_news_sm
            self.model = es_core_news_sm.load()
        elif self.article.meta_lang == 'pt':
            import pt_core_news_sm
            self.model = pt_core_news_sm.load()
        else:
            print(f"The {self.article.meta_lang} language is not supported")
        self.data = []
        self.vectorizer = TfidfVectorizer(strip_accents='unicode')
    except article.ArticleException:
        print(
            f"The url {url} is not supported, please write to [email protected] for further help"
        )
        self.valid = False
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other']:
        # Reload the model if the cached pipeline does not match
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(disable=nlp_disable)
            # Other languages (fall back to the English model)
            elif lang == 'other':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')  # 'sr' is the ISO 639-1 code for Serbian
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
def check_spacy_models(main, lang, pipeline):
    if lang == 'other':
        lang = 'eng'

    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines = ['sbd']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        # Reload the model if the cached pipeline does not match
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(disable=nlp_disable)

    if 'sbd' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sbd' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
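# Note on the two check_spacy_models variants above: both target spaCy 2.x,
# where components are built with nlp.create_pipe and the sentencizer used to
# register under the name 'sbd'. A sketch of the same guard on spaCy 3.x,
# where add_pipe takes the factory name directly:
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer')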
def wordEmbedding(data):
    # word2vec model trained on Italian Wikipedia embeddings
    model = Word2Vec.load('word2vec/common/word2vec/models/wiki_iter=5_algorithm=skipgram_window=10_size=300_neg-samples=10.m')
    nlp = it_core_news_sm.load()
    stop_word = ['e', 'o', '', 'il', 'lo', 'la', 'un', 'uno', 'una', 'mia', 'mio',
                 'tuo', 'con', 'su', 'per', 'tra', 'fra', 'please', 'aiuto',
                 'urgente', 'help', ',', '.', '..', '...', '....', ':', ';', '!',
                 '(', ')', '-', ' ', '/', '"']
    matrix3D = np.zeros((len(data), 50, 300))
    indexs = 0
    badToken = 0
    goodToken = 0
    nAns = 0
    for row in data:
        frase = row.lower()
        print("question = ", frase)
        sentence = nlp(frase)
        tokIndex = 0
        nAns += 1
        for tok in sentence:
            print([(tok.text, tok.pos_)])
            word = tok.text.lower()
            flag = True
            # drop stop words
            if word in stop_word:
                flag = False
            # drop numbers
            elif tok.pos == NUM:
                flag = False
            elif word == 'è':
                word = tok.lemma_
            if flag:
                if word in model.wv:
                    g_vec = model.wv[word]
                    goodToken += 1
                else:
                    # use a random word embedding for unknown words
                    badToken += 1
                    g_vec = model.wv[rn.choice(model.wv.index2entity)]
                # keep at most the first 50 tokens of each question
                if tokIndex < 50:
                    matrix3D[indexs][tokIndex] = g_vec
                    tokIndex += 1
        indexs += 1
    print('Tokens found:', goodToken)
    print('Tokens not found:', badToken)
    print('Vocabulary size:', len(model.wv.vocab))
    print('Questions processed:', nAns)
    print(matrix3D.shape)
    return matrix3D
from html.parser import HTMLParser  # assumed import, not shown in the excerpt
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer  # assumed import, not shown in the excerpt

import fr_core_news_sm
nlp_fr = fr_core_news_sm.load()
import en_core_web_sm
nlp_en = en_core_web_sm.load()
import de_core_news_sm
nlp_de = de_core_news_sm.load()
import es_core_news_sm
nlp_es = es_core_news_sm.load()
import it_core_news_sm
nlp_it = it_core_news_sm.load()
import pt_core_news_sm
nlp_pt = pt_core_news_sm.load()
import nl_core_news_sm
nlp_nl = nl_core_news_sm.load()

# global variables
wnl = WordNetLemmatizer()
html_parser = HTMLParser()
stopword_list = []
language = ""


def init_lib(lang):
    ...  # body not included in this excerpt
def lemmatize(text: str, nlp=it_core_news_sm.load()) -> str:
    """Convert words to their base form."""
    # The default argument loads the model once, at function definition time,
    # so repeated calls reuse the same pipeline.
    doc = nlp(text)
    return " ".join([word.lemma_ if word.lemma_ != "-PRON-" else word.lower_
                     for word in doc])
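# Example call (sketch); the exact lemmas depend on the model version:
print(lemmatize("Le ragazze hanno mangiato le mele"))
# roughly: "il ragazza avere mangiare il mela"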
def make_nlp():
    return it_core_news_sm.load()  # spacy.load('it_core_news_sm')
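# make_nlp reloads the model on every call; a cached variant (sketch) avoids
# the repeated startup cost:
import functools

@functools.lru_cache(maxsize=1)
def make_nlp_cached():
    return it_core_news_sm.load()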
from Preprocessing import *
import it_core_news_sm
import re

nlp = it_core_news_sm.load()


def tokenizer_FASTTEXT(doc):
    tokenize = []
    # match whitespace and punctuation to be discarded (compiled once per call)
    regex = re.compile(r'( +|\'|\-|\,|\!|\:|\;|\?|\.|\(|\)|\«|\»|\")')
    for x in doc:
        verse = nlp(x)
        new_verse = []
        for w in verse:
            if not regex.match(w.text):
                new_verse.append(w.text.casefold())
        tokenize.append(" ".join(new_verse))
    return tokenize


df_train = pd.concat([cv_text, dev_text])
train_emotion = np.concatenate([emotion, dev_emotion])

cv_tokenized = tokenizer_FASTTEXT(cv_text)
dev_tokenized = tokenizer_FASTTEXT(dev_text)
test_tokenized = tokenizer_FASTTEXT(test_text)
train_tokenized = tokenizer_FASTTEXT(df_train)

# prepare dataset for fasttext
from Preprocessing import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation as LDiA
from sklearn.preprocessing import StandardScaler
import it_core_news_sm
import re

sc = StandardScaler()
nlp = it_core_news_sm.load()  # the spaCy Italian tokenizer


def italian_tokenizer(verse):
    tokenized = []
    doc = nlp(verse)
    # match whitespace and punctuation to be discarded
    regex = re.compile(r'( +|\'|\-|\,|\!|\:|\;|\?|\.|\(|\)|\«|\»|\")')
    for w in doc:
        if not regex.match(w.text):
            tokenized.append(w.text.casefold())
    return tokenized


# Words - TF-IDF
vectorizer = TfidfVectorizer(tokenizer=italian_tokenizer)
cv_tfidf = vectorizer.fit_transform(cv_text).toarray()
cv_tfidf = pd.DataFrame(cv_tfidf)
dev_tfidf = vectorizer.transform(dev_text).toarray()
dev_tfidf = pd.DataFrame(dev_tfidf)

vectorizer2 = TfidfVectorizer(tokenizer=italian_tokenizer)
def wordEmbedding(data):
    cap_path = datapath('cc.it.300.bin')
    # load_facebook_vectors returns the KeyedVectors directly
    modelfast = gs.models.fasttext.load_facebook_vectors(cap_path)
    nlp = it_core_news_sm.load()
    stop_word = ['e', 'o', '', 'il', 'lo', 'la', 'un', 'uno', 'una', 'mia', 'mio',
                 'tuo', 'con', 'su', 'per', 'tra', 'fra', 'please', 'aiuto',
                 'urgente', 'help', ',', '.', '..', '...', '....', ':', ';', '!',
                 '(', ')', '-', ' ', '/', '"']
    matrix3D = np.zeros((len(data), 50, 300))
    indexs = 0
    badToken = 0
    goodToken = 0
    nAns = 0
    for row in data:
        frase = row.lower()
        print("question = ", frase)
        sentence = nlp(frase)
        tokIndex = 0
        nAns += 1
        for tok in sentence:
            print([(tok.text, tok.pos_)])
            word = tok.text.lower()
            flag = True
            # drop stop words
            if word in stop_word:
                flag = False
            # drop numbers
            elif tok.pos == NUM:
                flag = False
            elif word == 'è':
                word = tok.lemma_
            if flag:
                if word in modelfast:
                    # vector for each known token
                    g_vec = modelfast[word]
                    goodToken += 1
                else:
                    # use a random word embedding for unknown words
                    badToken += 1
                    g_vec = modelfast[rn.choice(modelfast.index2entity)]
                # keep at most the first 50 tokens of each question
                if tokIndex < 50:
                    matrix3D[indexs][tokIndex] = g_vec
                    tokIndex += 1
        indexs += 1
    print('Tokens found:', goodToken)
    print('Tokens not found:', badToken)
    print('Vocabulary size:', len(modelfast.vocab))
    print('Questions processed:', nAns)
    print(matrix3D.shape)
    return matrix3D
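# Example call (sketch); assumes cc.it.300.bin is available under gensim's
# data path and that the inputs are Italian strings:
questions = ["Non riesco ad accedere al portale", "Come reimposto la password?"]
embeddings = wordEmbedding(questions)
print(embeddings.shape)  # (2, 50, 300)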