from spacy.lang.ar import Arabic
from spacy.lang.da import Danish
from spacy.lang.el import Greek
from spacy.lang.en import English
from spacy.lang.xx import MultiLanguage


class IndexedToken:
    """Pairs a spaCy token with its index among the non-whitespace tokens.

    Reconstructed from its usage below; the project's own definition may differ.
    """

    def __init__(self, token, clean_index):
        self.token = token
        self.clean_index = clean_index


def predict_spans(model, text: str, language: str = "en"):
    """Return the character offsets that the token classifier marks as toxic."""
    # Pick a blank spaCy pipeline for the requested language (tokenizer only).
    if language == "en":
        nlp = English()
    elif language == "el":
        nlp = Greek()
    elif language == "da":
        nlp = Danish()
    elif language == "ar":
        nlp = Arabic()
    else:
        nlp = MultiLanguage()
    tokenizer = nlp.tokenizer
    tokens = tokenizer(text)

    # Keep two views of the input: the non-whitespace tokens fed to the model,
    # and every token paired with its index into that cleaned sequence.
    sentences = []
    tokenised_text = []
    cleaned_tokens = []
    cleaned_index = 0
    for token in tokens:
        if not token.text.isspace():
            tokenised_text.append(token.text)
            cleaned_tokens.append(IndexedToken(token, cleaned_index))
            cleaned_index += 1
        else:
            cleaned_tokens.append(IndexedToken(token, token.i))
    sentences.append(tokenised_text)

    predictions, raw_outputs = model.predict(sentences, split_on_space=False)

    span_predictions = []
    sentence_prediction = predictions[0]
    for cleaned_token in cleaned_tokens:
        # Skip whitespace tokens first: they carry no prediction, and their raw
        # document index must not trigger the truncation check below.
        if cleaned_token.token.text.isspace():
            continue
        # Stop if the model returned fewer predictions than tokens
        # (e.g. the input was truncated).
        if cleaned_token.clean_index >= len(sentence_prediction):
            break
        word_prediction = sentence_prediction[cleaned_token.clean_index]
        toxicness = word_prediction[cleaned_token.token.text]
        if toxicness == "TOXIC":
            location = cleaned_token.token.idx
            # Bridge the single separating character (usually a space) between
            # two adjacent toxic tokens so the span stays contiguous.
            if span_predictions and location == span_predictions[-1] + 2:
                span_predictions.append(location - 1)
            for i in range(len(cleaned_token.token.text)):
                span_predictions.append(location + i)
    return span_predictions
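A minimal usage sketch for predict_spans, assuming `model` is a simpletransformers NERModel fine-tuned with word-level toxicity labels; the checkpoint path "outputs/best_model" and the label set are placeholders, since the function above only requires that model.predict accept a list of token lists and return one {word: label} dict per token.

from simpletransformers.ner import NERModel

# Hypothetical checkpoint path and label set; adjust to the trained model.
model = NERModel(
    "bert",
    "outputs/best_model",
    labels=["TOXIC", "NOT_TOXIC"],
    use_cuda=False,
)

offsets = predict_spans(model, "what a toxic comment", language="en")
print(offsets)  # character offsets flagged as toxic by the model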
import re
from string import digits

import nltk
from nltk import bigrams
import spacy
from spacy.lang.da import Danish
from wordcloud import WordCloud

################################################################################################
## PREPARE DATA FUNCTIONS
################################################################################################

activated = spacy.prefer_gpu()
sp = spacy.load('da_core_news_lg')
nlp = Danish()
tokenizer = nlp.tokenizer

with open("stop_words.txt") as stop_word_file:
    stop_words = stop_word_file.read().split()

# Tokenize and lemmatize stop words
joint_stops = " ".join(stop_words)
tokenized = tokenizer(joint_stops).doc.text
stops = sp(tokenized)
my_stop_words = [t.lemma_ for t in stops]
my_stop_words = list(set(my_stop_words))


def extract_hashtags(row):
    unique_hashtag_list = list(re.findall(r'#\S*\w', row["text"]))
    return unique_hashtag_list
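A short usage sketch for extract_hashtags, assuming (as row["text"] suggests) that it is applied row-wise to a pandas DataFrame with a "text" column; pandas itself is not imported in the original snippet.

import pandas as pd

df = pd.DataFrame({"text": ["God torsdag! #fisk #kage", "ingen hashtags her"]})
df["hashtags"] = df.apply(extract_hashtags, axis=1)
print(df["hashtags"].tolist())  # [['#fisk', '#kage'], []]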
from spacy.lang.da import Danish


def da_nlp():
    """Return a blank Danish spaCy pipeline (no trained components)."""
    return Danish()
import os

import spacy
from spacy.lang.da import Danish
from spacy.lang.en import English

os.chdir("C:/Users/NHJ/Desktop/playground/DanishPreProcessing/")

nlp_en = English()
nlp_da = Danish()

doc = nlp_da(u'Fisk er en god spise, især på en torsdag med kage.')
for token in doc:
    # Note: a blank Danish() pipeline has no trained tagger or lemmatizer, so
    # pos_ and lemma_ stay empty; load a trained model such as da_core_news_sm
    # to get real POS data.
    print(f"{token.text} | {token.lemma_} | {token.pos_}")

# Get POS data
import xml.etree.ElementTree as etree
from bs4 import BeautifulSoup

TRAIN_DATA = [
    ('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]}),
    ('I like London and Berlin.', {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}),
]

ddt_data_filename = 'data/ddt-1.0.xml'
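A quick check of what the offsets in TRAIN_DATA mean: each (start, end, label) triple indexes into the raw text with an exclusive end, which is the span format spaCy's NER training expects.

for text, annotations in TRAIN_DATA:
    for start, end, label in annotations['entities']:
        print(repr(text[start:end]), label)
# 'Shaka Khan' PERSON
# 'London' LOC
# 'Berlin' LOC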
import collections
import os
import re
import typing

from gensim.summarization.textcleaner import split_sentences
from spacy.lang.da import Danish

from .decoder import decode_simple

DANISH_SPACY_TOKENIZER = Danish()


def get_ngrams(sent, n=2, as_string=False):
    """
    Given a sentence as a whitespace-separated string, return all ngrams of
    order n as a list of tuples [(w1, w2), (w2, w3), ...], or as
    underscore-joined strings when as_string=True.
    """
    ngrams = []
    words = sent.split()
    if n == 1:
        return words
    N = len(words)
    for i in range(n - 1, N):
        ngram = words[i - n + 1:i + 1]
        if as_string:
            ngrams.append("_".join(ngram))
        else:
            # Collect the ngram as a tuple of words.
            ngrams.append(tuple(ngram))
    return ngrams
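A small usage example for get_ngrams showing both output forms; it only exercises the function itself, so the module's relative import of decode_simple is not needed here.

print(get_ngrams("fisk er en god spise", n=2))
# [('fisk', 'er'), ('er', 'en'), ('en', 'god'), ('god', 'spise')]
print(get_ngrams("fisk er en god spise", n=2, as_string=True))
# ['fisk_er', 'er_en', 'en_god', 'god_spise']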
# Method of a sentiment wrapper class (the class definition is not shown in the snippet).
# Requires: from afinn import Afinn; from spacy.lang.da import Danish
def __init__(self):
    self.nlp = Danish()                         # blank Danish pipeline (tokenizer)
    self.sentiment_cls = Afinn(language="da")   # AFINN word-list sentiment scorer
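A hedged sketch of how the two components initialised above could fit together; the enclosing class is not shown in the original, and the scoring call assumes afinn's standard Afinn.score() API.

from afinn import Afinn
from spacy.lang.da import Danish

nlp = Danish()
afinn = Afinn(language="da")

text = "Det er en rigtig god dag"
tokens = [t.text for t in nlp(text)]
print(afinn.score(" ".join(tokens)))  # AFINN valence score for the tokenized text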