Example #1
from spacy.lang.ar import Arabic
from spacy.lang.da import Danish
from spacy.lang.el import Greek
from spacy.lang.en import English
from spacy.lang.xx import MultiLanguage

# IndexedToken is a helper class from the surrounding project that pairs
# a spaCy token with an index.

def predict_spans(model, text: str, language: str = "en"):
    # Pick a blank spaCy pipeline for the requested language; fall back to
    # the language-agnostic MultiLanguage pipeline.
    if language == "en":
        nlp = English()
    elif language == "el":
        nlp = Greek()
    elif language == "da":
        nlp = Danish()
    elif language == "ar":
        nlp = Arabic()
    else:
        nlp = MultiLanguage()

    tokenizer = nlp.tokenizer
    tokens = tokenizer(text)
    sentences = []
    tokenised_text = []
    cleaned_tokens = []
    cleaned_index = 0
    # Keep every token, but give non-whitespace tokens a compact index that
    # matches their position in the model input; whitespace tokens keep
    # their original spaCy doc index instead.
    for token in tokens:
        if not token.text.isspace():
            tokenised_text.append(token.text)
            indexed_token = IndexedToken(token, cleaned_index)
            cleaned_tokens.append(indexed_token)
            cleaned_index += 1
        else:
            indexed_token = IndexedToken(token, token.i)
            cleaned_tokens.append(indexed_token)

    sentences.append(tokenised_text)

    predictions, raw_outputs = model.predict(sentences, split_on_space=False)
    span_predictions = []
    sentence_prediction = predictions[0]

    for cleaned_token in cleaned_tokens:
        # Whitespace tokens were never sent to the model, so skip them
        # before the bounds check: their stored index is the spaCy doc
        # index, which may exceed the prediction length and would
        # otherwise end the loop early.
        if cleaned_token.token.text.isspace():
            continue

        if cleaned_token.clean_index >= len(sentence_prediction):
            break

        word_prediction = sentence_prediction[cleaned_token.clean_index]
        toxicness = word_prediction[cleaned_token.token.text]
        if toxicness == "TOXIC":
            location = cleaned_token.token.idx
            # If this toxic token follows the previous toxic span with a
            # single separating space, mark that space as toxic too so the
            # span stays contiguous.
            if len(span_predictions) > 0:
                last_index = span_predictions[-1]
                if location == last_index + 2:
                    span_predictions.append(location - 1)
            length = len(cleaned_token.token.text)
            for i in range(length):
                span_predictions.append(location + i)
    return span_predictions
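
The model argument's predict(sentences, split_on_space=False) call matches simpletransformers' NERModel API, so a call might look like the following. This is a hypothetical sketch: the model path and input text are assumptions, and IndexedToken must come from the surrounding project.

# Hypothetical usage; assumes a simpletransformers NERModel fine-tuned
# with a "TOXIC" tag and saved under outputs/toxic-spans (invented path).
from simpletransformers.ner import NERModel

model = NERModel("bert", "outputs/toxic-spans", use_cuda=False)
spans = predict_spans(model, "some example text", language="en")
print(spans)  # character offsets predicted as toxic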
Example #2
import re
from string import digits

import nltk
from nltk import bigrams

import spacy
from spacy.lang.da import Danish

from wordcloud import WordCloud

################################################################################################
## PREPARE DATA FUNCTIONS
################################################################################################
activated = spacy.prefer_gpu()
sp = spacy.load('da_core_news_lg')
nlp = Danish()
tokenizer = nlp.tokenizer

# Read the custom stop-word list; a context manager ensures the file is closed.
with open("stop_words.txt") as file:
    stop_words = file.read().split()

# Tokenize and Lemmatize stop words
joint_stops = " ".join(stop_words)
tokenized = tokenizer(joint_stops).doc.text
stops = sp(tokenized)
my_stop_words = [t.lemma_ for t in stops]
my_stop_words = list(set(my_stop_words))

def extract_hashtags(row):
    # Expects a mapping (e.g. a pandas DataFrame row) with a "text" field.
    # Note: despite the name, re.findall does not deduplicate matches.
    unique_hashtag_list = list(re.findall(r'#\S*\w', row["text"]))
    return unique_hashtag_list
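
A short, hypothetical usage sketch for extract_hashtags; pandas and the sample rows are assumptions, not part of the original snippet.

# Hypothetical usage with a pandas DataFrame.
import pandas as pd

df = pd.DataFrame({"text": ["god mandag #dkpol #dkmedier", "ingen tags her"]})
df["hashtags"] = df.apply(extract_hashtags, axis=1)
print(df["hashtags"].tolist())  # [['#dkpol', '#dkmedier'], []]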
Example #3
from spacy.lang.da import Danish


def da_nlp():
    # Factory for a blank Danish pipeline (tokenizer only, no trained components).
    return Danish()
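
A minimal sketch of what the blank pipeline returned by da_nlp() does: it tokenizes with Danish rules but carries no trained components, so it is cheap to construct and suited to tokenization-only work.

nlp = da_nlp()
doc = nlp("Fisk er en god spise.")
print([t.text for t in doc])  # ['Fisk', 'er', 'en', 'god', 'spise', '.']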
Example #4
import os

import spacy
from spacy.lang.da import Danish
from spacy.lang.en import English

os.chdir("C:/Users/NHJ/Desktop/playground/DanishPreProcessing/")

nlp_en = English()

nlp_da = Danish()

doc = nlp_da(u'Fisk er en god spise, især på en torsdag med kage.')

# Danish() is a blank pipeline, so only tokenization happens here; lemma_
# and pos_ stay empty unless a trained pipeline (e.g. da_core_news_sm) is used.
for token in doc:
    print(f"{token.text} | {token.lemma_} | {token.pos_}")

# Get POS data

import xml.etree.ElementTree as etree
from bs4 import BeautifulSoup

TRAIN_DATA = [('Who is Shaka Khan?', {
    'entities': [(7, 17, 'PERSON')]
}),
              ('I like London and Berlin.', {
                  'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
              })]

ddt_data_filename = 'data/ddt-1.0.xml'
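
TRAIN_DATA uses spaCy's classic (text, {"entities": [(start, end, label)]}) annotation format. A minimal sketch, assuming spaCy v3's training API, of turning those tuples into Example objects an NER pipe can train on:

# Sketch only; assumes spaCy v3.
from spacy.training import Example

nlp_ner = spacy.blank("en")
ner = nlp_ner.add_pipe("ner")
for _, annotations in TRAIN_DATA:
    for start, end, label in annotations["entities"]:
        ner.add_label(label)

examples = [
    Example.from_dict(nlp_ner.make_doc(text), annotations)
    for text, annotations in TRAIN_DATA
]
nlp_ner.initialize(lambda: examples)
losses = nlp_ner.update(examples)  # one optimization step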
Example #5
import collections
import os
import re
import typing

# Requires gensim < 4.0; the summarization module was removed in gensim 4.x.
from gensim.summarization.textcleaner import split_sentences
from spacy.lang.da import Danish

from .decoder import decode_simple

DANISH_SPACY_TOKENIZER = Danish()


def get_ngrams(sent, n=2, as_string=False):
    """
    Given a sentence (as a string or a list of words), return all ngrams
    of order n in a list of tuples [(w1, w2), (w2, w3), ...].
    If as_string is True, each ngram is joined with underscores instead.
    """

    ngrams = []
    # Accept either a raw string or a pre-tokenized list of words.
    words = sent.split() if isinstance(sent, str) else sent
    if n == 1:
        return words

    N = len(words)
    for i in range(n - 1, N):
        ngram = words[i - n + 1:i + 1]
        if as_string:
            ngrams.append("_".join(ngram))
        else:
            ngrams.append(tuple(ngram))

    return ngrams
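
Illustrative calls to get_ngrams:

print(get_ngrams("en god spise", n=2))
# [('en', 'god'), ('god', 'spise')]
print(get_ngrams("en god spise", n=2, as_string=True))
# ['en_god', 'god_spise']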
Example #6
# Excerpt from a larger class; the source project supplies these imports:
#   from afinn import Afinn
#   from spacy.lang.da import Danish
def __init__(self):
    self.nlp = Danish()
    self.sentiment_cls = Afinn(language="da")
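
The two attributes split the work: the blank Danish() pipeline tokenizes while Afinn scores sentiment. A standalone sketch of the same pairing (the example sentence is an assumption):

from afinn import Afinn
from spacy.lang.da import Danish

nlp = Danish()
afinn = Afinn(language="da")

doc = nlp("Det var en fantastisk dag")
print([t.text for t in doc])  # tokenized words
print(afinn.score(doc.text))  # sentiment valence as a float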