def tokenize_and_stem(text):
    # Check whether the text is English or Danish.
    filtered_text = re.sub(r"[^\w\-'/]", ' ', text)
    lang = detect(filtered_text)
    if lang in valid_languages:
        tokens = filtered_text.lower().split(" ")
        processed = []
        stopwords = []
        if lang == "da":
            snowball = DanishStemmer()
            stopwords = dk_stop
        if lang == "en":
            snowball = EnglishStemmer()
            stopwords = eng_stop
        # Drop stopwords, newline remnants, empty tokens and pure digits.
        for token in tokens:
            if token in stopwords:
                continue
            elif "\n" in token:
                continue
            elif "\\n" in token:
                continue
            elif token == "":
                continue
            elif token.isdigit():
                continue
            else:
                processed.append(token)
        stemmed = []
        for token in processed:
            stemmed.append(snowball.stem(token))
        return stemmed
    # Note: implicitly returns None when the detected language is unsupported.
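# A minimal usage sketch for tokenize_and_stem (not from the source):
# `valid_languages`, `dk_stop` and `eng_stop` are module-level names the
# function relies on; the definitions below are assumptions for illustration.
import re
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem.snowball import DanishStemmer, EnglishStemmer

valid_languages = {"da", "en"}
dk_stop = set(stopwords.words('danish'))
eng_stop = set(stopwords.words('english'))

print(tokenize_and_stem("Hundene løber hurtigt gennem skoven"))
# -> a list of Danish stems, e.g. ['hund', 'løb', ...] (exact stems depend on
#    the Snowball stemmer; langdetect may occasionally misclassify short text)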
def word_normalize(s: str, method: str = "l") -> str:
    """Split a string and lemmatize or stem every word, except acronyms."""
    if method not in ["s", "l"]:
        raise ValueError("Method must be either 's' or 'l' for either "
                         "stemming or lemmatizing")
    # TODO: change this to match language
    lemmatizer = lemmy.load("da")
    stemmer = DanishStemmer()
    words = s.split(" ")
    norm_words = []
    for w in words:
        if w.isupper():
            # Keep acronyms untouched.
            norm_words.append(w)
        else:
            if method == "l":
                # lemmy returns a list of candidate lemmas.
                w = lemmatizer.lemmatize("", w)
                norm_words.extend(w)
            else:
                w = stemmer.stem(w)
                norm_words.append(w)
    return " ".join(norm_words)
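# A minimal usage sketch for word_normalize, assuming `lemmy` and the NLTK
# Snowball stemmer are importable as used in the function above.
import lemmy
from nltk.stem.snowball import DanishStemmer

print(word_normalize("hundene løber", method="l"))
# -> lemmas, e.g. 'hund løbe' (lemmy may emit several candidates per word)
print(word_normalize("hundene løber", method="s"))
# -> stems, e.g. 'hund løb'
print(word_normalize("DSB kører", method="l"))
# -> the all-uppercase acronym 'DSB' is passed through unchanged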
def __init__(self): """Set up tokenizer.""" self.logger = logging.getLogger(__name__ + '.Corpus') self.logger.addHandler(logging.NullHandler()) self.logger.debug('Setup word tokenizer') self.word_tokenizer = WordPunctTokenizer() self.logger.debug('Setup stemmer') self.stemmer = DanishStemmer()
def __init__(self): """Initialize logger and and database.""" self.logger = logging.getLogger(__name__ + '.Dannet') self.logger.addHandler(logging.NullHandler()) self.logger.debug('Initializing tokenizer and stemmer') self.word_tokenizer = WordPunctTokenizer() self.stemmer = DanishStemmer() self._db = None
def __init__(self): """Set up data directory and other attributes.""" self.logger = logging.getLogger('dasem.gutenberg.Gutenberg') self.logger.addHandler(logging.NullHandler()) self.data_directory = join(data_directory(), 'gutenberg', 'aleph.gutenberg.org') self.sentence_tokenizer = nltk.data.load( 'tokenizers/punkt/danish.pickle') self.whitespaces_pattern = re.compile('\s+', flags=re.DOTALL | re.UNICODE) self.word_tokenizer = WordPunctTokenizer() self.stemmer = DanishStemmer()
def token(X, words_only=False, word_normalize=True, emoji_normalize=True,
          remove_digits=True, lower_case=True, stop_words=None):
    '''Tokenize a string. Requires a stemmer if word_normalize=True.'''

    # eyes [nose] mouth | mouth [nose] eyes pattern
    emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
    emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)

    # Keep words only; digits count as word characters, emoticons do not.
    if words_only:
        clean_text = re.sub(r'[\W]+', ' ', X)
    else:
        clean_text = '{}{}'.format(re.sub(r'[\W]+', ' ', X),
                                   ''.join(re.findall(emoticon_re, X)))

    # Normalize emoticons, e.g. ';-)' becomes ':)'.
    if emoji_normalize:
        clean_text = (re.sub(r'[\W]+', ' ', X) + ' '.join(
            re.findall(emoticon_re, X)).replace(';', ':').replace('-', ''))

    if remove_digits:
        clean_text = clean_text.translate(str.maketrans('', '', '0123456789'))

    if lower_case:
        clean_text = clean_text.lower()

    if word_normalize:
        stemmer = DanishStemmer()
        clean_text = ' '.join(
            stemmer.stem(word) for word in clean_text.split())

    if stop_words:
        return [word for word in clean_text.split()
                if word not in stop_words]
    else:
        return clean_text.split()
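# A minimal usage sketch for token(); the input string is illustrative.
text = "Jeg ELSKER is :-) 123"
print(token(text))
# -> lowercased Danish stems with digits removed and ':-)' normalized to ':)'
print(token(text, words_only=True, word_normalize=False, emoji_normalize=False))
# -> plain lowercased tokens; the digits and the emoticon are dropped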
def __init__(self, danish_filename=DANISH_FILENAME,
             tar_gz_filename=TGZ_PARALLEL_CORPUS_FILENAME):
    """Set up filename.

    Parameters
    ----------
    danish_filename : str
        Filename for '.da' file in the tar.gz file.
    tar_gz_filename : str
        Filename for tar.gz or tgz file with Danish/English.

    """
    self.logger = logging.getLogger(__name__ + '.Europarl')
    self.logger.addHandler(logging.NullHandler())

    self.tar_gz_filename = tar_gz_filename
    self.danish_filename = danish_filename

    self.word_tokenizer = WordPunctTokenizer()
    self.stemmer = DanishStemmer()
def __init__(self, filename=BZ2_XML_DUMP_FILENAME):
    """Prepare dump file for reading.

    Parameters
    ----------
    filename : str
        Filename of the XML dump file.

    """
    self.logger = logging.getLogger(__name__)
    self.logger.addHandler(logging.NullHandler())

    full_filename = self.full_filename(filename)
    self.filename = full_filename

    self.sentence_tokenizer = nltk.data.load(
        'tokenizers/punkt/danish.pickle')
    self.whitespaces_pattern = re.compile(
        r'\s+', flags=re.DOTALL | re.UNICODE)
    self.word_tokenizer = WordPunctTokenizer()
    self.stemmer = DanishStemmer()

    # Match MediaWiki markup to skip (templates, comments, file and
    # category links, external links); capture plain, possibly hyphenated,
    # words in the single group.
    self.word_pattern = re.compile(
        r"""{{.+?}}|
        <!--.+?-->|
        \[\[Fil.+?\]\]|
        \[\[Kategori:.+?\]\]|
        \[http.+?\]|(\w+(?:-\w+)*)""",
        flags=re.UNICODE | re.VERBOSE | re.DOTALL)

    self.paragraph_split_pattern = re.compile(
        r'\n\s*\n', flags=re.DOTALL | re.UNICODE)

    self.ignored_words_pattern = re.compile(
        r"""
        (?:(?:thumb|thumbnail|left|right|\d+px|upright(?:=[0-9\.]+)?)\|)+
        |^\s*\|.+$
        |^REDIRECT\b""",
        flags=re.DOTALL | re.UNICODE | re.VERBOSE | re.MULTILINE)

    self.itemized_split_pattern = re.compile(
        r"^ |^Kategori:",
        flags=re.DOTALL | re.UNICODE | re.MULTILINE)
def __init__(self, lib_path=THIRD_PATY_PATH):
    """Constructor. Initialize class attributes."""
    self.word2idx = {}
    self.idx2word = {}
    self.words = []
    self.raw_data = []
    self.seq_data = None
    self.count = None
    self.word_ctx = defaultdict(set)
    self.pw_data = []
    self.docs = defaultdict(dict)
    self.lines = defaultdict(list)
    self.re_rules = []
    self.lib_path = lib_path
    self.ner_tagger = self.set_ner_tagger(NER_MODEL, NER_JAR)
    self.pos_tagger = self.set_pos_tagger(POS_MODEL, POS_JAR)
    self.pt_stemmer = PorterStemmer()
    self.dan_stemmer = DanishStemmer()
    self.lemma = WordNetLemmatizer()
    self.tfidf = TfidfVectorizer(tokenizer=self.tokenize,
                                 stop_words='english')
def preprocess_text(text):
    # text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = str(text).lower().strip()

    # Caveat: this might conflict with English text.
    da_stop_words = stopwords.words('danish')
    stemmer = DanishStemmer()
    lemmatizer = lemmy.load("da")

    # Stem every word (this also removes plurals).
    textblob = TextBlob(text)
    singles = [stemmer.stem(word) for word in textblob.words]

    # Remove Danish stopwords.
    no_stop_words = [word for word in singles if word not in da_stop_words]

    # Join the text so it can be lemmatized.
    joined_text = " ".join(no_stop_words)

    # Lemmatization; lemmy returns a list of candidate lemmas.
    final_text = lemmatizer.lemmatize("", joined_text)

    return final_text[0]
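# A minimal usage sketch for preprocess_text, assuming the module imports
# nltk stopwords, TextBlob, DanishStemmer and lemmy as used above.
print(preprocess_text("Hundene løber hurtigt"))
# -> the first lemma candidate for the stemmed, stopword-filtered string;
#    note that lemmy is handed the whole joined string as one token here.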
def convert_text_to_rouge_format(text, title="dummy title"):
    """Convert a text to a format ROUGE understands.

    The text is assumed to contain one sentence per '<q>' separator.

    text: The text to convert, containing one sentence per '<q>'.
    title: Optional title for the text. The title will appear in the
        converted file, but doesn't seem to have any other relevance.

    Returns: The converted text as string.
    """
    # sentences = text.split("\n")
    from nltk.stem.snowball import DanishStemmer
    stemmer = DanishStemmer()
    sentences = text.split("<q>")
    output = []
    for sentence in sentences:
        output.append(" ".join([stemmer.stem(i) for i in sentence.split()]))
    sent_elems = [
        "<a name=\"{i}\">[{i}]</a> <a href=\"#{i}\" id={i}>"
        "{text}</a>".format(i=i, text=sent)
        for i, sent in enumerate(output, start=1)
    ]
    html = """<html>
<head>
<title>{title}</title>
</head>
<body bgcolor="white">
{elems}
</body>
</html>""".format(title=title, elems="\n".join(sent_elems))

    return html
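# A minimal usage sketch for convert_text_to_rouge_format; sentences are
# separated by '<q>', matching the split in the function above.
html = convert_text_to_rouge_format(
    "Det første punktum.<q>Det andet punktum.", title="Eksempel")
print(html)  # an HTML page with one stemmed, anchored sentence per <a> element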
from flask import Flask, request, render_template
from sklearn.externals import joblib  # deprecated; on scikit-learn >= 0.23 use `import joblib`
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import DanishStemmer

nltk.download('stopwords')
stopwords = stopwords.words('danish')
stemmer = DanishStemmer()


def text_process(name):
    """Text processing that lowercases, removes stopwords and stems each word."""
    lst = name.lower().split(' ')
    stop = [word for word in lst if word not in stopwords]
    stem = [stemmer.stem(word) for word in stop]
    return stem


pipeline = joblib.load('model/predict_business.pkl')


def predict_business(name):
    return pipeline.predict([name])[0]


app = Flask(__name__)
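# A minimal, hypothetical route wiring predict_business into the app above;
# the endpoint name and the 'name' form field are assumptions, not from the
# source. Returning a dict relies on Flask's built-in JSON conversion.
@app.route('/predict', methods=['POST'])
def predict():
    name = request.form.get('name', '')
    return {'business': predict_business(name)}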
def __init__(self):
    """Read the STO mapping and word list and set up the Danish stemmer."""
    self._read_sto_mapping()
    self._read_sto_words()
    self._stemmer = DanishStemmer()
def stem_lem(words, documents, stem_or_lem: bool = False):
    """
    Updates a word list and a corpus to use stemmed/lemmatized words.

    :param stem_or_lem: bool indicating whether to apply the stemmer or the
        lemmatizer. True is stem, False is lem.
    :param documents: a list of documents, each a list of words
    :param words: a list of words
    :return: the gensim dictionary and updated corpus, where all words have
        been replaced by stemmed/lemmatized versions.
    """
    stop_words = stopwords.words('danish')
    stop_words.extend(
        list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    if stem_or_lem:
        # Stemming
        stemmer = DanishStemmer()
        # Update the word list to use stemmed words.
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            stem = stemmer.stem(word)
            if stem != word:
                if word not in remove:
                    remove.append(word)
                if stem not in add and stem not in stop_words:
                    add.append(stem)
                if word not in translator and stem not in stop_words:
                    translator[word] = stem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])
    else:
        # Lemmatization
        lemmer = lemmy.load("da")
        # Build up a dictionary that translates old words into their new versions.
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            # lemmy returns a list of candidate lemmas.
            lem = lemmer.lemmatize("", word)
            other = [x for x in lem if x != word]
            if len(other) > 0:
                if word not in lem and word not in remove:
                    remove.append(word)
                # Add all lemma options if they are not stopwords.
                add.extend(
                    [x for x in lem if x not in stop_words and x not in add])
                if word not in translator and lem not in stop_words:
                    lem = " ".join(lem)
                    translator[word] = lem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])

    # Update the corpus to use the stemmed/lemmatized words.
    for x in tqdm(range(len(documents))):
        sentence = documents[x]
        for i in range(len(sentence)):
            word = sentence[i]
            if word in translator:
                sentence[i] = translator[word]
        sentence = ' '.join(sentence)
        sentence = sentence.split(' ')
        documents[x] = sentence

    diction = gensim.corpora.Dictionary(documents)
    d_words = diction.token2id
    good_ids = [d_words[x] for x in words]
    diction.filter_tokens(good_ids=good_ids)
    diction.compactify()
    return diction, documents
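# A minimal usage sketch for stem_lem with toy inputs; it assumes the
# stopword CSV read via utility.load_vector_file exists on disk and that
# nltk stopwords, lemmy, tqdm and gensim are imported at module level.
words = ["hunden", "hundene", "katten"]
documents = [["hundene", "jager", "katten"], ["hunden", "sover"]]
diction, documents = stem_lem(words, documents, stem_or_lem=True)
print(diction.token2id)  # only the stemmed survivors of `words` remain
print(documents)         # e.g. [['hund', 'jager', 'kat'], ['hund', 'sover']]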