def word_normalize(s: str, method: str = "l") -> str:
    """Split a string and lemmatize or stem every single word, except acronyms."""
    if method not in ["s", "l"]:
        raise ValueError("Method must be either 's' or 'l' for either "
                         "stemming or lemmatizing")
    # TODO: change this to match language
    lemmatizer = lemmy.load("da")
    stemmer = DanishStemmer()
    words = s.split(" ")
    norm_words = []
    for w in words:
        if w.isupper():
            # Keep acronyms untouched
            norm_words.append(w)
        else:
            if method == "l":
                # lemmy returns a list of candidate lemmas
                w = lemmatizer.lemmatize("", w)
                norm_words.extend(w)
            else:
                w = stemmer.stem(w)
                norm_words.append(w)
    return " ".join(norm_words)
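# Usage sketch (not part of the original snippet): assumes the module-level
# imports used above, i.e. `import lemmy` and
# `from nltk.stem.snowball import DanishStemmer`, and that the Danish lemmy
# model is available.
print(word_normalize("hundene DR viste", method="l"))  # lemmatized words, "DR" kept as-is
print(word_normalize("hundene DR viste", method="s"))  # stemmed words, "DR" kept as-is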
def tokenize_and_stem(text):
    ## check if eng or dk
    filtered_text = re.sub(r"[^\w\-'/]", ' ', text)
    lang = detect(filtered_text)
    if lang in valid_languages:
        tokens = filtered_text.lower().split(" ")
        processed = []
        stopwords = []
        if lang == "da":
            snowball = DanishStemmer()
            stopwords = dk_stop
        if lang == "en":
            snowball = EnglishStemmer()
            stopwords = eng_stop
        for token in tokens:
            if token in stopwords:
                continue
            elif "\n" in token:
                continue
            elif "\\n" in token:
                continue
            elif token == "":
                continue
            elif token.isdigit():
                continue
            else:
                processed.append(token)
        stemmed = []
        for token in processed:
            stemmed.append(snowball.stem(token))
        return stemmed
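# Sketch of the module-level names the snippet above relies on (assumed, not
# shown in the original): `detect` from langdetect, NLTK stopword lists
# (requires nltk.download('stopwords')), and the accepted language codes.
import re
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem.snowball import DanishStemmer, EnglishStemmer

valid_languages = ["da", "en"]
dk_stop = stopwords.words("danish")
eng_stop = stopwords.words("english")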
def __init__(self):
    """Set up tokenizer."""
    self.logger = logging.getLogger(__name__ + '.Corpus')
    self.logger.addHandler(logging.NullHandler())

    self.logger.debug('Setup word tokenizer')
    self.word_tokenizer = WordPunctTokenizer()

    self.logger.debug('Setup stemmer')
    self.stemmer = DanishStemmer()
def __init__(self):
    """Initialize logger and database."""
    self.logger = logging.getLogger(__name__ + '.Dannet')
    self.logger.addHandler(logging.NullHandler())

    self.logger.debug('Initializing tokenizer and stemmer')
    self.word_tokenizer = WordPunctTokenizer()
    self.stemmer = DanishStemmer()

    self._db = None
def __init__(self):
    """Set up data directory and other attributes."""
    self.logger = logging.getLogger('dasem.gutenberg.Gutenberg')
    self.logger.addHandler(logging.NullHandler())

    self.data_directory = join(data_directory(), 'gutenberg',
                               'aleph.gutenberg.org')
    self.sentence_tokenizer = nltk.data.load(
        'tokenizers/punkt/danish.pickle')
    self.whitespaces_pattern = re.compile(r'\s+',
                                          flags=re.DOTALL | re.UNICODE)
    self.word_tokenizer = WordPunctTokenizer()
    self.stemmer = DanishStemmer()
def token(X, words_only=False, word_normalize=True, emoji_normalize=True,
          remove_digits=True, lower_case=True, stop_words=None):
    '''Tokenize a string. Requires a stemmer if word_normalize=True.'''
    # eyes [nose] mouth | mouth [nose] eyes pattern
    emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
    emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)

    # Keep words only: digits are kept here, emoticons are dropped
    if words_only:
        clean_text = re.sub(r'[\W]+', ' ', X)
    else:
        clean_text = '{}{}'.format(re.sub(r'[\W]+', ' ', X),
                                   ''.join(re.findall(emoticon_re, X)))

    # Normalize emoticons, e.g. ';-)' and ':-)' both become ':)'
    if emoji_normalize:
        clean_text = (re.sub(r'[\W]+', ' ', X) + ' '.join(
            re.findall(emoticon_re, X)).replace(';', ':').replace('-', ''))

    if remove_digits:
        clean_text = clean_text.translate(str.maketrans('', '', '0123456789'))

    if lower_case:
        clean_text = clean_text.lower()

    if word_normalize:
        stemmer = DanishStemmer()
        clean_text = ' '.join(
            stemmer.stem(word) for word in clean_text.split())

    if stop_words:
        return [word for word in clean_text.split()
                if word not in stop_words]
    else:
        return clean_text.split()
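# Usage sketch (assumed, not part of the original): relies on `import re` and
# `from nltk.stem.snowball import DanishStemmer` being in scope.
print(token("Jeg saa 2 glade hunde :-)", stop_words=["jeg"]))
# -> lower-cased, stemmed tokens with the digit and the stopword "jeg" removed,
#    and the emoticon kept in its normalized form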
class Corpus(with_metaclass(ABCMeta)):
    """Abstract class for corpus."""

    def __init__(self):
        """Set up tokenizer."""
        self.logger = logging.getLogger(__name__ + '.Corpus')
        self.logger.addHandler(logging.NullHandler())

        self.logger.debug('Setup word tokenizer')
        self.word_tokenizer = WordPunctTokenizer()

        self.logger.debug('Setup stemmer')
        self.stemmer = DanishStemmer()

    def iter_sentence_words(self, lower=True, stem=False):
        """Yield list of words from sentences.

        Parameters
        ----------
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Yields
        ------
        words : list of str
            List of words

        """
        for sentence in self.iter_sentences():
            words = self.word_tokenizer.tokenize(sentence)
            if lower:
                words = [word.lower() for word in words]
            if stem:
                words = [self.stemmer.stem(word) for word in words]
            yield words

    def iter_tokenized_sentences(self, lower=True, stem=False):
        """Yield string with tokenized sentences.

        Parameters
        ----------
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Yields
        ------
        tokenized_sentence : str
            Sentence as string with tokens separated by a whitespace.

        """
        for words in self.iter_sentence_words(lower=lower, stem=stem):
            tokenized_sentence = u(" ").join(words)
            yield tokenized_sentence
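# Minimal concrete subclass sketch (assumed, not from the original code): the
# two iterators above only need an `iter_sentences` method from a subclass.
class ListCorpus(Corpus):
    """Corpus backed by an in-memory list of sentences."""

    def __init__(self, sentences):
        super(ListCorpus, self).__init__()
        self.sentences = sentences

    def iter_sentences(self):
        for sentence in self.sentences:
            yield sentence

# corpus = ListCorpus([u"Hunden løber hurtigt."])
# print(next(corpus.iter_tokenized_sentences(stem=True)))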
def __init__(self, danish_filename=DANISH_FILENAME,
             tar_gz_filename=TGZ_PARALLEL_CORPUS_FILENAME):
    """Set up filename.

    Parameters
    ----------
    danish_filename : str
        Filename for '.da' file in the tar.gz file.
    tar_gz_filename : str
        Filename for tar.gz or tgz file with Danish/English.

    """
    self.logger = logging.getLogger(__name__ + '.Europarl')
    self.logger.addHandler(logging.NullHandler())

    self.tar_gz_filename = tar_gz_filename
    self.danish_filename = danish_filename
    self.word_tokenizer = WordPunctTokenizer()
    self.stemmer = DanishStemmer()
class StoLemmatizer(object):

    def __init__(self):
        self._read_sto_mapping()
        self._read_sto_words()
        self._stemmer = DanishStemmer()

    def _read_sto_mapping(self):
        self.sto_to_uni = {}
        with codecs.open(os.path.join(__location__, "da-sto.map"),
                         encoding='utf-8') as f:
            for line in f:
                sto, uni = line.strip().split("\t")
                self.sto_to_uni[sto] = uni

    def _read_sto_words(self):
        self.lookup_form_and_pos = {}
        self.lookup_form = {}

        with codecs.open(os.path.join(__location__, "STOposUTF8.txt"),
                         encoding='utf-8') as f:
            for line in f:
                form, lemma, pos = line.strip().split("\t")
                self.lookup_form_and_pos[
                    (form.lower(), self.sto_to_uni[pos])] = lemma.lower()
                self.lookup_form_and_pos[(form.lower(), None)] = lemma.lower()
                self.lookup_form[form.lower()] = lemma.lower()

    def lemmatize(self, form, pos=None):
        """Look up a word form with optional part of speech (17-tag UD tagset).

        The method implements a fall-back strategy. When a match with the
        correct part of speech cannot be found, it tries to match the word
        form with any part of speech. If this also fails, the word is stemmed
        (using the Snowball stemmer) instead of lemmatized.

        :param form:
        :param pos:
        :return:
        """
        form = form.lower()

        if pos in ('NUM', 'PUNCT', 'X', 'INTJ', 'SYM', 'PROPN'):
            return form

        if pos == "AUX":
            pos = "VERB"

        return self.lookup_form_and_pos.get((form, pos)) \
            or self.lookup_form_and_pos.get((form, None)) \
            or self._stemmer.stem(form)
def preprocess_text(text):
    # text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = str(text).lower().strip()

    # Caveat: this might conflict with English text
    da_stop_words = stopwords.words('danish')
    stemmer = DanishStemmer()
    lemmatizer = lemmy.load("da")

    # Stem every word (e.g. reduces plurals to the singular stem)
    textblob = TextBlob(text)
    singles = [stemmer.stem(word) for word in textblob.words]

    # Remove Danish stopwords
    no_stop_words = [word for word in singles if word not in da_stop_words]

    # Join the text so it can be lemmatized
    joined_text = " ".join(no_stop_words)

    # Lemmatization
    final_text = lemmatizer.lemmatize("", joined_text)

    return final_text[0]
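# Usage sketch (assumed, not in the original): needs NLTK's Danish stopwords,
# TextBlob (with its NLTK tokenizer data), lemmy and the imports used above.
# Note that the whole stemmed, stopword-filtered string is handed to the
# lemmatizer as a single form and only the first candidate is returned.
print(preprocess_text("Hundene løber rundt i parkerne"))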
def convert_text_to_rouge_format(text, title="dummy title"):
    """Convert a text to a format ROUGE understands.

    The text is assumed to contain one sentence per "<q>" separator.

    text: The text to convert, containing one sentence per "<q>" separator.
    title: Optional title for the text. The title will appear in the
        converted file, but doesn't seem to have any other relevance.

    Returns: The converted text as string.
    """
    # sentences = text.split("\n")
    from nltk.stem.snowball import DanishStemmer
    stemmer = DanishStemmer()
    sentences = text.split("<q>")
    output = []
    for sentence in sentences:
        output.append(" ".join([stemmer.stem(i) for i in sentence.split()]))
    sent_elems = [
        "<a name=\"{i}\">[{i}]</a> <a href=\"#{i}\" id={i}>"
        "{text}</a>".format(i=i, text=sent)
        for i, sent in enumerate(output, start=1)
    ]
    html = """<html>
<head>
<title>{title}</title>
</head>
<body bgcolor="white">
{elems}
</body>
</html>""".format(title=title, elems="\n".join(sent_elems))

    return html
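# Usage sketch (assumed, not part of the original): sentences are separated by
# "<q>" and each word is stemmed before being wrapped in the ROUGE HTML format.
html = convert_text_to_rouge_format(
    "Det var en god prædiken<q>Den handlede om næstekærlighed",
    title="eksempel")
print(html)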
class StoLemmatizer(object):

    def __init__(self):
        self._read_sto_mapping()
        self._read_sto_words()
        self._stemmer = DanishStemmer()

    def _read_sto_mapping(self):
        self.sto_to_uni = {}
        with codecs.open(os.path.join(__location__, "da-sto.map"),
                         encoding='utf-8') as f:
            for line in f:
                sto, uni = line.strip().split("\t")
                self.sto_to_uni[sto] = uni

    def _read_sto_words(self):
        self.lookup_form_and_pos = {}
        self.lookup_form = {}

        with codecs.open(os.path.join(__location__, "STOposUTF8.txt"),
                         encoding='utf-8') as f:
            for line in f:
                form, lemma, pos = line.strip().split("\t")
                self.lookup_form_and_pos[(
                    form.lower(), self.sto_to_uni[pos])] = lemma.lower()
                self.lookup_form_and_pos[(form.lower(), None)] = lemma.lower()
                self.lookup_form[form.lower()] = lemma.lower()

    def lemmatize(self, form, pos=None):
        """Look up a word form with optional part of speech (universal tagset).

        The method implements a fall-back strategy. When a match with the
        correct part of speech cannot be found, it tries to match the word
        form with any part of speech. If this also fails, the word is stemmed
        (using the Snowball stemmer) instead of lemmatized.

        :param form:
        :param pos:
        :return:
        """
        form = form.lower()

        if pos in ('NUM', '.', 'X'):
            return form

        return self.lookup_form_and_pos.get((form, pos)) \
            or self.lookup_form_and_pos.get((form, None)) \
            or self._stemmer.stem(form)
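# Usage sketch (assumed): constructing the lemmatizer requires the STO resource
# files "da-sto.map" and "STOposUTF8.txt" to be present in the `__location__`
# directory; without them the constructor fails.
sto = StoLemmatizer()
print(sto.lemmatize("hundene", pos="NOUN"))  # falls back to stemming on a miss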
def __init__(self, filename=BZ2_XML_DUMP_FILENAME):
    """Prepare dump file for reading.

    Parameters
    ----------
    filename : str
        Filename of the XML dump file.

    """
    self.logger = logging.getLogger(__name__)
    self.logger.addHandler(logging.NullHandler())

    full_filename = self.full_filename(filename)
    self.filename = full_filename

    self.sentence_tokenizer = nltk.data.load(
        'tokenizers/punkt/danish.pickle')
    self.whitespaces_pattern = re.compile(
        r'\s+', flags=re.DOTALL | re.UNICODE)
    self.word_tokenizer = WordPunctTokenizer()
    self.stemmer = DanishStemmer()
    self.word_pattern = re.compile(
        r"""{{.+?}}|
        <!--.+?-->|
        \[\[Fil.+?\]\]|
        \[\[Kategori:.+?\]\]|
        \[http.+?\]|(\w+(?:-\w+)*)""",
        flags=re.UNICODE | re.VERBOSE | re.DOTALL)
    self.paragraph_split_pattern = re.compile(
        r'\n\s*\n', flags=re.DOTALL | re.UNICODE)
    self.ignored_words_pattern = re.compile(
        r"""
        (?:(?:thumb|thumbnail|left|right|\d+px|upright(?:=[0-9\.]+)?)\|)+
        |^\s*\|.+$
        |^REDIRECT\b""",
        flags=re.DOTALL | re.UNICODE | re.VERBOSE | re.MULTILINE)
    self.itemized_split_pattern = re.compile(
        r"^ |^Kategori:",
        flags=re.DOTALL | re.UNICODE | re.MULTILINE)
def __init__(self, lib_path=THIRD_PATY_PATH):
    """Constructor. Initialize class attributes."""
    self.word2idx = {}
    self.idx2word = {}
    self.words = []
    self.raw_data = []
    self.seq_data = None
    self.count = None
    self.word_ctx = defaultdict(set)
    self.pw_data = []
    self.docs = defaultdict(dict)
    self.lines = defaultdict(list)
    self.re_rules = []
    self.lib_path = lib_path
    self.ner_tagger = self.set_ner_tagger(NER_MODEL, NER_JAR)
    self.pos_tagger = self.set_pos_tagger(POS_MODEL, POS_JAR)
    self.pt_stemmer = PorterStemmer()
    self.dan_stemmer = DanishStemmer()
    self.lemma = WordNetLemmatizer()
    self.tfidf = TfidfVectorizer(tokenizer=self.tokenize,
                                 stop_words='english')
class Gutenberg(Corpus):
    """Gutenberg.

    Interface to Gutenberg. The data will be mirrored/downloaded to a
    directory like:

        ~/dasem_data/gutenberg/aleph.gutenberg.org

    In regard to the encoding of the Project Gutenberg texts: for instance,
    10218 is encoded in "ISO Latin-1". This is stated with the line
    "Character set encoding: ISO Latin-1" in the header of the data file.

    Attributes
    ----------
    data_directory : str
        Top directory where the texts are mirrored.
    logger : logging.Logger
        Logging object.
    stemmer : object with stem method
        Object with stem method corresponding to
        nltk.stem.snowball.DanishStemmer.
    sentence_tokenizer : object with tokenize method
        Object with tokenize method for tokenizing a text into sentences.
    whitespaces_pattern : regex pattern
        Regular expression pattern.
    word_tokenizer : object with tokenize method
        Object with tokenize method, corresponding to
        nltk.WordPunctTokenizer.

    """

    def __init__(self):
        """Set up data directory and other attributes."""
        self.logger = logging.getLogger('dasem.gutenberg.Gutenberg')
        self.logger.addHandler(logging.NullHandler())

        self.data_directory = join(data_directory(), 'gutenberg',
                                   'aleph.gutenberg.org')
        self.sentence_tokenizer = nltk.data.load(
            'tokenizers/punkt/danish.pickle')
        self.whitespaces_pattern = re.compile(r'\s+',
                                              flags=re.DOTALL | re.UNICODE)
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()

    def data_directory(self):
        """Return directory where the data should be.

        Returns
        -------
        dir : str
            Directory.

        """
        dir = join(data_directory(), 'gutenberg')
        return dir

    def download(self, redownload=False):
        r"""Download corpus from the Gutenberg homepage.

        This method uses the external 'wget' program, which is the only
        download method Project Gutenberg allows. This is explained on their
        homepage. The command is:

            wget -w 2 -m -H \
              "http://www.gutenberg.org/robot/harvest?filetypes[]=txt&langs[]=da"

        This method will spawn a subprocess. The 'wget' program needs to be
        installed.

        Parameters
        ----------
        redownload : bool, optional
            If True, attempt to download anew. Otherwise, the method tests
            whether a specific file exists in the local data directory. If
            the file exists, then no files are fetched from Gutenberg.

        References
        ----------
        https://www.gutenberg.org/wiki/Gutenberg%3aInformation_About_Robot_Access_to_our_Pages

        """
        self.make_data_directory()

        test_filename = join(self.data_directory,
                             '1', '0', '2', '1', '10218', '10218-8.zip')
        if not redownload and isfile(test_filename):
            message = 'Not downloading as the file {} exists'
            self.logger.debug(message.format(test_filename))
            return

        directory = split(self.data_directory)[0]
        self.logger.info(
            'Downloading Danish Gutenberg corpus to {}'.format(directory))
        call(['wget', '-w', '2', '-m', '-H', DOWNLOAD_URL], cwd=directory)
        self.logger.debug('Gutenberg corpus downloaded')

    def make_data_directory(self):
        """Make data directory for Gutenberg."""
        make_data_directory(data_directory(), 'gutenberg')

    def translate_aa(self, text):
        """Translate double-a to 'bolle-aa'.

        Parameters
        ----------
        text : str
            Input text to be translated.

        Returns
        -------
        translated_text : str
            Text with double-a translated to bolle-aa.

        """
        return text.replace('aa', u('\xe5')).replace('Aa', u('\xc5')).replace(
            'AA', u('\xc5'))

    def translate_whitespaces(self, text):
        r"""Translate multiple whitespaces to a single space.

        Parameters
        ----------
        text : str
            Input string to be translated.

        Returns
        -------
        translated_text : str
            String with multiple whitespaces translated to a single
            whitespace.

        Examples
        --------
        >>> gutenberg = Gutenberg()
        >>> gutenberg.translate_whitespaces('\n Hello \n World \n')
        ' Hello World '

        """
        translated_text = self.whitespaces_pattern.sub(' ', text)
        return translated_text

    def get_all_ids(self):
        """Get all Gutenberg text ids from mirrored data.

        Returns
        -------
        ids : list of str
            List of Gutenberg ebook identifiers.

        Examples
        --------
        >>> gutenberg = Gutenberg()
        >>> '38080' in gutenberg.get_all_ids()
        True

        """
        ids = []
        for root, dirs, files in walk(self.data_directory):
            for file in files:
                if file.endswith('-8.zip'):
                    ids.append(file[:-6])
        return ids

    def get_text_by_id(self, id, extract_body=True):
        """Get text from mirrored Gutenberg archive.

        This function requires that the texts have been mirrored.

        Parameters
        ----------
        id : str or integer
            Gutenberg ebook identifier.
        extract_body : bool, default True
            Extract the body of the downloaded/mirrored Gutenberg raw text.

        Returns
        -------
        text : str
            Extracted text. The text is converted to Unicode.

        """
        # Example on subdirectory structure:
        # www.gutenberg.lib.md.us/4/4/9/6/44967
        s = str(id)
        l = list(s)
        if len(l) > 4:
            directory = join(self.data_directory, l[0], l[1], l[2], l[3], s)
        else:
            # For instance, id=9264 has only four-level subdirectories.
            # This might be because it is only 4 characters long.
            directory = join(self.data_directory, l[0], l[1], l[2], s)

        zip_filename = join(directory, s + '-8.zip')
        self.logger.debug('Reading text from {}'.format(zip_filename))
        with ZipFile(zip_filename) as zip_file:
            filename = join(s, s + '-8.txt')
            try:
                with zip_file.open(filename) as f:
                    encoded_text = f.read()
            except KeyError:
                # There might be zip files where the data file is in the root
                filename = s + '-8.txt'
                with zip_file.open(filename) as f:
                    encoded_text = f.read()

        if encoded_text.find(b('Character set encoding: ISO-8859-1')) != -1:
            text = encoded_text.decode('ISO-8859-1')
        elif encoded_text.find(b('Character set encoding: ISO Latin-1')) != -1:
            text = encoded_text.decode('Latin-1')
        else:
            raise LookupError('Unknown encoding for file {}'.format(filename))

        if extract_body:
            extracted_text = extract_text(text)
            return extracted_text
        else:
            return text

    def iter_sentence_words(self, translate_aa=True,
                            translate_whitespaces=True, lower=True,
                            stem=False):
        """Yield list of words from sentences.

        Parameters
        ----------
        translate_aa : bool, default True
            Translate double-a to 'bolle-aa'.
        translate_whitespaces : bool, default True
            Translate multiple whitespaces to single whitespaces.
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Yields
        ------
        words : list of str
            List of words

        """
        for sentence in self.iter_sentences(
                translate_aa=translate_aa,
                translate_whitespaces=translate_whitespaces):
            words = self.word_tokenizer.tokenize(sentence)
            if lower:
                words = [word.lower() for word in words]
            if stem:
                words = [self.stemmer.stem(word) for word in words]
            yield words

    def iter_sentences(self, translate_aa=True, translate_whitespaces=True):
        """Yield sentences.

        The method uses the NLTK Danish sentence tokenizer.

        Parameters
        ----------
        translate_aa : bool, default True
            Translate double-aa to bolle-aa.
        translate_whitespaces : bool, default True
            Translate multiple whitespaces to a single space.

        Yields
        ------
        sentence : str
            String with sentences.

        Examples
        --------
        >>> gutenberg = Gutenberg()
        >>> found = False
        >>> for sentence in gutenberg.iter_sentences():
        ...     if 'Indholdsfortegnelse.' == sentence:
        ...         found = True
        ...         break
        >>> found
        True

        """
        for text in self.iter_texts(translate_aa=translate_aa):
            sentences = self.sentence_tokenizer.tokenize(text)
            for sentence in sentences:
                if translate_whitespaces:
                    sentence = self.translate_whitespaces(sentence)
                yield sentence

    def iter_texts(self, translate_aa=True):
        """Yield texts.

        Parameters
        ----------
        translate_aa : bool, default True
            Translate double-aa to bolle-aa.

        Yields
        ------
        text : str
            Text.

        """
        for id in self.get_all_ids():
            text = self.get_text_by_id(id)
            if translate_aa:
                yield self.translate_aa(text)
            else:
                yield text
class Europarl(Corpus):
    """Europarl corpus.

    Examples
    --------
    >>> europarl = Europarl()
    >>> sentence = next(europarl.iter_tokenized_sentences())
    >>> "sessionen" in sentence.split()
    True

    """

    def __init__(self, danish_filename=DANISH_FILENAME,
                 tar_gz_filename=TGZ_PARALLEL_CORPUS_FILENAME):
        """Set up filename.

        Parameters
        ----------
        danish_filename : str
            Filename for '.da' file in the tar.gz file.
        tar_gz_filename : str
            Filename for tar.gz or tgz file with Danish/English.

        """
        self.logger = logging.getLogger(__name__ + '.Europarl')
        self.logger.addHandler(logging.NullHandler())

        self.tar_gz_filename = tar_gz_filename
        self.danish_filename = danish_filename
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()

    def data_directory(self):
        """Return directory where the data should be.

        Returns
        -------
        directory : str
            Directory.

        """
        directory = join(data_directory(), 'europarl')
        return directory

    def download(self, redownload=False):
        """Download corpus."""
        filename = TGZ_PARALLEL_CORPUS_FILENAME
        local_filename = join(self.data_directory(), filename)
        if not redownload and isfile(local_filename):
            message = 'Not downloading as corpus is already downloaded to {}'
            self.logger.debug(message.format(local_filename))
            return

        self.make_data_directory()
        url = TGZ_PARALLEL_CORPUS_URL
        self.logger.info('Downloading {} to {}'.format(url, local_filename))
        response = requests.get(url, stream=True)
        with open(local_filename, 'wb') as fid:
            copyfileobj(response.raw, fid)
        self.logger.debug('Corpus downloaded')

    def iter_sentences(self):
        """Yield sentences.

        Yields
        ------
        sentence : str
            Sentences as Unicode strings.

        """
        full_tar_gz_filename = join(self.data_directory(),
                                    self.tar_gz_filename)
        with tarfile.open(full_tar_gz_filename, "r:gz") as tar:
            fid = tar.extractfile(self.danish_filename)
            for line in fid:
                yield line.decode('utf-8').strip()

    def iter_sentence_words(self, lower=True, stem=False):
        """Yield list of words from sentences.

        Parameters
        ----------
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Yields
        ------
        words : list of str
            List of words

        """
        for sentence in self.iter_sentences():
            words = self.word_tokenizer.tokenize(sentence)
            if lower:
                words = [word.lower() for word in words]
            if stem:
                words = [self.stemmer.stem(word) for word in words]
            yield words

    def make_data_directory(self):
        """Make data directory for Europarl."""
        make_data_directory(self.data_directory())
def __init__(self):
    self._read_sto_mapping()
    self._read_sto_words()
    self._stemmer = DanishStemmer()
def snowball(words):
    s = DanishStemmer()
    return [s.stem(w) for w in words]
def stem_lem(words, documents, stem_or_lem: bool = False):
    """
    Update a word list and a corpus to use stemmed or lemmatized words.

    :param stem_or_lem: bool indicating whether to apply the stemmer or the
        lemmatizer. True is stem, False is lem.
    :param documents: a list of tokenized documents (lists of words).
    :param words: a list of words.
    :return: a gensim dictionary restricted to the updated word list, and the
        updated documents, where all words have been replaced by their
        stemmed/lemmatized versions.
    """
    stop_words = stopwords.words('danish')
    stop_words.extend(
        list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    if stem_or_lem:
        # Stemming
        stemmer = DanishStemmer()
        # Update word list to use stemmed words
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            stem = stemmer.stem(word)
            if stem != word:
                if word not in remove:
                    remove.append(word)
                if stem not in add and stem not in stop_words:
                    add.append(stem)
                if word not in translator and stem not in stop_words:
                    translator[word] = stem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])
    else:
        lemmer = lemmy.load("da")
        # Build up a dictionary that translates old words into their new versions
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            lem = lemmer.lemmatize("", word)
            other = [x for x in lem if x != word]
            if len(other) > 0:
                if word not in lem and word not in remove:
                    remove.append(word)
                # Add all lemma options if they are not stopwords
                add.extend(
                    [x for x in lem if x not in stop_words and x not in add])
                if word not in translator and lem not in stop_words:
                    lem = " ".join(lem)
                    translator[word] = lem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])

    # Update the corpus to use the stemmed/lemmatized words
    for x in tqdm(range(len(documents))):
        sentence = documents[x]
        for i in range(len(sentence)):
            word = sentence[i]
            if word in translator:
                sentence[i] = translator[word]
        sentence = ' '.join(sentence)
        sentence = sentence.split(' ')
        documents[x] = sentence

    diction = gensim.corpora.Dictionary(documents)
    d_words = diction.token2id
    good_ids = [d_words[x] for x in words]
    diction.filter_tokens(good_ids=good_ids)
    diction.compactify()
    return diction, documents
def snowball_single(w):
    s = DanishStemmer()
    return s.stem(w)
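# Usage sketch for the two small helpers above (assumed, not in the original):
# both simply wrap nltk.stem.snowball.DanishStemmer.
print(snowball(["hundene", "husene"]))
print(snowball_single("hundene"))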
class Dannet(Corpus, DataDirectoryMixin):
    """Dannet.

    Using the module will automagically download the data from the DanNet
    homepage (http://www.wordnet.dk).

    Attributes
    ----------
    db : db.DB
        Database access through the db.py interface.

    Examples
    --------
    >>> dannet = Dannet()
    >>> dannet.db.tables.words
    +---------------------------------------------------+
    |                       words                       |
    +---------+---------+--------------+----------------+
    | Column  | Type    | Foreign Keys | Reference Keys |
    +---------+---------+--------------+----------------+
    | index   | INTEGER |              |                |
    | word_id | TEXT    |              |                |
    | form    | TEXT    |              |                |
    | pos     | TEXT    |              |                |
    +---------+---------+--------------+----------------+

    >>> # From README
    >>> query = '''
    ... SELECT w.form, ws.register, s.synset_id, s.gloss, s.ontological_type
    ... FROM synsets s, wordsenses ws, words w
    ... WHERE s.synset_id = ws.synset_id
    ...   AND ws.word_id = w.word_id
    ...   AND w.form = 'spand';'''
    >>> 'bil' in dannet.db.query(query).gloss[0]
    True

    >>> # Danish nouns
    >>> dannet = Dannet()
    >>> query = "select w.form from words w where w.pos = 'Noun'"
    >>> nouns = set(dannet.db.query(query).form)
    >>> 'guitar' in nouns
    True
    >>> 'guitaren' in nouns
    False
    >>> len(nouns)
    48404

    References
    ----------
    - http://www.wordnet.dk

    """

    def __init__(self):
        """Initialize logger and database."""
        self.logger = logging.getLogger(__name__ + '.Dannet')
        self.logger.addHandler(logging.NullHandler())

        self.logger.debug('Initializing tokenizer and stemmer')
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()

        self._db = None

    @property
    def db(self):
        """Return a db.py instance with DanNet data."""
        if self._db is not None:
            return self._db

        full_filename = self.full_filename(DANNET_SQLITE_FILENAME)
        self.logger.info(
            'Trying to read database file {}'.format(full_filename))
        try:
            self._db = DB(filename=full_filename, dbtype='sqlite')
            if not hasattr(self._db.tables, 'words'):
                self.logger.debug('Database is empty')
                # There is no content in the database
                raise Exception('Not initialized')
        except:
            self.build_sqlite_database()
            self._db = DB(filename=full_filename, dbtype='sqlite')
        return self._db

    def download(self, filename=DANNET_FILENAME, redownload=False):
        """Download data."""
        local_filename = join(self.data_directory(), filename)
        if not redownload and isfile(local_filename):
            message = 'Not downloading as corpus is already downloaded to {}'
            self.logger.debug(message.format(local_filename))
            return

        self.make_data_directory()
        url = BASE_URL + filename
        self.logger.info('Downloading from URL {} to {}'.format(
            url, local_filename))
        response = requests.get(url, stream=True)
        with open(local_filename, 'wb') as fid:
            copyfileobj(response.raw, fid)
        self.logger.debug('Corpus downloaded')

    def full_filename(self, filename=DANNET_FILENAME):
        """Prepend data directory path to filename.

        Parameters
        ----------
        filename : str
            Filename of local Dannet file.

        Returns
        -------
        full_filename : str
            Filename with full directory path information.

        """
        if sep in filename:
            return filename
        else:
            return join(data_directory(), 'dannet', filename)

    def glossary(self, word):
        """Return glossary for word.

        Parameters
        ----------
        word : str
            Query word.

        Returns
        -------
        glossary : list of str
            List of distinct strings from the `gloss` field of synsets whose
            form matches the query word.

        Examples
        --------
        >>> dannet = Dannet()
        >>> len(dannet.glossary('virksomhed')) == 3
        True

        """
        query_template = u("""
            SELECT DISTINCT s.gloss
            FROM synsets s, wordsenses ws, words w
            WHERE s.synset_id = ws.synset_id
              AND ws.word_id = w.word_id
              AND w.form = '{word}';""")
        query = query_template.format(
            word=word.replace('\\', '\\\\').replace("'", "\\'"))
        self.logger.debug(
            u('Querying with {}').format(query.replace('\n', ' ')))
        glossary = list(self.db.query(query).gloss)
        return glossary

    def iter_sentences(self):
        """Iterate over sentences in the synsets examples.

        The synset definitions have examples of word usages. There might be
        several examples for some synsets. This function iterates over all
        the sentences.

        Yields
        ------
        sentence : str
            Sentence.

        """
        use_pattern = re.compile(r'\(Brug: (".+?")\)', flags=re.UNICODE)
        quote_pattern = re.compile(r'"(.+?)"(?:; "(.+?)")*', flags=re.UNICODE)
        synsets = self.read_synsets()

        self.logger.debug('Iterating over sentences')
        for gloss in synsets.gloss:
            use_matches = use_pattern.findall(gloss)
            if use_matches:
                quote_matches = quote_pattern.findall(use_matches[0])
                for parts in quote_matches[0]:
                    sentences = parts.split(' || ')
                    for sentence in sentences:
                        if sentence:
                            yield sentence.replace('[', '').replace(']', '')

    def iter_sentence_words(self, lower=True, stem=False):
        """Yield list of words from sentences.

        Parameters
        ----------
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Yields
        ------
        words : list of str
            List of words

        """
        for sentence in self.iter_sentences():
            words = self.word_tokenizer.tokenize(sentence)
            if lower:
                words = [word.lower() for word in words]
            if stem:
                words = [self.stemmer.stem(word) for word in words]
            yield words

    def read_zipped_csv_file(self, filename, zip_filename=DANNET_FILENAME):
        """Read a zipped csv DanNet file.

        The csv file is read with the 'latin_1' encoding.

        Parameters
        ----------
        filename : str
            Filename of the file within the zip file.
        zip_filename : str
            Filename of the zip file. This is expanded as it expects the data
            to be in the data directory.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with the data from the csv file.

        """
        full_zip_filename = self.full_filename(zip_filename)

        if not isfile(full_zip_filename):
            self.logger.info('File {} not downloaded'.format(zip_filename))
            self.download()

        full_filename = join(splitext(zip_filename)[0], filename)
        self.logger.info('Reading from {}'.format(full_zip_filename))
        zip_file = ZipFile(full_zip_filename)
        try:
            df = read_csv(zip_file.open(full_filename),
                          sep='@', encoding='latin_1', header=None)
        except CParserError:
            self.logger.debug('Reading of csv with Pandas failed')
            # Bad csv file with unquoted "@" in line 19458 and 45686
            # in synsets.csv
            with zip_file.open(full_filename) as fid:
                # Major problem with getting Python 2/3 compatibility
                if version_info[0] == 2:
                    csv_file = csv.reader(fid, delimiter='@')
                    rows = []
                    for row in csv_file:
                        if len(row) == 6:
                            row = [row[0], row[1], row[2] + '@' + row[3],
                                   row[4], row[5]]
                        row = [elem.decode('latin_1') for elem in row]
                        rows.append(row)
                else:
                    # Encoding problem handled with
                    # https://stackoverflow.com/questions/36971345
                    lines = (line.decode('latin_1') for line in fid)
                    csv_file = csv.reader(lines, delimiter='@')
                    rows = []
                    for row in csv_file:
                        if len(row) == 6:
                            row = [row[0], row[1], row[2] + '@' + row[3],
                                   row[4], row[5]]
                        rows.append(row)
            df = DataFrame(rows)

        # Drop the last column which always seems to be superfluous
        df = df.iloc[:, :-1]

        self.logger.debug('Read {}x{} data from csv'.format(*df.shape))
        return df

    def make_data_directory(self):
        """Make data directory for Dannet."""
        make_data_directory(self.data_directory())

    def read_relations(self, zip_filename=DANNET_FILENAME):
        """Read relations CSV file.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with columns synset_id, name, name2, value, taxonomic,
            inheritance_comment.

        """
        df = self.read_zipped_csv_file('relations.csv',
                                       zip_filename=zip_filename)
        df.columns = ['synset_id', 'name', 'name2', 'value', 'taxonomic',
                      'inheritance_comment']
        return df

    def read_synset_attributes(self, zip_filename=DANNET_FILENAME):
        """Read synset attributes CSV file.

        Parameters
        ----------
        zip_filename : str
            Filename for the zip file with the CSV file.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with columns synset_id, type and value.

        """
        df = self.read_zipped_csv_file('synset_attributes.csv',
                                       zip_filename=zip_filename)
        df.columns = ['synset_id', 'type', 'value']
        return df

    def read_synsets(self, zip_filename=DANNET_FILENAME):
        """Read synsets CSV file.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with columns synset_id, label, gloss, ontological_type.

        Examples
        --------
        >>> dannet = Dannet()
        >>> df = dannet.read_synsets()
        >>> 'label' in df.columns
        True

        """
        df = self.read_zipped_csv_file('synsets.csv',
                                       zip_filename=zip_filename)
        df.columns = ['synset_id', 'label', 'gloss', 'ontological_type']
        return df

    def read_words(self, zip_filename=DANNET_FILENAME):
        """Read words from CSV file.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with word_id, form and pos columns.

        """
        df = self.read_zipped_csv_file('words.csv', zip_filename=zip_filename)
        df.columns = ['word_id', 'form', 'pos']
        return df

    def read_wordsenses(self, zip_filename=DANNET_FILENAME):
        """Read wordsenses data file.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with the columns wordsense_id, word_id, synset_id and
            register.

        """
        df = self.read_zipped_csv_file('wordsenses.csv',
                                       zip_filename=zip_filename)
        df.columns = ['wordsense_id', 'word_id', 'synset_id', 'register']
        return df

    def build_sqlite_database(self, filename=DANNET_SQLITE_FILENAME,
                              zip_filename=DANNET_FILENAME,
                              if_exists='replace'):
        """Build SQLite database with DanNet data.

        This function will read the comma-separated values files and add the
        information to a SQLite database stored in the data directory under
        dannet.

        Execution of this function will typically take a couple of seconds.

        Parameters
        ----------
        filename : str, optional
            Filename of the SQLite file.
        zip_filename : str, optional
            Filename of CSV file.
        if_exists : str, optional
            Determines whether the database tables should be overwritten
            (replace) [default: replace]

        """
        tables = [('relations', self.read_relations),
                  ('synset_attributes', self.read_synset_attributes),
                  ('synsets', self.read_synsets),
                  ('words', self.read_words),
                  ('wordsenses', self.read_wordsenses)]

        full_filename = self.full_filename(filename)
        self.logger.info('Building "{full_filename}" sqlite file'.format(
            full_filename=full_filename))

        with sqlite3.connect(full_filename) as connection:
            for table, method in tables:
                df = method(zip_filename=zip_filename)
                self.logger.info(
                    'Writing "{table}" table'.format(table=table))
                df.to_sql(table, con=connection, if_exists=if_exists)
from flask import Flask, request, render_template
from sklearn.externals import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import DanishStemmer

nltk.download('stopwords')
stopwords = stopwords.words('danish')
stemmer = DanishStemmer()


def text_process(name):
    """Text processing that lower-cases, removes stopwords and finds word stems."""
    lst = name.lower().split(' ')
    stop = [word for word in lst if word not in stopwords]
    stem = [stemmer.stem(word) for word in stop]
    return stem


pipeline = joblib.load('model/predict_business.pkl')


def predict_business(name):
    return pipeline.predict([name])[0]


app = Flask(__name__)
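# Hypothetical route sketch (not in the original snippet): one way
# `predict_business` might be exposed; the real application's routes lie
# outside the excerpt above, and `text_process` is presumably referenced by
# the pickled pipeline itself.
@app.route('/predict', methods=['POST'])
def predict():
    name = request.form.get('name', '')
    return {'business': predict_business(name)}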
""" NER for sermons in content.dat """ import os import pandas as pd import numpy as np from polyglot.text import Text import nltk.data from nltk.stem.snowball import DanishStemmer stemmer = DanishStemmer() if __name__ == "__main__": """ First processing to create NER outputs """ df = pd.read_csv(os.path.join("data", "content", "content.dat", encoding='utf-8', header = 0, index_col = None)) content = df["content"].tolist() fnames = df["id"].tolist() tokenizer = nltk.data.load(os.path.join("tokenizers", "punkt", "norwegian.pickle")) entity_list = [] i = 0 for i, text in enumerate(content): #for i, text in enumerate(content[:4]): print(f"file {i}") # sentence disambiguation sents = tokenizer.tokenize(text)