def word_normalize(s: str, method: str = "l") -> str:
    """
    Splits a string and lemmatizes or stems every word; acronyms (all-uppercase words) are kept unchanged.
    """
    if method not in ["s", "l"]:
        raise ValueError("Method must be either 's' or 'l' for either"
                         "stemming or lemmatizing")

    # TODO: change this to match language
    lemmatizer = lemmy.load("da")
    stemmer = DanishStemmer()

    words = s.split(" ")

    norm_words = []
    for w in words:
        if w.isupper():
            norm_words.append(w)
        else:
            if method == "l":
                w = lemmatizer.lemmatize("", w)
                norm_words.extend(w)
            else:
                w = stemmer.stem(w)
                norm_words.append(w)

    return " ".join(norm_words)
Example 2
def tokenize_and_stem(text):
    # Check whether the text is English or Danish
    filtered_text = re.sub(r'[^\w\-\'/]', ' ', text)
    lang = detect(filtered_text)
    if lang in valid_languages:
        tokens = filtered_text.lower().split(" ")
        processed = []

        stopwords = []
        if lang == "da":
            snowball = DanishStemmer()
            stopwords = dk_stop
        if lang == "en":
            snowball = EnglishStemmer()
            stopwords = eng_stop

        for token in tokens:
            if token in stopwords:
                continue
            elif "\n" in token:
                continue
            elif "\\n" in token:
                continue
            elif token == "":
                continue
            elif token.isdigit():
                continue
            else:
                processed.append(token)
        stemmed = []
        for token in processed:
            stemmed.append(snowball.stem(token))
        return stemmed
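
A usage sketch for tokenize_and_stem, assuming the surrounding module defines the globals it relies on (valid_languages, dk_stop, eng_stop) and the imports below; these definitions are assumptions, not shown in the snippet:

# Hypothetical setup; requires nltk.download('stopwords') and the langdetect package.
import re
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem.snowball import DanishStemmer, EnglishStemmer

valid_languages = {"da", "en"}
dk_stop = set(stopwords.words("danish"))
eng_stop = set(stopwords.words("english"))

print(tokenize_and_stem("Hunden løber hurtigt gennem parken"))  # stemmed tokens (if detected as Danish)
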
Example 3
    def __init__(self):
        """Set up tokenizer."""
        self.logger = logging.getLogger(__name__ + '.Corpus')
        self.logger.addHandler(logging.NullHandler())

        self.logger.debug('Setup word tokenizer')
        self.word_tokenizer = WordPunctTokenizer()

        self.logger.debug('Setup stemmer')
        self.stemmer = DanishStemmer()
Example 4
    def __init__(self):
        """Initialize logger and and database."""
        self.logger = logging.getLogger(__name__ + '.Dannet')
        self.logger.addHandler(logging.NullHandler())

        self.logger.debug('Initializing tokenizer and stemmer')
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()

        self._db = None
Example 5
    def __init__(self):
        """Set up data directory and other attributes."""
        self.logger = logging.getLogger('dasem.gutenberg.Gutenberg')
        self.logger.addHandler(logging.NullHandler())

        self.data_directory = join(data_directory(), 'gutenberg',
                                   'aleph.gutenberg.org')
        self.sentence_tokenizer = nltk.data.load(
            'tokenizers/punkt/danish.pickle')
        self.whitespaces_pattern = re.compile(r'\s+',
                                              flags=re.DOTALL | re.UNICODE)
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()
Example 6
def token(X,
          words_only=False,
          word_normalize=True,
          emoji_normalize=True,
          remove_digits=True,
          lower_case=True,
          stop_words=None):
    '''
    Tokenize a string. Stemming (nltk's DanishStemmer) is applied when
    word_normalize is True.
    '''

    # eyes [nose] mouth | mouth [nose] eyes pattern
    emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
    emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)

    # Keep words only (digits count as word characters); otherwise append the emoticons
    if words_only:
        clean_text = re.sub(r'[\W]+', ' ', X)
    else:
        clean_text = '{}{}'.format(re.sub(r'[\W]+', ' ', X),
                                   ''.join(re.findall(emoticon_re, X)))

    # Normalize emoticons: unify ';' eyes to ':' and strip '-' noses
    if emoji_normalize:
        clean_text = (re.sub(r'[\W]+', ' ', X) + ' '.join(
            re.findall(emoticon_re, X)).replace(';', ':').replace('-', ''))

    if remove_digits:
        clean_text = clean_text.translate(str.maketrans('', '', '0123456789'))

    if lower_case:
        clean_text = clean_text.lower()

    if word_normalize:
        stemmer = DanishStemmer()

        clean_text = ' '.join(
            stemmer.stem(word) for word in clean_text.split())

    if stop_words:

        return [word for word in clean_text.split() if word not in stop_words]
    else:
        return clean_text.split()
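
A usage sketch for token() (hypothetical; re and DanishStemmer are assumed to be imported at module level, and the exact tokens depend on the regexes above):

# Hypothetical call: digits are stripped, emoticons normalized, words stemmed,
# and the stop word "jeg" is filtered out.
tokens = token("Jeg elsker kage :-) 123", stop_words={"jeg"})
print(tokens)
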
Example 7
class Corpus(with_metaclass(ABCMeta)):
    """Abstract class for corpus."""
    def __init__(self):
        """Set up tokenizer."""
        self.logger = logging.getLogger(__name__ + '.Corpus')
        self.logger.addHandler(logging.NullHandler())

        self.logger.debug('Setup word tokenizer')
        self.word_tokenizer = WordPunctTokenizer()

        self.logger.debug('Setup stemmer')
        self.stemmer = DanishStemmer()

    def iter_sentence_words(self, lower=True, stem=False):
        """Yield list of words from sentences.

        Parameters
        ----------
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Yields
        ------
        words : list of str
            List of words

        """
        for sentence in self.iter_sentences():
            words = self.word_tokenizer.tokenize(sentence)
            if lower:
                words = [word.lower() for word in words]
            if stem:
                words = [self.stemmer.stem(word) for word in words]

            yield words

    def iter_tokenized_sentences(self, lower=True, stem=False):
        """Yield string with tokenized sentences.

        Parameters
        ----------
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Yields
        ------
        tokenized_sentence : str
            Sentence as string with tokens separated by a whitespace.

        """
        for words in self.iter_sentence_words(lower=lower, stem=stem):
            tokenized_sentence = u(" ").join(words)
            yield tokenized_sentence
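
Because Corpus is abstract and its iterators build on an iter_sentences method supplied by subclasses, a minimal concrete corpus could look like the sketch below (ListCorpus is hypothetical and not part of the original code base):

class ListCorpus(Corpus):
    """Corpus backed by a plain in-memory list of sentences."""

    def __init__(self, sentences):
        super(ListCorpus, self).__init__()
        self._sentences = sentences

    def iter_sentences(self):
        # Supply the sentences that the inherited iterators consume.
        for sentence in self._sentences:
            yield sentence

corpus = ListCorpus([u"Hunden løber.", u"Kattene sover."])
for tokenized in corpus.iter_tokenized_sentences(stem=True):
    print(tokenized)
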
Example 8
    def __init__(self,
                 danish_filename=DANISH_FILENAME,
                 tar_gz_filename=TGZ_PARALLEL_CORPUS_FILENAME):
        """Set up filename.

        Parameters
        ----------
        danish_filename : str
            Filename for '.da' file in the tar.gz file.
        tar_gz_filename : str
            Filename for tar.gz or tgz file with Danish/English.

        """
        self.logger = logging.getLogger(__name__ + '.Europarl')
        self.logger.addHandler(logging.NullHandler())

        self.tar_gz_filename = tar_gz_filename
        self.danish_filename = danish_filename

        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()
Example 9
class StoLemmatizer(object):

    def __init__(self):
        self._read_sto_mapping()
        self._read_sto_words()
        self._stemmer = DanishStemmer()


    def _read_sto_mapping(self):
        self.sto_to_uni = {}
        with codecs.open(os.path.join(__location__, "da-sto.map"), encoding='utf-8') as f:
            for line in f:
                sto, uni = line.strip().split("\t")
                self.sto_to_uni[sto] = uni

    def _read_sto_words(self):
        self.lookup_form_and_pos = {}
        self.lookup_form = {}

        with codecs.open(os.path.join(__location__, "STOposUTF8.txt"), encoding='utf-8') as f:
            for line in f:
                form, lemma, pos = line.strip().split("\t")
                self.lookup_form_and_pos[(form.lower(), self.sto_to_uni[pos])] = lemma.lower()
                self.lookup_form_and_pos[(form.lower(), None)] = lemma.lower()

                self.lookup_form[form.lower()] = lemma.lower()

    def lemmatize(self, form, pos=None):
        """
        Look up a word form with an optional part of speech (17-tag Universal Dependencies tagset).

        The method implements a fall-back strategy. When a match with the correct
        part of speech cannot be found, it tries to match the word form with any
        part of speech. If this also fails, the word is stemmed (using the
        Snowball stemmer) instead of lemmatized.

        :param form:
        :param pos:
        :return:
        """

        form = form.lower()
        if pos in ('NUM', 'PUNCT', 'X', 'INTJ', 'SYM', 'PROPN'):
            return form

        if pos == "AUX":
            pos = "VERB"

        return self.lookup_form_and_pos.get((form, pos)) \
               or self.lookup_form_and_pos.get((form, None)) \
               or self._stemmer.stem(form)
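
A usage sketch of the fall-back behaviour (hypothetical output; the constructor needs the da-sto.map and STOposUTF8.txt files next to the module):

lemmatizer = StoLemmatizer()
print(lemmatizer.lemmatize("hundene", "NOUN"))  # lookup with POS when the form is in STO
print(lemmatizer.lemmatize("hundene"))          # lookup without POS
print(lemmatizer.lemmatize("xyzzyord"))         # unknown form: falls back to the Snowball stemmer
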
Example 10
def preprocess_text(text):
    # text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = str(text).lower().strip()

    # caveat: this might conflict with the english text
    da_stop_words = stopwords.words('danish')
    stemmer = DanishStemmer()
    lemmatizer = lemmy.load("da")

    # remove plurals
    textblob = TextBlob(text)
    singles = [stemmer.stem(word) for word in textblob.words]

    # remove danish stopwords
    no_stop_words = [word for word in singles if word not in da_stop_words]

    # join text so it can be lemmatized
    joined_text = " ".join(no_stop_words)

    # lemmatization
    final_text = lemmatizer.lemmatize("", joined_text)

    return final_text[0]
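
A usage sketch for preprocess_text (requires the NLTK stop word corpus, textblob, and lemmy; the result depends on the installed lemmy model, and note that only the first element of the lemmatizer output is returned):

# Hypothetical call on a short Danish sentence.
print(preprocess_text("Hundene løber hurtigt i parkerne"))
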
Example 11
    def convert_text_to_rouge_format(text, title="dummy title"):
        """
        Convert a text to a format ROUGE understands. The sentences in the
        text are assumed to be separated by "<q>".

            text:   The text to convert, with sentences separated by "<q>".
            title:  Optional title for the text. The title will appear
                    in the converted file, but doesn't seem to have
                    any other relevance.

        Returns: The converted text as string.

        """
        # sentences = text.split("\n")
        from nltk.stem.snowball import DanishStemmer
        stemmer = DanishStemmer()
        sentences = text.split("<q>")
        output = []
        for sentence in sentences:
            output.append(" ".join([stemmer.stem(i)
                                    for i in sentence.split()]))
        sent_elems = [
            "<a name=\"{i}\">[{i}]</a> <a href=\"#{i}\" id={i}>"
            "{text}</a>".format(i=i, text=sent)
            for i, sent in enumerate(output, start=1)
        ]
        html = """<html>
<head>
<title>{title}</title>
</head>
<body bgcolor="white">
{elems}
</body>
</html>""".format(title=title, elems="\n".join(sent_elems))

        return html
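
A usage sketch (hypothetical; the function above is defined as a method here, so it is assumed to be reachable in scope). Two sentences separated by "<q>" become one HTML document with a stemmed, numbered anchor per sentence:

html = convert_text_to_rouge_format("Hunden løber i parken<q>Katten sover", title="eksempel")
print(html)
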
Example 12
class StoLemmatizer(object):
    def __init__(self):
        self._read_sto_mapping()
        self._read_sto_words()
        self._stemmer = DanishStemmer()

    def _read_sto_mapping(self):
        self.sto_to_uni = {}
        with codecs.open(os.path.join(__location__, "da-sto.map"),
                         encoding='utf-8') as f:
            for line in f:
                sto, uni = line.strip().split("\t")
                self.sto_to_uni[sto] = uni

    def _read_sto_words(self):
        self.lookup_form_and_pos = {}
        self.lookup_form = {}

        with codecs.open(os.path.join(__location__, "STOposUTF8.txt"),
                         encoding='utf-8') as f:
            for line in f:
                form, lemma, pos = line.strip().split("\t")
                self.lookup_form_and_pos[(
                    form.lower(), self.sto_to_uni[pos])] = lemma.lower()
                self.lookup_form_and_pos[(form.lower(), None)] = lemma.lower()

                self.lookup_form[form.lower()] = lemma.lower()

    def lemmatize(self, form, pos=None):
        """
        Look-up word form with optional part-of-speech (universal tagset).

        The method implements a fall-back strategy. When a match with the correct
        part of speech cannot be found, it tries to match the word form with any
        part of speech. If this also fails, the word is stemmed (using the
        Snowball stemmer) instead of lemmatized.

        :param form:
        :param pos:
        :return:
        """

        form = form.lower()
        if pos in ('NUM', '.', 'X'):
            return form

        return self.lookup_form_and_pos.get((form, pos)) \
               or self.lookup_form_and_pos.get((form, None)) \
               or self._stemmer.stem(form)
Example 13
    def __init__(self, filename=BZ2_XML_DUMP_FILENAME):
        """Prepare dump file for reading.

        Parameters
        ----------
        filename : str
            Filename of the XML dump file.

        """
        self.logger = logging.getLogger(__name__)
        self.logger.addHandler(logging.NullHandler())

        full_filename = self.full_filename(filename)
        self.filename = full_filename

        self.sentence_tokenizer = nltk.data.load(
            'tokenizers/punkt/danish.pickle')
        self.whitespaces_pattern = re.compile(
            r'\s+', flags=re.DOTALL | re.UNICODE)
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()

        self.word_pattern = re.compile(
            r"""{{.+?}}|
            <!--.+?-->|
            \[\[Fil.+?\]\]|
            \[\[Kategori:.+?\]\]|
            \[http.+?\]|(\w+(?:-\w+)*)""",
            flags=re.UNICODE | re.VERBOSE | re.DOTALL)
        self.paragraph_split_pattern = re.compile(
            r'\n\s*\n', flags=re.DOTALL | re.UNICODE)
        self.ignored_words_pattern = re.compile(
            r"""
            (?:(?:thumb|thumbnail|left|right|\d+px|upright(?:=[0-9\.]+)?)\|)+
            |^\s*\|.+$
            |^REDIRECT\b""",
            flags=re.DOTALL | re.UNICODE | re.VERBOSE | re.MULTILINE)
        self.itemized_split_pattern = re.compile(
            r"^ |^Kategori:",
            flags=re.DOTALL | re.UNICODE | re.MULTILINE)
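
A sketch of how the compiled word_pattern might be used on an instance of the class above (called dump here, a hypothetical name): the markup alternatives match without capturing, so only the captured word group survives filtering.

wiki_markup = u"{{Infoboks}} Hunden [[Kategori:Dyr]] løber hurtigt"
words = [w for w in dump.word_pattern.findall(wiki_markup) if w]
# Roughly ['Hunden', 'løber', 'hurtigt']; the template and category markup yield empty captures.
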
Example 14
    def __init__(self, lib_path=THIRD_PATY_PATH):
        """Constructor. Initialize class attributes."""
        self.word2idx = {}
        self.idx2word = {}
        self.words = []
        self.raw_data = []
        self.seq_data = None
        self.count = None
        self.word_ctx = defaultdict(set)
        self.pw_data = []
        self.docs = defaultdict(dict)
        self.lines = defaultdict(list)
        self.re_rules = []
        self.lib_path = lib_path
        self.ner_tagger = self.set_ner_tagger(NER_MODEL, NER_JAR)
        self.pos_tagger = self.set_pos_tagger(POS_MODEL, POS_JAR)
        self.pt_stemmer = PorterStemmer()
        self.dan_stemmer = DanishStemmer()
        self.lemma = WordNetLemmatizer()
        self.tfidf = TfidfVectorizer(tokenizer=self.tokenize,
                                     stop_words='english')
Example 15
class Gutenberg(Corpus):
    """Gutenberg.

    Interface to Gutenberg.

    The data will be mirrored/downloaded to a directory like:

        ~/dasem_data/gutenberg/aleph.gutenberg.org

    In regard to encoding of the Project Gutenberg texts: For instance,
    10218 is encoded in "ISO Latin-1". This is stated with the line
    "Character set encoding: ISO Latin-1" in the header of the data file.

    Attributes
    ----------
    data_directory : str
        Top directory where the text are mirrored.
    logger : logging.Logger
        Logging object.
    stemmer : object with stem method
        Object with stem method corresponding to
        nltk.stem.snowball.DanishStemmer.
    sentence_tokenizer : object with tokenize method
        Object with tokenize method for tokenizing a text into sentences.
    whitespaces_pattern : regex pattern
        Regular expression pattern.
    word_tokenizer : object with tokenize method
        Object with tokenize method, corresponding to nltk.WordPunctTokenizer.

    """
    def __init__(self):
        """Set up data directory and other attributes."""
        self.logger = logging.getLogger('dasem.gutenberg.Gutenberg')
        self.logger.addHandler(logging.NullHandler())

        self.data_directory = join(data_directory(), 'gutenberg',
                                   'aleph.gutenberg.org')
        self.sentence_tokenizer = nltk.data.load(
            'tokenizers/punkt/danish.pickle')
        self.whitespaces_pattern = re.compile(r'\s+',
                                              flags=re.DOTALL | re.UNICODE)
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()

    def data_directory(self):
        """Return diretory where data should be.

        Returns
        -------
        dir : str
            Directory.

        """
        dir = join(data_directory(), 'gutenberg')
        return dir

    def download(self, redownload=False):
        r"""Download corpus from Gutenberg homepage.

        This method uses the external 'wget' program, which is the only
        download method Project Gutenberg allows, as explained on their
        homepage. The command is:

        wget -w 2 -m -H \
          "http://www.gutenberg.org/robot/harvest?filetypes[]=txt&langs[]=da"

        This method will spawn a subprocess. The 'wget' program needs to be
        installed.

        Parameters
        ----------
        redownload : bool, optional
            If True will attempt to download anew. Otherwise, the method
            tests whether a specific file exists on the local data directory.
            If the file exists, then no files are fetched from Gutenberg.

        References
        ----------
        https://www.gutenberg.org/wiki/Gutenberg%3aInformation_About_Robot_Access_to_our_Pages

        """
        self.make_data_directory()

        test_filename = join(self.data_directory, '1', '0', '2', '1', '10218',
                             '10218-8.zip')
        if not redownload and isfile(test_filename):
            message = 'Not downloading as the file {} exists'
            self.logger.debug(message.format(test_filename))
            return

        directory = split(self.data_directory)[0]
        self.logger.info(
            'Downloading Danish Gutenberg corpus to {}'.format(directory))
        call(['wget', '-w', '2', '-m', '-H', DOWNLOAD_URL], cwd=directory)
        self.logger.debug('Gutenberg corpus downloaded')

    def make_data_directory(self):
        """Make data directory for LCC."""
        make_data_directory(data_directory(), 'gutenberg')

    def translate_aa(self, text):
        """Translate double-a to 'bolle-aa'.

        Parameters
        ----------
        text : str
            Input text to be translated.

        Returns
        -------
        translated_text : str
            Text with double-a translated to bolle-aa.

        """
        return text.replace('aa', u('\xe5')).replace('Aa', u('\xc5')).replace(
            'AA', u('\xc5'))

    def translate_whitespaces(self, text):
        r"""Translate multiple whitespaces to a single space.

        Parameters
        ----------
        text : str
            Input string to be translated.

        Returns
        -------
        translated_text : str
            String with multiple whitespaces translated to a single whitespace.

        Examples
        --------
        >>> gutenberg = Gutenberg()
        >>> gutenberg.translate_whitespaces('\n Hello \n  World \n')
        ' Hello World '

        """
        translated_text = self.whitespaces_pattern.sub(' ', text)
        return translated_text

    def get_all_ids(self):
        """Get all Gutenberg text ids from mirrored data.

        Returns
        -------
        ids : list of str
            List of Gutenberg ebook identifiers.

        Examples
        --------
        >>> gutenberg = Gutenberg()
        >>> '38080' in gutenberg.get_all_ids()
        True

        """
        ids = []
        for root, dirs, files in walk(self.data_directory):
            for file in files:
                if file.endswith('-8.zip'):
                    ids.append(file[:-6])
        return ids

    def get_text_by_id(self, id, extract_body=True):
        """Get text from mirrored Gutenberg archive.

        This function requires that the texts have been mirrored.

        Parameters
        ----------
        id : str or integer
            Gutenberg ebook identifier.
        extract_body : bool, default True
            Extract the body of the downloaded/mirrored Gutenberg raw text.

        Returns
        -------
        text : str
            Extracted text. The text is converted to Unicode.

        """
        # Example on subdirectory structure:
        # www.gutenberg.lib.md.us/4/4/9/6/44967
        s = str(id)
        l = list(s)
        if len(l) > 4:
            directory = join(self.data_directory, l[0], l[1], l[2], l[3], s)
        else:
            # For instance, id=9264 has only four-level subdirectories.
            # This might be because it is only 4 characters long
            directory = join(self.data_directory, l[0], l[1], l[2], s)

        zip_filename = join(directory, s + '-8.zip')
        self.logger.debug('Reading text from {}'.format(zip_filename))
        with ZipFile(zip_filename) as zip_file:
            filename = join(s, s + '-8.txt')
            try:
                with zip_file.open(filename) as f:
                    encoded_text = f.read()
            except KeyError:
                # There might be zip files where the data file is in the root
                filename = s + '-8.txt'
                with zip_file.open(filename) as f:
                    encoded_text = f.read()

        if encoded_text.find(b('Character set encoding: ISO-8859-1')) != -1:
            text = encoded_text.decode('ISO-8859-1')
        elif encoded_text.find(b('Character set encoding: ISO Latin-1')) != -1:
            text = encoded_text.decode('Latin-1')
        else:
            raise LookupError('Unknown encoding for file {}'.format(filename))

        if extract_body:
            extracted_text = extract_text(text)
            return extracted_text
        else:
            return text

    def iter_sentence_words(self,
                            translate_aa=True,
                            translate_whitespaces=True,
                            lower=True,
                            stem=False):
        """Yield list of words from sentences.

        Parameters
        ----------
        translate_aa : bool, default True
            Translate double-a to 'bolle-aa'.
        translate_whitespaces : bool, default True
            Translate multiple whitespaces to single whitespaces
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Yields
        ------
        words : list of str
            List of words

        """
        for sentence in self.iter_sentences(
                translate_aa=translate_aa,
                translate_whitespaces=translate_whitespaces):
            words = self.word_tokenizer.tokenize(sentence)
            if lower:
                words = [word.lower() for word in words]
            if stem:
                words = [self.stemmer.stem(word) for word in words]

            yield words

    def iter_sentences(self, translate_aa=True, translate_whitespaces=True):
        """Yield sentences.

        The method uses the NLTK Danish sentence tokenizer.

        Parameters
        ----------
        translate_aa : bool, default True
            Translate double-aa to bolle-aa.
        translate_whitespaces : bool, default True
            Translate multiple whitespaces to a single space.

        Yields
        ------
        sentence : str
            String with sentences.

        Examples
        --------
        >>> gutenberg = Gutenberg()
        >>> found = False
        >>> for sentence in gutenberg.iter_sentences():
        ...     if 'Indholdsfortegnelse.' == sentence:
        ...         found = True
        ...         break
        >>> found
        True

        """
        for text in self.iter_texts(translate_aa=translate_aa):
            sentences = self.sentence_tokenizer.tokenize(text)
            for sentence in sentences:
                if translate_whitespaces:
                    sentence = self.translate_whitespaces(sentence)
                yield sentence

    def iter_texts(self, translate_aa=True):
        """Yield texts.

        Parameters
        ----------
        translate_aa : bool, default True
            Translate double-aa to bolle-aa.

        Yields
        ------
        text : str
            Text.

        """
        for id in self.get_all_ids():
            text = self.get_text_by_id(id)
            if translate_aa:
                yield self.translate_aa(text)
            else:
                yield text
Example 16
class Europarl(Corpus):
    """Europarl corpus.

    Examples
    --------
    >>> europarl = Europarl()
    >>> sentence = next(europarl.iter_tokenized_sentences())
    >>> "sessionen" in sentence.split()
    True

    """
    def __init__(self,
                 danish_filename=DANISH_FILENAME,
                 tar_gz_filename=TGZ_PARALLEL_CORPUS_FILENAME):
        """Set up filename.

        Parameters
        ----------
        danish_filename : str
            Filename for '.da' file in the tar.gz file.
        tar_gz_filename : str
            Filename for tar.gz or tgz file with Danish/English.

        """
        self.logger = logging.getLogger(__name__ + '.Europarl')
        self.logger.addHandler(logging.NullHandler())

        self.tar_gz_filename = tar_gz_filename
        self.danish_filename = danish_filename

        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()

    def data_directory(self):
        """Return diretory where data should be.

        Returns
        -------
        directory : str
            Directory.

        """
        directory = join(data_directory(), 'europarl')
        return directory

    def download(self, redownload=False):
        """Download corpus."""
        filename = TGZ_PARALLEL_CORPUS_FILENAME
        local_filename = join(self.data_directory(), filename)
        if not redownload and isfile(local_filename):
            message = 'Not downloading as corpus already downloaded to {}'
            self.logger.debug(message.format(local_filename))
            return

        self.make_data_directory()
        url = TGZ_PARALLEL_CORPUS_URL
        self.logger.info('Downloading {} to {}'.format(url, local_filename))
        response = requests.get(url, stream=True)
        with open(local_filename, 'wb') as fid:
            copyfileobj(response.raw, fid)
        self.logger.debug('Corpus downloaded')

    def iter_sentences(self):
        """Yield sentences.

        Yields
        ------
        sentence : str
            Sentences as Unicode strings.

        """
        full_tar_gz_filename = join(self.data_directory(),
                                    self.tar_gz_filename)
        with tarfile.open(full_tar_gz_filename, "r:gz") as tar:
            fid = tar.extractfile(self.danish_filename)
            for line in fid:
                yield line.decode('utf-8').strip()

    def iter_sentence_words(self, lower=True, stem=False):
        """Yield list of words from sentences.

        Parameters
        ----------
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Yields
        ------
        words : list of str
            List of words

        """
        for sentence in self.iter_sentences():
            words = self.word_tokenizer.tokenize(sentence)
            if lower:
                words = [word.lower() for word in words]
            if stem:
                words = [self.stemmer.stem(word) for word in words]

            yield words

    def make_data_directory(self):
        """Make data directory for Europarl."""
        make_data_directory(self.data_directory())
Example 17
    def __init__(self):
        self._read_sto_mapping()
        self._read_sto_words()
        self._stemmer = DanishStemmer()
def snowball(words):
    s = DanishStemmer()
    return [s.stem(w) for w in words]
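
A quick usage sketch of the snowball helper:

# Stems a list of Danish tokens; a fresh DanishStemmer is created per call.
print(snowball(["hundene", "løber", "hurtigt"]))
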
Example 19
    def __init__(self):
        self._read_sto_mapping()
        self._read_sto_words()
        self._stemmer = DanishStemmer()
Example 20
def stem_lem(words, documents, stem_or_lem: bool = False):
    """
    Updates a word list and a corpus to use stemmed words.
    :param stem_or_lem: bool indicating whether to apply stemming or lemmatizer. True is stem, False is lem.
    :param corpus: a list of sentences (strings of words separated by spaces)
    :param words: a list of words
    :return: new corpus and words list, were all words have been replaced by stemmed/lemmetized versions.
    """
    stop_words = stopwords.words('danish')
    stop_words.extend(
        list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    if stem_or_lem:
        # Stemming
        stemmer = DanishStemmer()
        # Update word list to use stemmed words
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            stem = stemmer.stem(word)
            if stem != word:
                if word not in remove:
                    remove.append(word)
                if stem not in add and stem not in stop_words:
                    add.append(stem)
                if word not in translator and stem not in stop_words:
                    translator[word] = stem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])
    else:
        lemmer = lemmy.load("da")
        # build up dictionary that translates old words into their new versions
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            lem = lemmer.lemmatize("", word)
            other = [x for x in lem if x != word]
            if len(other) > 0:
                if word not in lem and word not in remove:
                    remove.append(word)
                # add all lem options if they are not stopwords
                add.extend(
                    [x for x in lem if x not in stop_words and x not in add])
                if word not in translator and lem not in stop_words:
                    lem = " ".join(lem)
                    translator[word] = lem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])

    # update corpus to use stemmed words
    for x in tqdm(range(len(documents))):
        sentence = documents[x]
        for i in range(len(sentence)):
            word = sentence[i]
            if word in translator:
                sentence[i] = translator[word]
        sentence = ' '.join(sentence)
        sentence = sentence.split(' ')
        documents[x] = sentence

    diction = gensim.corpora.Dictionary(documents)
    d_words = diction.token2id
    good_ids = [d_words[x] for x in words]
    diction.filter_tokens(good_ids=good_ids)
    diction.compactify()

    return diction, documents
def snowball_single(w):
    s = DanishStemmer()
    return s.stem(w)
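
A usage sketch of snowball_single (the stem_lem helper above additionally needs the project's utility module plus lemmy, gensim, nltk, and tqdm):

# Stems a single Danish word; a new DanishStemmer is instantiated on every call.
print(snowball_single("bilerne"))
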
Example 22
class Dannet(Corpus, DataDirectoryMixin):
    """Dannet.

    Using the module will automagically download the data from the Dannet
    homepage (http://www.wordnet.dk).

    Attributes
    ----------
    db : db.DB
        Database access through the db.py interface.

    Examples
    --------
    >>> dannet = Dannet()
    >>> dannet.db.tables.words
    +---------------------------------------------------+
    |                       words                       |
    +---------+---------+--------------+----------------+
    | Column  | Type    | Foreign Keys | Reference Keys |
    +---------+---------+--------------+----------------+
    | index   | INTEGER |              |                |
    | word_id | TEXT    |              |                |
    | form    | TEXT    |              |                |
    | pos     | TEXT    |              |                |
    +---------+---------+--------------+----------------+

    >>> # From README
    >>> query = '''
    ... SELECT w.form, ws.register, s.synset_id, s.gloss, s.ontological_type
    ... FROM synsets s, wordsenses ws, words w
    ... WHERE s.synset_id = ws.synset_id
    ...   AND ws.word_id = w.word_id
    ...   AND w.form = 'spand';'''
    >>> 'bil' in dannet.db.query(query).gloss[0]
    True

    >>> # Danish nouns
    >>> dannet = Dannet()
    >>> query = "select w.form from words w where w.pos = 'Noun'"
    >>> nouns = set(dannet.db.query(query).form)
    >>> 'guitar' in nouns
    True
    >>> 'guitaren' in nouns
    False
    >>> len(nouns)
    48404

    References
    ----------
    - http://www.wordnet.dk

    """
    def __init__(self):
        """Initialize logger and and database."""
        self.logger = logging.getLogger(__name__ + '.Dannet')
        self.logger.addHandler(logging.NullHandler())

        self.logger.debug('Initializing tokenizer and stemmer')
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()

        self._db = None

    @property
    def db(self):
        """Return a db.py instance with DanNet data."""
        if self._db is not None:
            return self._db

        full_filename = self.full_filename(DANNET_SQLITE_FILENAME)
        self.logger.info(
            'Trying to read database file {}'.format(full_filename))
        try:
            self._db = DB(filename=full_filename, dbtype='sqlite')
            if not hasattr(self._db.tables, 'words'):
                self.logger.debug('Database is empty')
                # There is no content in the database
                raise Exception('Not initialized')
        except:
            self.build_sqlite_database()
            self._db = DB(filename=full_filename, dbtype='sqlite')
        return self._db

    def download(self, filename=DANNET_FILENAME, redownload=False):
        """Download data."""
        local_filename = join(self.data_directory(), filename)
        if not redownload and isfile(local_filename):
            message = 'Not downloading as corpus already downloaded to {}'
            self.logger.debug(message.format(local_filename))
            return

        self.make_data_directory()
        url = BASE_URL + filename
        self.logger.info('Downloading from URL {} to {}'.format(
            url, local_filename))
        response = requests.get(url, stream=True)
        with open(local_filename, 'wb') as fid:
            copyfileobj(response.raw, fid)
        self.logger.debug('Corpus downloaded')

    def full_filename(self, filename=DANNET_FILENAME):
        """Prepend data directory path to filename.

        Parameters
        ----------
        filename : str
            Filename of local Dannet file.

        Returns
        -------
        full_filename : str
            Filename with full directory path information.

        """
        if sep in filename:
            return filename
        else:
            return join(data_directory(), 'dannet', filename)

    def glossary(self, word):
        """Return glossary for word.

        Parameters
        ----------
        word : str
            Query word.

        Returns
        -------
        glossary : list of str
            List of distinct strings from `gloss` field of synsets which
            form matches the query word.

        Examples
        --------
        >>> dannet = Dannet()
        >>> len(dannet.glossary('virksomhed')) == 3
        True

        """
        query_template = u("""
            SELECT DISTINCT s.gloss
            FROM synsets s, wordsenses ws, words w
            WHERE s.synset_id = ws.synset_id AND
                ws.word_id = w.word_id AND w.form = '{word}';""")
        query = query_template.format(
            word=word.replace('\\', '\\\\').replace("'", "\\'"))
        self.logger.debug(
            u('Querying with {}').format(query.replace('\n', ' ')))
        glossary = list(self.db.query(query).gloss)
        return glossary

    def iter_sentences(self):
        """Iterate over sentences in the synsets examples.

        The synsets definitions have examples of word usages. There might be
        several examples for some synsets. This function iterates over all the
        sentences.

        Yields
        ------
        sentence : str
            Sentence.

        """
        use_pattern = re.compile(r'\(Brug: (".+?")\)', flags=re.UNICODE)
        quote_pattern = re.compile(r'"(.+?)"(?:; "(.+?)")*', flags=re.UNICODE)
        synsets = self.read_synsets()
        self.logger.debug('Iterating over sentences')
        for gloss in synsets.gloss:
            use_matches = use_pattern.findall(gloss)
            if use_matches:
                quote_matches = quote_pattern.findall(use_matches[0])
                for parts in quote_matches[0]:
                    sentences = parts.split(' || ')
                    for sentence in sentences:
                        if sentence:
                            yield sentence.replace('[', '').replace(']', '')

    def iter_sentence_words(self, lower=True, stem=False):
        """Yield list of words from sentences.

        Parameters
        ----------
        lower : bool, default True
            Lower case the words.
        stem : bool, default False
            Apply word stemming. DanishStemmer from nltk is used.

        Yields
        ------
        words : list of str
            List of words

        """
        for sentence in self.iter_sentences():
            words = self.word_tokenizer.tokenize(sentence)
            if lower:
                words = [word.lower() for word in words]
            if stem:
                words = [self.stemmer.stem(word) for word in words]

            yield words

    def read_zipped_csv_file(self, filename, zip_filename=DANNET_FILENAME):
        """Read a zipped csv DanNet file.

        The csv file is read with the 'latin_1' encoding.

        Parameters
        ----------
        filename : str
            Filename of the file within the zip file.
        zip_filename : str
            Filename of the zip file. This is expanded as it expects the data
            to be in the data directory.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with the data from the csv file.

        """
        full_zip_filename = self.full_filename(zip_filename)

        if not isfile(full_zip_filename):
            self.logger.info('File {} not downloaded'.format(zip_filename))
            self.download()

        full_filename = join(splitext(zip_filename)[0], filename)

        self.logger.info('Reading from {}'.format(full_zip_filename))
        zip_file = ZipFile(full_zip_filename)
        try:
            df = read_csv(zip_file.open(full_filename),
                          sep='@',
                          encoding='latin_1',
                          header=None)
        except CParserError:
            self.logger.debug('Reading of csv with Pandas failed')
            # Bad csv file with unquoted "@" in line 19458 and 45686
            # in synsets.csv
            with zip_file.open(full_filename) as fid:
                # Major problem with getting Python2/3 compatibility
                if version_info[0] == 2:
                    csv_file = csv.reader(fid, delimiter='@')
                    rows = []
                    for row in csv_file:
                        if len(row) == 6:
                            row = [
                                row[0], row[1], row[2] + '@' + row[3], row[4],
                                row[5]
                            ]
                        row = [elem.decode('latin_1') for elem in row]
                        rows.append(row)
                else:
                    # Encoding problem handle with
                    # https://stackoverflow.com/questions/36971345
                    lines = (line.decode('latin_1') for line in fid)
                    csv_file = csv.reader(lines, delimiter='@')
                    rows = []
                    for row in csv_file:
                        if len(row) == 6:
                            row = [
                                row[0], row[1], row[2] + '@' + row[3], row[4],
                                row[5]
                            ]
                        rows.append(row)
            df = DataFrame(rows)

        # Drop last column which always seems to be superfluous
        df = df.iloc[:, :-1]
        self.logger.debug('Read {}x{} data from csv'.format(*df.shape))

        return df

    def make_data_directory(self):
        """Make data directory for LCC."""
        make_data_directory(self.data_directory())

    def read_relations(self, zip_filename=DANNET_FILENAME):
        """Read relations CSV file.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with columns synset_id, name, name2, value, taxonomic,
            inheritance_comment.

        """
        df = self.read_zipped_csv_file('relations.csv',
                                       zip_filename=zip_filename)
        df.columns = [
            'synset_id', 'name', 'name2', 'value', 'taxonomic',
            'inheritance_comment'
        ]
        return df

    def read_synset_attributes(self, zip_filename=DANNET_FILENAME):
        """Read synset attributes CSV file.

        Parameters
        ----------
        zip_filename : str
            Filename for the zip file with the CSV file.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with columns synset_id, type and value.

        """
        df = self.read_zipped_csv_file('synset_attributes.csv',
                                       zip_filename=zip_filename)
        df.columns = ['synset_id', 'type', 'value']
        return df

    def read_synsets(self, zip_filename=DANNET_FILENAME):
        """Read synsets CSV file.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with columns id, label, gloss, ontological_type.

        Examples
        --------
        >>> dannet = Dannet()
        >>> df = dannet.read_synsets()
        >>> 'label' in df.columns
        True

        """
        df = self.read_zipped_csv_file('synsets.csv',
                                       zip_filename=zip_filename)
        # import pdb
        # pdb.set_trace()
        df.columns = ['synset_id', 'label', 'gloss', 'ontological_type']
        return df

    def read_words(self, zip_filename=DANNET_FILENAME):
        """Read words from CSV file.

        Returns
        -------
        df : pandas.DataFrame
            Dataframe with id, form and pos columns.

        """
        df = self.read_zipped_csv_file('words.csv', zip_filename=zip_filename)
        df.columns = ['word_id', 'form', 'pos']
        return df

    def read_wordsenses(self, zip_filename=DANNET_FILENAME):
        """Read wordsenses data file.

        Returns
        -------
        df : pandas.DataFrame
           Dataframe with the columns wordsense_id, word_id, synset_id and
           register.

        """
        df = self.read_zipped_csv_file('wordsenses.csv',
                                       zip_filename=zip_filename)
        df.columns = ['wordsense_id', 'word_id', 'synset_id', 'register']
        return df

    def build_sqlite_database(self,
                              filename=DANNET_SQLITE_FILENAME,
                              zip_filename=DANNET_FILENAME,
                              if_exists='replace'):
        """Build SQLite database with DanNet data.

        This function will read the comma-separated values files and add the
        information to a SQLite database stored in the data directory under
        dannet.

        Execution of this function will typically take a couple of seconds.

        Parameters
        ----------
        filename : str, optional
            Filename of the SQLite file.
        zip_filename : str, optional
            Filename of CSV file.
        if_exists : str, optional
            Determines whether the database tables should be overwritten
            (replace) [default: replace]

        """
        tables = [('relations', self.read_relations),
                  ('synset_attributes', self.read_synset_attributes),
                  ('synsets', self.read_synsets), ('words', self.read_words),
                  ('wordsenses', self.read_wordsenses)]

        full_filename = self.full_filename(filename)
        self.logger.info('Building "{full_filename}" sqlite file'.format(
            full_filename=full_filename))

        with sqlite3.connect(full_filename) as connection:
            for table, method in tables:
                df = method(zip_filename=zip_filename)
                self.logger.info('Writing "{table}" table'.format(table=table))
                df.to_sql(table, con=connection, if_exists=if_exists)
Example 23
from flask import Flask, request, render_template
from sklearn.externals import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import DanishStemmer

nltk.download('stopwords')
stopwords = stopwords.words('danish')
stemmer = DanishStemmer()


def text_process(name):
    """
    Tekstprocessering som laver om til små bogstaver, fjerner stopord og finder ordstammen
    """
    lst = name.lower().split(' ')
    stop = [word for word in lst if word not in stopwords]
    stem = [stemmer.stem(word) for word in stop]

    return stem


pipeline = joblib.load('model/predict_business.pkl')


def predict_business(name):
    return pipeline.predict([name])[0]


app = Flask(__name__)
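
A usage sketch of the text_process helper outside the Flask routes (requires the NLTK Danish stop word list; the stemmed token list is what the pickled pipeline's vectorizer is assumed to expect):

print(text_process("Den lille bager på hjørnet"))
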
"""
NER for sermons in content.dat
"""
import os
import pandas as pd
import numpy as np
from polyglot.text import Text
import nltk.data
from nltk.stem.snowball import DanishStemmer
stemmer = DanishStemmer()

if __name__ == "__main__":
    """
    First processing to create NER outputs
    """
    df = pd.read_csv(os.path.join("data", "content", "content.dat"),
                     encoding='utf-8',
                     header=0,
                     index_col=None)

    content = df["content"].tolist()
    fnames = df["id"].tolist()
    tokenizer = nltk.data.load(os.path.join("tokenizers", "punkt", "norwegian.pickle"))

    entity_list = []
    i = 0
    for i, text in enumerate(content):
    #for i, text in enumerate(content[:4]):
        print(f"file {i}")
        # sentence disambiguation
        sents = tokenizer.tokenize(text)