Code example #1
File: Tokenizer.py  Project: BogiThomsen/WICrawler
def tokenize_and_stem(text):
    # detect whether the text is English or Danish
    filtered_text = re.sub(r"[^\w\-'/]", ' ', text)
    lang = detect(filtered_text)
    if lang in valid_languages:
        tokens = filtered_text.lower().split(" ")
        processed = []

        stopwords = []
        if lang == "da":
            snowball = DanishStemmer()
            stopwords = dk_stop
        if lang == "en":
            snowball = EnglishStemmer()
            stopwords = eng_stop

        for token in tokens:
            if token in stopwords:
                continue
            elif "\n" in token:
                continue
            elif "\\n" in token:
                continue
            elif token == "":
                continue
            elif token.isdigit():
                continue
            else:
                processed.append(token)
        stemmed = []
        for token in processed:
            stemmed.append(snowball.stem(token))
        return stemmed
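
A hedged usage sketch: `valid_languages`, `dk_stop` and `eng_stop` are module-level names in the original project, assumed here to hold the langdetect codes {"da", "en"} and the matching stopword lists, with `detect` imported from the langdetect package.

# Hypothetical call; returns a list of stemmed tokens, or None when the
# detected language is not in valid_languages.
print(tokenize_and_stem("Hunden løber hurtigt gennem den store park"))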
Code example #2
def word_normalize(s: str, method: str = "l") -> str:
    """
    Splits a string and lemmatizes every single word, except acronyms
    """
    if method not in ["s", "l"]:
        raise ValueError("Method must be either 's' or 'l' for either"
                         "stemming or lemmatizing")

    # TODO: change this to match language
    lemmatizer = lemmy.load("da")
    stemmer = DanishStemmer()

    words = s.split(" ")

    norm_words = []
    for w in words:
        if w.isupper():
            norm_words.append(w)
        else:
            if method == "l":
                w = lemmatizer.lemmatize("", w)
                norm_words.extend(w)
            else:
                w = stemmer.stem(w)
                norm_words.append(w)

    return " ".join(norm_words)
Code example #3
    def __init__(self):
        """Set up tokenizer."""
        self.logger = logging.getLogger(__name__ + '.Corpus')
        self.logger.addHandler(logging.NullHandler())

        self.logger.debug('Setup word tokenizer')
        self.word_tokenizer = WordPunctTokenizer()

        self.logger.debug('Setup stemmer')
        self.stemmer = DanishStemmer()
Code example #4
    def __init__(self):
        """Initialize logger and and database."""
        self.logger = logging.getLogger(__name__ + '.Dannet')
        self.logger.addHandler(logging.NullHandler())

        self.logger.debug('Initializing tokenizer and stemmer')
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()

        self._db = None
Code example #5
    def __init__(self):
        """Set up data directory and other attributes."""
        self.logger = logging.getLogger('dasem.gutenberg.Gutenberg')
        self.logger.addHandler(logging.NullHandler())

        self.data_directory = join(data_directory(), 'gutenberg',
                                   'aleph.gutenberg.org')
        self.sentence_tokenizer = nltk.data.load(
            'tokenizers/punkt/danish.pickle')
        self.whitespaces_pattern = re.compile(r'\s+',
                                              flags=re.DOTALL | re.UNICODE)
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()
Code example #6
File: helper.py  Project: Proteusiq/DanishSentiments
def token(X,
          words_only=False,
          word_normalize=True,
          emoji_normalize=True,
          remove_digits=True,
          lower_case=True,
          stop_words=None):
    '''
    Requires a stemmer if word_normalize=True.
    '''

    # eyes [nose] mouth | mouth [nose] eyes pattern
    emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
    emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)

    # Keep words only. Digits still count as word characters here,
    # while emoticons are only re-appended in the else-branch.
    if words_only:
        clean_text = re.sub(r'[\W]+', ' ', X)
    else:
        clean_text = '{}{}'.format(re.sub(r'[\W]+', ' ', X),
                                   ''.join(re.findall(emoticon_re, X)))

    # normalize emoticons: drop the "noses" and map ';' eyes to ':'
    if emoji_normalize:

        clean_text = (re.sub(r'[\W]+', ' ', X) + ' '.join(
            re.findall(emoticon_re, X)).replace(';', ':').replace('-', ''))

    if remove_digits:
        clean_text = clean_text.translate(str.maketrans('', '', '0123456789'))

    if lower_case:
        clean_text = clean_text.lower()

    if word_normalize:
        stemmer = DanishStemmer()

        clean_text = ' '.join(
            stemmer.stem(word) for word in clean_text.split())

    if stop_words:

        return [word for word in clean_text.split() if word not in stop_words]
    else:
        return clean_text.split()
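
A hedged usage sketch of the helper above; the stopword set is made up for illustration, and DanishStemmer must be importable since word_normalize defaults to True.

# Hypothetical call; returns lower-cased, stemmed tokens with digits removed
# and the supplied stopwords filtered out.
print(token("Jeg elsker den her film :-) 10/10",
            stop_words={"jeg", "den", "her"}))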
Code example #7
    def __init__(self,
                 danish_filename=DANISH_FILENAME,
                 tar_gz_filename=TGZ_PARALLEL_CORPUS_FILENAME):
        """Set up filename.

        Parameters
        ----------
        danish_filename : str
            Filename for '.da' file in the tar.gz file.
        tar_gz_filename : str
            Filename for tar.gz or tgz file with Danish/English.

        """
        self.logger = logging.getLogger(__name__ + '.Europarl')
        self.logger.addHandler(logging.NullHandler())

        self.tar_gz_filename = tar_gz_filename
        self.danish_filename = danish_filename

        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()
Code example #8
File: wikipedia.py  Project: lisbethwaagstein/dasem
    def __init__(self, filename=BZ2_XML_DUMP_FILENAME):
        """Prepare dump file for reading.

        Parameters
        ----------
        filename : str
            Filename of the XML dump file.

        """
        self.logger = logging.getLogger(__name__)
        self.logger.addHandler(logging.NullHandler())

        full_filename = self.full_filename(filename)
        self.filename = full_filename

        self.sentence_tokenizer = nltk.data.load(
            'tokenizers/punkt/danish.pickle')
        self.whitespaces_pattern = re.compile(
            r'\s+', flags=re.DOTALL | re.UNICODE)
        self.word_tokenizer = WordPunctTokenizer()
        self.stemmer = DanishStemmer()

        self.word_pattern = re.compile(
            r"""{{.+?}}|
            <!--.+?-->|
            \[\[Fil.+?\]\]|
            \[\[Kategori:.+?\]\]|
            \[http.+?\]|(\w+(?:-\w+)*)""",
            flags=re.UNICODE | re.VERBOSE | re.DOTALL)
        self.paragraph_split_pattern = re.compile(
            r'\n\s*\n', flags=re.DOTALL | re.UNICODE)
        self.ignored_words_pattern = re.compile(
            r"""
            (?:(?:thumb|thumbnail|left|right|\d+px|upright(?:=[0-9\.]+)?)\|)+
            |^\s*\|.+$
            |^REDIRECT\b""",
            flags=re.DOTALL | re.UNICODE | re.VERBOSE | re.MULTILINE)
        self.itemized_split_pattern = re.compile(
            r"^ |^Kategori:",
            flags=re.DOTALL | re.UNICODE | re.MULTILINE)
Code example #9
    def __init__(self, lib_path=THIRD_PATY_PATH):
        """Constructor. Initialize class attributes."""
        self.word2idx = {}
        self.idx2word = {}
        self.words = []
        self.raw_data = []
        self.seq_data = None
        self.count = None
        self.word_ctx = defaultdict(set)
        self.pw_data = []
        self.docs = defaultdict(dict)
        self.lines = defaultdict(list)
        self.re_rules = []
        self.lib_path = lib_path
        self.ner_tagger = self.set_ner_tagger(NER_MODEL, NER_JAR)
        self.pos_tagger = self.set_pos_tagger(POS_MODEL, POS_JAR)
        self.pt_stemmer = PorterStemmer()
        self.dan_stemmer = DanishStemmer()
        self.lemma = WordNetLemmatizer()
        self.tfidf = TfidfVectorizer(tokenizer=self.tokenize,
                                     stop_words='english')
Code example #10
def preprocess_text(text):
    # text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = str(text).lower().strip()

    # caveat: this might conflict with the english text
    da_stop_words = stopwords.words('danish')
    stemmer = DanishStemmer()
    lemmatizer = lemmy.load("da")

    # stem words (reduces plurals and other inflected forms)
    textblob = TextBlob(text)
    singles = [stemmer.stem(word) for word in textblob.words]

    # remove danish stopwords
    no_stop_words = [word for word in singles if word not in da_stop_words]

    # join text so it can be lemmatized
    joined_text = " ".join(no_stop_words)

    # lemmatization: lemmy returns a list of candidate lemmas; keep the first
    final_text = lemmatizer.lemmatize("", joined_text)

    return final_text[0]
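
A minimal usage sketch, assuming the surrounding module imports nltk's stopwords, DanishStemmer, lemmy and TextBlob, which the calls above rely on.

# Hypothetical input; the function stems, removes Danish stopwords,
# re-joins the text and finally takes the first lemma candidate.
print(preprocess_text("Hundene løber rundt i parkerne"))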
Code example #11
File: pyrouge.py  Project: SebastianVeile/PreSumm
    def convert_text_to_rouge_format(text, title="dummy title"):
        """
        Convert a text to a format ROUGE understands. The text is
        assumed to contain one sentence per "<q>" separator.

            text:   The text to convert, containing "<q>"-separated sentences.
            title:  Optional title for the text. The title will appear
                    in the converted file, but doesn't seem to have
                    any other relevance.

        Returns: The converted text as string.

        """
        # sentences = text.split("\n")
        from nltk.stem.snowball import DanishStemmer
        stemmer = DanishStemmer()
        sentences = text.split("<q>")
        output = []
        for sentence in sentences:
            output.append(" ".join([stemmer.stem(i)
                                    for i in sentence.split()]))
        sent_elems = [
            "<a name=\"{i}\">[{i}]</a> <a href=\"#{i}\" id={i}>"
            "{text}</a>".format(i=i, text=sent)
            for i, sent in enumerate(output, start=1)
        ]
        html = """<html>
<head>
<title>{title}</title>
</head>
<body bgcolor="white">
{elems}
</body>
</html>""".format(title=title, elems="\n".join(sent_elems))

        return html
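
A brief usage sketch. In the excerpt the function is defined inside a class (the decorator and class header fall outside the snippet), so it is assumed here to be callable as shown; sentences are separated by the "<q>" marker the code expects.

# Hypothetical call producing the ROUGE-ready HTML string.
html = convert_text_to_rouge_format(
    "Det er en god film.<q>Skuespillet er fremragende.", title="Eksempel")
print(html)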
Code example #12
from flask import Flask, request, render_template
from sklearn.externals import joblib  # removed in newer scikit-learn, where plain `import joblib` is used
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import DanishStemmer

nltk.download('stopwords')
stopwords = stopwords.words('danish')
stemmer = DanishStemmer()


def text_process(name):
    """
    Tekstprocessering som laver om til små bogstaver, fjerner stopord og finder ordstammen
    """
    lst = name.lower().split(' ')
    stop = [word for word in lst if word not in stopwords]
    stem = [stemmer.stem(word) for word in stop]

    return stem


pipeline = joblib.load('model/predict_business.pkl')


def predict_business(name):
    return pipeline.predict([name])[0]


app = Flask(__name__)
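
A hedged usage sketch of the helpers above; the business name is invented and the model file loaded above is assumed to exist.

# Hypothetical example input for the preprocessing and prediction helpers.
print(text_process("Københavns bedste pizzeria"))
print(predict_business("Københavns bedste pizzeria"))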
Code example #13
File: sto_lemmatizer.py  Project: coastalcph/cs_sst
    def __init__(self):
        self._read_sto_mapping()
        self._read_sto_words()
        self._stemmer = DanishStemmer()
Code example #14
def stem_lem(words, documents, stem_or_lem: bool = False):
    """
    Updates a word list and a corpus to use stemmed words.
    :param stem_or_lem: bool indicating whether to apply stemming or lemmatizer. True is stem, False is lem.
    :param corpus: a list of sentences (strings of words separated by spaces)
    :param words: a list of words
    :return: new corpus and words list, were all words have been replaced by stemmed/lemmetized versions.
    """
    stop_words = stopwords.words('danish')
    stop_words.extend(
        list(utility.load_vector_file("word_datasets/stopwords.csv").values()))
    if stem_or_lem:
        # Stemming
        stemmer = DanishStemmer()
        # Update word list to use stemmed words
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            stem = stemmer.stem(word)
            if stem != word:
                if word not in remove:
                    remove.append(word)
                if stem not in add and stem not in stop_words:
                    add.append(stem)
                if word not in translator and stem not in stop_words:
                    translator[word] = stem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])
    else:
        lemmer = lemmy.load("da")
        # build up dictionary that translates old words into their new versions
        translator = {}
        add = []
        remove = []
        for word in tqdm(words):
            lem = lemmer.lemmatize("", word)
            other = [x for x in lem if x != word]
            if len(other) > 0:
                if word not in lem and word not in remove:
                    remove.append(word)
                # add all lem options if they are not stopwords
                add.extend(
                    [x for x in lem if x not in stop_words and x not in add])
                # lem is a list of candidate lemmas; only map the word if none of them are stopwords
                if word not in translator and all(x not in stop_words for x in lem):
                    lem = " ".join(lem)
                    translator[word] = lem
        words = [x for x in words if x not in remove]
        words.extend([x for x in add if x not in words])

    # update corpus to use the stemmed/lemmatized words
    for x in tqdm(range(len(documents))):
        sentence = documents[x]
        for i in range(len(sentence)):
            word = sentence[i]
            if word in translator:
                sentence[i] = translator[word]
        sentence = ' '.join(sentence)
        sentence = sentence.split(' ')
        documents[x] = sentence

    diction = gensim.corpora.Dictionary(documents)
    d_words = diction.token2id
    good_ids = [d_words[x] for x in words]
    diction.filter_tokens(good_ids=good_ids)
    diction.compactify()

    return diction, documents
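
A usage sketch under assumptions: toy tokenized documents with a matching word list, and the stopword CSV loaded through utility.load_vector_file being available as in the original project.

# Hypothetical toy data; documents are lists of tokens, words is the vocabulary.
docs = [["hundene", "løber", "hurtigt"], ["kattene", "sover"]]
vocab = ["hundene", "løber", "hurtigt", "kattene", "sover"]
dictionary, new_docs = stem_lem(vocab, docs, stem_or_lem=True)
print(dictionary.token2id, new_docs)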