Example #1
import unittest
from es_text_analytics.tokenizer import NOTokenizer

class NOTokenizerTest(unittest.TestCase):  # hypothetical name for the omitted enclosing class
    def test_tokenize(self):
        tokenizer = NOTokenizer()
        self.assertEqual(['Dette', 'er', u'vårt', 'hus', '.'],
                         tokenizer.tokenize(u'Dette er vårt hus.'))
Example #2
"""
Generates word counts from a dataset.

Stores the counts in a Gensim Dictionary text file with id, word and count as tab-separated fields.
"""
import sys
from argparse import ArgumentParser

from gensim.corpora import Dictionary
from textblob import TextBlob

from es_text_analytics.data import newsgroups
from es_text_analytics.data.dataset import download_file, default_dataset_path
from es_text_analytics.data.ndt_dataset import NDTDataset
from es_text_analytics.tokenizer import NOTokenizer

NO_TOKENIZER = NOTokenizer()


def preprocess_ng(doc):
    return [w.lower() for w in TextBlob(doc['msg']).words]


def preprocess_ndt(doc):
    return [
        w.lower()
        for w in TextBlob(doc['content'], tokenizer=NO_TOKENIZER).words
    ]


def main():
    parser = ArgumentParser()
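The main() function above is truncated right after the argument parser is created. As a rough sketch of the word-count step the docstring describes, assuming documents arrive as token lists from preprocess_ng/preprocess_ndt: Gensim's Dictionary.save_as_text writes tab-separated id, word and count fields (in recent Gensim versions the count is the document frequency), and the helper name build_wordcounts is ours, not from the source.

from gensim.corpora import Dictionary

def build_wordcounts(docs, out_fn):
    # docs: an iterable of token lists, e.g. preprocess_ng(doc) per document
    vocab = Dictionary(docs)
    # writes one line per vocabulary entry: id<TAB>word<TAB>count
    vocab.save_as_text(out_fn, sort_by_word=True)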
Example #3
def __init__(self, model_fn=None):
    self.tokenizer = NOTokenizer()
    # fall back to the default Nynorsk model when no model file is given
    self.tagger = HunposTagger(model_fn or NNO_TAGGER_DEFAULT_MODEL_FN,
                               hunpos_tag_bin(),
                               encoding='utf-8')
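For reference, a minimal usage sketch of the tagger set up above, assuming NLTK's HunposTagger interface, where tag() takes a token list; tagger_obj stands in for an instance of the omitted enclosing class.

# tagger_obj is an instance of the class whose __init__ is shown above
tokens = tagger_obj.tokenizer.tokenize(u'Dette er vårt hus.')
print(tagger_obj.tagger.tag(tokens))  # a list of (token, POS tag) pairs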