Example #1
def tokenize(text, language='dutch'):
    """ Method turns a text into tokens removing stopwords and stemming them."""
    if language == 'dutch':
        p_stemmer = DutchStemmer()
    else:
        p_stemmer = PorterStemmer()

    text = text.lower()
    stop = set(stopwords.words(language))
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation and len(i) >= 3]
    tokens = [i for i in tokens if i not in stop]
    tokens = [i for i in tokens if i.isalpha()]
    tokens = [p_stemmer.stem(i) for i in tokens]
    return tokens
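
Read on its own, this snippet assumes NLTK and the Snowball stemmers are already imported. The usage sketch below is an assumption, not part of the original example: the imports, the sample sentence, and the expected output are added only for illustration.

# Minimal usage sketch; assumes the NLTK tokenizer and stopword data are installed
# (depending on your NLTK version, 'punkt_tab' may be needed instead of 'punkt').
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import DutchStemmer

nltk.download('punkt')
nltk.download('stopwords')

print(tokenize("De katten liepen gisteren door de oude straten van Amsterdam."))
# roughly: ['kat', 'liep', 'gister', 'oud', 'strat', 'amsterdam']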
Example #2
def tokenize(text, language='dutch'):

    if language == 'dutch':
        p_stemmer = DutchStemmer()
    else:
        p_stemmer = PorterStemmer()

    text = text.lower()
    stop = set(stopwords.words(language))
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation and len(i) >= 3]
    tokens = [i for i in tokens if i not in stop]  #Removing stopwords
    tokens = [i for i in tokens if i.isalpha() and 'www' not in i]  #Removing numbers and alphanumeric characters and www
    tokens = [p_stemmer.stem(i) for i in tokens]  #Stemming
    return tokens
Example #3
    def _tokenize(self, content):
        #Define Artefacts
        artefacts = ['\\n']
        quote = re.compile(r'quote.*(\\n\\n\\n|\\n\[\.\.\.\]\\n\\n|\n)')
        regexs = [quote]

        #Remove unwanted parts of text before tokenization
        for regex in regexs:
            content = regex.sub('', content)

        #Tokenize content into words
        content = regexp_tokenize(content, r'\w+')

        #Remove artifacts in content
        for artefact in artefacts:
            content = [word.replace(artefact, '') for word in content]

        #Stem words
        stemmer = DutchStemmer()
        content = [stemmer.stem(word) for word in content]
        return content
Example #4
 def stem_words(self, wordlist):
     # Checks if stemming is enabled and stems words in wordlist.
     if self.enable_stemmer is not True:
         return wordlist
     if self.language == "english":
         stemmer = PorterStemmer()
     elif self.language == "dutch":
         stemmer = DutchStemmer()
     # Any other language value leaves `stemmer` undefined and raises a NameError below.
     stemmed_words = []
     for word in wordlist:
         stemmed_words.append(stemmer.stem(word))
     return stemmed_words
Example #5
def tokenize(content):
    #Define Artefacts
    artefacts = ['\\n']
    quote = re.compile(r'quote.*(\\n\\n\\n|\\n\[\.\.\.\]\\n\\n|\n)')
    regexs = [quote]

    #Remove unwanted parts of text before tokenization
    for regex in regexs:
        content = regex.sub('', content)
    
    #Tokenize content into words
    content = regexp_tokenize(content, r'\w+')
    
    #Remove artifacts in content
    for artefact in artefacts:
        content = [word.replace(artefact,'') for word in content]

    #Stem words
    stemmer = DutchStemmer()
    content = [stemmer.stem(word) for word in content]
    return content
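
This standalone variant only needs `re`, NLTK's `regexp_tokenize`, and the Snowball `DutchStemmer`. The call below is a sketch with a made-up input string; note that the quote and artefact patterns target literal "\n" sequences (a backslash followed by 'n'), as left behind by some scrapers, so the sample escapes its newlines the same way.

# Sketch only; the sample string is invented for illustration.
import re
from nltk.tokenize import regexp_tokenize
from nltk.stem.snowball import DutchStemmer

sample = "quote eerder bericht\\n\\n\\nDit is de eigenlijke reactie van de gebruiker"
print(tokenize(sample))
# the quoted part is stripped and the remaining words come back as lower-cased stems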
Example #6
    def __init__(self, mallet_path, num_topics, corpuspath, dictpath,
                 modelpath):
        self.num_topics = num_topics
        self.mallet_path = mallet_path
        self.corpuspath = corpuspath
        self.modelpath = modelpath
        self.dictpath = dictpath

        self.stopwords = stopwords.words('dutch')
        extra = [
            'mening', 'gevolgen', 'vragen', 'stelling', 'bericht', 'bekend',
            'bereid', 'voornemens'
        ]
        self.stopwords.extend(extra)
        self.stopwords = set(self.stopwords)

        self.stemmer = DutchStemmer()

        # If the corpus and the model already exist on disk, load them.
        try:
            self.model = gensim.models.ldamodel.LdaModel.load(modelpath)
        except FileNotFoundError:
            pass

        try:
            self.dictionary = corpora.Dictionary.load(dictpath)
            # with open(dictpath, 'rb') as file:
            #     self.dictionary = pickle.load(file)
        except FileNotFoundError:
            pass

        try:
            with open(corpuspath, 'rb') as file:
                self.corpus = pickle.load(file)
        except FileNotFoundError:
            pass
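
Example #6's constructor relies on several imports that are not shown; judging from the names it uses, a plausible preamble (an inference, not taken from the original module) would be:

# Probable imports for the snippet above (inferred, not from the original source).
import pickle

import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.snowball import DutchStemmer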
Example #7
 def _init_lookup(self):
     nltk.download('stopwords')
     # init stemmer
     self.stemmer = DutchStemmer(ignore_stopwords=True)
     self.stop_words = set(nltk.corpus.stopwords.words('dutch'))
Example #8
class TextClassifier:
    _text = 'Text'
    _main = 'Main'
    _middle = 'Middle'
    _sub = 'Sub'
    _lbl = 'Label'
    model = None

    def __init__(self, *args, **kwargs):
        load_from_disk = kwargs.get('model_from_disk')
        self._init_lookup()
        if load_from_disk:
            self._init_model(load_from_disk)

    def _init_lookup(self):
        nltk.download('stopwords')
        # init stemmer
        self.stemmer = DutchStemmer(ignore_stopwords=True)
        self.stop_words = set(nltk.corpus.stopwords.words('dutch'))

    def _init_model(self, file):
        self.model = joblib.load(file)

    def pickle(self, obj, file):
        joblib.dump(obj, file)

    def export_model(self, file):
        joblib.dump(self.model, file)

    def preprocessor(self, text):
        text = text.lower()
        text = re.sub("\\W", " ", text)  # remove special chars

        # stem words
        words = re.split("\\s+", text)
        stemmed_words = [self.stemmer.stem(word=word) for word in words]
        return ' '.join(stemmed_words)

    def load_data(self, csv_file, frac=1):
        df = pd.read_csv(csv_file, sep=None, engine='python')
        df = df.dropna(
            axis=0,
            how='any',
            thresh=None,
            subset=[self._text, self._main, self._middle, self._sub],
            inplace=False)

        # cleanup dataset
        df = df.drop_duplicates(subset=[self._text], keep='first')
        # for dev use only a subset (for speed purpose)
        df = df.sample(frac=frac).reset_index(drop=True)
        # construct unique label
        df[self._lbl] = df[self._main] + "|" + df[self._middle] + "|" + df[
            self._sub]

        number_of_examples = df[self._lbl].value_counts().to_frame()
        df['is_bigger_than_50'] = df[self._lbl].isin(
            number_of_examples[number_of_examples[self._lbl] > 50].index)
        df['is_bigger_than_50'].value_counts()
        df = df[df['is_bigger_than_50'] == True]
        # The example dataset is not large enough to train a good classification model
        # print(len(self.df),'rows valid')
        return df

    def make_data_sets(self, df, split=0.9, columns=['Middle', 'Sub']):

        texts = df[self._text]
        labels = df[columns].apply('|'.join, axis=1)

        # Splitting data
        splitpoint = int(split * len(texts))

        # train data
        train_texts = texts[:splitpoint]
        train_labels = labels[:splitpoint]

        # test data
        test_texts = texts[splitpoint:]
        test_labels = labels[splitpoint:]

        return texts, labels, train_texts, train_labels, test_texts, test_labels

    def fit(self, train_texts, train_labels):

        pipeline = Pipeline([
            ('vect',
             CountVectorizer(preprocessor=self.preprocessor,
                             stop_words=self.stop_words)),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression()),
        ])

        # multiple hyperparameters, slow training, better optimization
        parameters_slow = {
            'clf__class_weight': (None, 'balanced'),  #"balanced",
            'clf__max_iter': (300, 500),  #500,1000
            'clf__penalty': ('l1', ),  #'l2',
            'clf__multi_class': ('auto', ),
            'clf__solver': ('liblinear', ),  # lbfgs
            'tfidf__norm': ('l2', ),  # 'l1'
            'tfidf__use_idf': (False, ),
            'vect__max_df': (1.0, ),
            'vect__max_features': (None, ),
            'vect__ngram_range': ((1, 1), (1, 2))  # (1,2)
        }
        # single hyperparameters, fast training, no optimization
        parameters_fast = {
            'clf__class_weight': (None, ),  #"balanced",
            'clf__max_iter': (300, ),  #500,1000
            'clf__penalty': ('l1', ),  #'l2',
            #'clf__multi_class': ('auto',),
            'clf__solver': ('liblinear', ),  # lbfgs
            'tfidf__norm': ('l2', ),  # 'l1'
            'tfidf__use_idf': (False, ),
            'vect__max_df': (1.0, ),
            'vect__max_features': (None, ),
            'vect__ngram_range': ((1, 1), )  # (1,2)
        }

        grid_search = GridSearchCV(pipeline,
                                   parameters_slow,
                                   verbose=True,
                                   n_jobs=psutil.cpu_count(logical=False),
                                   cv=5)
        grid_search.fit(train_texts, train_labels)
        #print('Best parameters: ')
        #print(grid_search.best_params_)
        #print('Best score: ')
        #print(grid_search.best_score_)
        self.model = grid_search
        return grid_search

    def validate_model(self,
                       test_texts,
                       test_labels,
                       dst_file,
                       dst_csv,
                       dst_validation=None):
        from sklearn.metrics import precision_score, recall_score, accuracy_score, plot_confusion_matrix
        import matplotlib.pyplot as plt

        test_predict = self.model.predict(test_texts)
        precision = str(
            round(
                precision_score(test_labels,
                                test_predict,
                                average='macro',
                                zero_division=0), 2))
        recall = str(
            round(recall_score(test_labels, test_predict, average='macro'), 2))
        accuracy = str(round(accuracy_score(test_labels, test_predict), 2))

        plt.rcParams["figure.figsize"] = (30, 30)
        disp = plot_confusion_matrix(self.model,
                                     test_texts,
                                     test_labels,
                                     cmap=plt.cm.Blues,
                                     normalize=None,
                                     xticks_rotation='vertical')
        plt.savefig(dst_file)

        df2 = pd.DataFrame(disp.confusion_matrix, columns=disp.display_labels)
        df2.to_csv(dst_csv)
        if dst_validation:
            with open(dst_validation, 'w') as csvfile:
                fieldnames = ['Text', 'predicted_category', 'actual_category']
                writer = csv.DictWriter(csvfile,
                                        fieldnames=fieldnames,
                                        quoting=csv.QUOTE_NONNUMERIC)
                writer.writeheader()
                for input, prediction, label in zip(test_texts, test_predict,
                                                    test_labels):
                    if prediction != label:
                        writer.writerow({
                            'Text': re.sub("\\W", " ", input),
                            'predicted_category': prediction,
                            'actual_category': label
                        })
        return test_predict, precision, recall, accuracy
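
Driving the class end to end is not shown in the example. The sketch below is hypothetical: the CSV path, model file, and output file names are made up, and the input file is assumed to contain the Text, Main, Middle, and Sub columns the class expects.

# Hypothetical driver; 'signals.csv', 'model.joblib' and the output paths are placeholders.
clf = TextClassifier()
df = clf.load_data('signals.csv', frac=1)
texts, labels, train_texts, train_labels, test_texts, test_labels = clf.make_data_sets(df)

grid_search = clf.fit(train_texts, train_labels)
clf.export_model('model.joblib')

test_predict, precision, recall, accuracy = clf.validate_model(
    test_texts, test_labels, 'confusion_matrix.png', 'confusion_matrix.csv')
print('precision', precision, 'recall', recall, 'accuracy', accuracy)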
Example #9
    def __init__(self):
        from nltk.stem.snowball import DutchStemmer

        self.stemmer = DutchStemmer()
Example #10
class PorterStemmer(object):
    def __init__(self):
        from nltk.stem.snowball import DutchStemmer

        self.stemmer = DutchStemmer()

    def stem(self, w):
        """Stem the word `w`.

        Parameters
        ----------
        w : str

        Returns
        -------
        str
            Stemmed version of `w`.

        Examples
        --------
        >>> from samenvattr.parsing.porter import PorterStemmer
        >>> p = PorterStemmer()
        >>> p.stem("ponies")
        'poni'

        """
        w = w.lower()
        if len(w) <= 2:
            return w  # --DEPARTURE--

        # With this line, strings of length 1 or 2 don't go through the
        # stemming process, although no mention is made of this in the
        # published algorithm. Remove the line to match the published
        # algorithm.

        return self.stemmer.stem(w)

    def stem_sentence(self, txt):
        """Stem the sentence `txt`.

        Parameters
        ----------
        txt : str
            Input sentence.

        Returns
        -------
        str
            Stemmed sentence.

        Examples
        --------
        >>> from samenvattr.parsing.porter import PorterStemmer
        >>> p = PorterStemmer()
        >>> p.stem_sentence("Wow very nice woman with apple")
        'wow veri nice woman with appl'

        """
        return " ".join(self.stemmer.stem(x) for x in txt.split())

    def stem_documents(self, docs):
        """Stem documents.

        Parameters
        ----------
        docs : list of str
            Input documents

        Returns
        -------
        list of str
            Stemmed documents.

        Examples
        --------
        >>> from samenvattr.parsing.porter import PorterStemmer
        >>> p = PorterStemmer()
        >>> p.stem_documents(["Have a very nice weekend", "Have a very nice weekend"])
        ['have a veri nice weekend', 'have a veri nice weekend']

        """
        return [self.stem_sentence(x) for x in docs]
Example #11
####
import fasttext as ft
import pandas as pd
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import DutchStemmer
# uncomment if needed
#nltk.download("stopwords")
stop_words = set(stopwords.words("dutch"))

#load in the trained word embedding model
word_emb = ft.load_model("model42.bin")

ds = DutchStemmer()

annot_dict = pickle.load(open("FinalAnnotations.p", "rb"))
counter_nested = list()


# This function will be made more accessible through boolean flags once a final application is needed; for now it is run several times with different pieces commented/uncommented.
def preprocess(sent):
    """
    Split the text into tokens, ignore non-alphanumerical tokens, stem each token and drop it if it is in stop_words.

    Keyword argument:
    sent: the text to preprocess (string)
    """
    tokens = word_tokenize(sent)
Example #12
DESCRIPTION: -
REQUIRES: -
USEFUL: -
Last Updated: 11-05-2018
"""
import gensim
import spacy
from nltk.stem.snowball import DutchStemmer
import json
import os
import re

from gensim import corpora
from nltk import ngrams as ng

stemmer = DutchStemmer()

BASE_URL = "http://api.genius.com"
file_path = os.getcwd() + "/../api_key_genius"
file = open(file_path, 'r', encoding='utf-8')
TOKEN = file.readline()
HEADERS = {'Authorization': "Bearer " + TOKEN}

# TODO implement http://anthology.aclweb.org/C/C14/C14-1059.pdf
# Fell, M., & Sporleder, C. (2014). Lyrics-based Analysis and classification of music.
# In Proceedings of COLING 2014, the 25th International Conference on Computational Linguistics:
# Technical Papers (pp. 620-631).


def generate_word_model():
    'do nothing'
Example #13
    def __init__(self, y_labels=None):

        self.y_labels = y_labels
        self.vectorizer = None  # split vectorizer and estimator to catch zero matching tokens case
        self.estimator = None
        self.stemmer = DutchStemmer()  # initialize stemmer
Example #14
class PartyClassifier:
    def __init__(self, y_labels=None):

        self.y_labels = y_labels
        self.vectorizer = None  # split vectorizer and estimator to catch zero matching tokens case
        self.estimator = None
        self.stemmer = DutchStemmer()  # initialize stemmer

    def fit(self, X, y):

        self.vectorizer = TfidfVectorizer(input='content',
                                          encoding='utf-8',
                                          decode_error='strict',
                                          strip_accents='unicode',
                                          lowercase=True,
                                          preprocessor=None,
                                          tokenizer=self.__tokenize,
                                          analyzer='word',
                                          stop_words=stopwords.words('dutch'),
                                          ngram_range=(1, 3),
                                          max_df=0.5,
                                          min_df=1,
                                          max_features=None,
                                          vocabulary=None,
                                          binary=False,
                                          dtype=np.int64,
                                          norm='l2',
                                          use_idf=True,
                                          smooth_idf=True,
                                          sublinear_tf=False)

        self.estimator = Pipeline(
            steps=[('topic_model',
                    TruncatedSVD(n_components=100,
                                 algorithm='randomized',
                                 n_iter=10,
                                 random_state=12,
                                 tol=0.0)),
                   ('classifier',
                    LogisticRegression(multi_class='multinomial',
                                       class_weight='balanced',
                                       solver='lbfgs'))])

        self.estimator.fit(self.vectorizer.fit_transform(X), y)

        return self

    def predict(self, X):

        return self.predict_proba(X)

    def predict_proba(self, X):

        n_labels = len(self.y_labels)
        n_samples = len(X)

        X_vectorized = self.vectorizer.transform(X)

        # output equal probabilities in case of zero matching tokens
        if X_vectorized.getnnz() > 0:
            return self.estimator.predict_proba(X_vectorized)
        else:
            return np.ones([n_samples, n_labels]).astype(float) / n_labels

    def __tokenize(self, text):
        """Converts text to tokens."""
        tokens = word_tokenize(text, language='dutch')
        tokens = filter(lambda x: len(x) > 1, tokens)
        stemmed = []
        for item in tokens:
            stemmed.append(self.stemmer.stem(item))

        return stemmed
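
As with the earlier examples, the imports are implicit. Based on the names used in PartyClassifier, the snippet appears to assume something like the following preamble (again an inference, not copied from the original file):

# Inferred imports for PartyClassifier; not taken from the original module.
import numpy as np

from nltk.corpus import stopwords
from nltk.stem.snowball import DutchStemmer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline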
Example #15
    return text


vocabulary = dict()
inverse_vocabulary = ['<unk>']
# '<unk>' will never be used, it is only a placeholder for the
# [0, 0, ....0] embedding

print("loading word2vec")
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=False)
print("done loading word2vec")

questions_cols = ['question1', 'question2']

stemmer = DutchStemmer()

# Iterate over the question columns of the datasets (only train_df is processed here)
for dataset in [train_df]:
    for index, row in dataset.iterrows():

        # Iterate through the text of both questions of the row
        for question in questions_cols:

            q2n = []  # q2n -> question numbers representation
            for word in text_to_word_list(row[question]):
                word = stemmer.stem(word)
                # Check for unwanted words
                if word in stops and word not in word2vec.vocab:
                    continue