def tokenize(text, language='dutch'):
    """Turn a text into tokens, removing punctuation, short tokens and stopwords, and stemming the rest."""
    if language == 'dutch':
        p_stemmer = DutchStemmer()
    else:
        p_stemmer = PorterStemmer()
    text = text.lower()
    stop = set(stopwords.words(language))
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation and len(i) >= 3]
    tokens = [i for i in tokens if i not in stop]
    tokens = [i for i in tokens if i.isalpha()]
    tokens = [p_stemmer.stem(i) for i in tokens]
    return tokens
def tokenize(text, language='dutch'):
    if language == 'dutch':
        p_stemmer = DutchStemmer()
    else:
        p_stemmer = PorterStemmer()
    text = text.lower()
    stop = set(stopwords.words(language))
    tokens = nltk.word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation and len(i) >= 3]
    tokens = [i for i in tokens if i not in stop]  # Remove stopwords
    tokens = [i for i in tokens if i.isalpha() and 'www' not in i]  # Remove numbers, alphanumeric tokens and 'www'
    tokens = [p_stemmer.stem(i) for i in tokens]  # Stemming
    return tokens
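# Example usage of the tokenize helper above (a minimal sketch; it assumes nltk with the
# 'punkt' and 'stopwords' data installed and the DutchStemmer/PorterStemmer imports in scope;
# the sample sentence is purely illustrative):
if __name__ == '__main__':
    sample = 'De katten liepen gisteren door de straten van Amsterdam.'
    print(tokenize(sample, language='dutch'))
    # Prints lowercased, stemmed tokens with punctuation, short tokens,
    # stopwords and non-alphabetic tokens removed.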
def _tokenize(self, content):
    # Define artefacts
    artefacts = ['\\n']
    quote = re.compile(r'quote.*(\\n\\n\\n|\\n\[\.\.\.\]\\n\\n|\n)')
    regexs = [quote]
    # Remove unwanted parts of text before tokenization
    for regex in regexs:
        content = regex.sub('', content)
    # Tokenize content into words
    content = regexp_tokenize(content, r'\w+')
    # Remove artefacts in content
    for artefact in artefacts:
        content = [word.replace(artefact, '') for word in content]
    # Stem words
    stemmer = DutchStemmer()
    content = [stemmer.stem(word) for word in content]
    return content
def stem_words(self, wordlist):
    # Check whether stemming is enabled and stem the words in wordlist.
    if self.enable_stemmer is not True:
        return wordlist
    if self.language == "english":
        stemmer = PorterStemmer()
    elif self.language == "dutch":
        stemmer = DutchStemmer()
    else:
        # Fall back to the unstemmed words when no stemmer is available for the language.
        return wordlist
    stemmed_words = []
    for word in wordlist:
        stemmed_words.append(stemmer.stem(word))
    return stemmed_words
def tokenize(content):
    # Define artefacts
    artefacts = ['\\n']
    quote = re.compile(r'quote.*(\\n\\n\\n|\\n\[\.\.\.\]\\n\\n|\n)')
    regexs = [quote]
    # Remove unwanted parts of text before tokenization
    for regex in regexs:
        content = regex.sub('', content)
    # Tokenize content into words
    content = regexp_tokenize(content, r'\w+')
    # Remove artefacts in content
    for artefact in artefacts:
        content = [word.replace(artefact, '') for word in content]
    # Stem words
    stemmer = DutchStemmer()
    content = [stemmer.stem(word) for word in content]
    return content
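# Example usage of this tokenize variant (a sketch; assumes re, nltk's regexp_tokenize and
# DutchStemmer are imported as in the snippet above; the input string is illustrative):
if __name__ == '__main__':
    raw = 'quote Dit is een aangehaald bericht\nDank u wel, voorzitter.'
    print(tokenize(raw))
    # The quoted line is stripped by the regex, the remainder is split on word
    # characters and every token is stemmed with the DutchStemmer.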
def __init__(self, mallet_path, num_topics, corpuspath, dictpath, modelpath):
    self.num_topics = num_topics
    self.mallet_path = mallet_path
    self.corpuspath = corpuspath
    self.modelpath = modelpath
    self.dictpath = dictpath

    self.stopwords = stopwords.words('dutch')
    extra = [
        'mening', 'gevolgen', 'vragen', 'stelling', 'bericht', 'bekend',
        'bereid', 'voornemens'
    ]
    self.stopwords.extend(extra)
    self.stopwords = set(self.stopwords)
    self.stemmer = DutchStemmer()

    # If the model, dictionary and corpus exist on disk, load them.
    try:
        self.model = gensim.models.ldamodel.LdaModel.load(modelpath)
    except FileNotFoundError:
        pass
    try:
        self.dictionary = corpora.Dictionary.load(dictpath)
        # with open(dictpath, 'rb') as file:
        #     self.dictionary = pickle.load(file)
    except FileNotFoundError:
        pass
    try:
        with open(corpuspath, 'rb') as file:
            self.corpus = pickle.load(file)
    except FileNotFoundError:
        pass
def _init_lookup(self):
    nltk.download('stopwords')

    # init stemmer
    self.stemmer = DutchStemmer(ignore_stopwords=True)
    self.stop_words = set(nltk.corpus.stopwords.words('dutch'))
class TextClassifier:
    _text = 'Text'
    _main = 'Main'
    _middle = 'Middle'
    _sub = 'Sub'
    _lbl = 'Label'
    model = None

    def __init__(self, *args, **kwargs):
        load_from_disk = kwargs.get('model_from_disk')
        self._init_lookup()
        if load_from_disk:
            self._init_model(load_from_disk)

    def _init_lookup(self):
        nltk.download('stopwords')

        # init stemmer
        self.stemmer = DutchStemmer(ignore_stopwords=True)
        self.stop_words = set(nltk.corpus.stopwords.words('dutch'))

    def _init_model(self, file):
        self.model = joblib.load(file)

    def pickle(self, obj, file):
        joblib.dump(obj, file)

    def export_model(self, file):
        joblib.dump(self.model, file)

    def preprocessor(self, text):
        text = text.lower()
        text = re.sub("\\W", " ", text)  # remove special chars

        # stem words
        words = re.split("\\s+", text)
        stemmed_words = [self.stemmer.stem(word=word) for word in words]
        return ' '.join(stemmed_words)

    def load_data(self, csv_file, frac=1):
        df = pd.read_csv(csv_file, sep=None, engine='python')
        df = df.dropna(axis=0,
                       how='any',
                       thresh=None,
                       subset=[self._text, self._main, self._middle, self._sub],
                       inplace=False)

        # cleanup dataset
        df = df.drop_duplicates(subset=[self._text], keep='first')

        # for dev use only a subset (for speed purposes)
        df = df.sample(frac=frac).reset_index(drop=True)

        # construct unique label
        df[self._lbl] = df[self._main] + "|" + df[self._middle] + "|" + df[self._sub]

        # keep only labels with more than 50 examples
        number_of_examples = df[self._lbl].value_counts().to_frame()
        df['is_bigger_than_50'] = df[self._lbl].isin(
            number_of_examples[number_of_examples[self._lbl] > 50].index)
        df = df[df['is_bigger_than_50'] == True]
        # The example dataset is not large enough to train a good classification model
        return df

    def make_data_sets(self, df, split=0.9, columns=['Middle', 'Sub']):
        texts = df[self._text]
        labels = df[columns].apply('|'.join, axis=1)

        # Splitting data
        splitpoint = int(split * len(texts))

        # train data
        train_texts = texts[:splitpoint]
        train_labels = labels[:splitpoint]

        # test data
        test_texts = texts[splitpoint:]
        test_labels = labels[splitpoint:]

        return texts, labels, train_texts, train_labels, test_texts, test_labels

    def fit(self, train_texts, train_labels):
        pipeline = Pipeline([
            ('vect', CountVectorizer(preprocessor=self.preprocessor,
                                     stop_words=self.stop_words)),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression()),
        ])

        # multiple hyperparameters, slow training, better optimization
        parameters_slow = {
            'clf__class_weight': (None, 'balanced'),
            'clf__max_iter': (300, 500),  # 500, 1000
            'clf__penalty': ('l1',),  # 'l2'
            'clf__multi_class': ('auto',),
            'clf__solver': ('liblinear',),  # 'lbfgs'
            'tfidf__norm': ('l2',),  # 'l1'
            'tfidf__use_idf': (False,),
            'vect__max_df': (1.0,),
            'vect__max_features': (None,),
            'vect__ngram_range': ((1, 1), (1, 2))
        }

        # single hyperparameters, fast training, no optimization
        parameters_fast = {
            'clf__class_weight': (None,),  # 'balanced'
            'clf__max_iter': (300,),  # 500, 1000
            'clf__penalty': ('l1',),  # 'l2'
            # 'clf__multi_class': ('auto',),
            'clf__solver': ('liblinear',),  # 'lbfgs'
            'tfidf__norm': ('l2',),  # 'l1'
            'tfidf__use_idf': (False,),
            'vect__max_df': (1.0,),
            'vect__max_features': (None,),
            'vect__ngram_range': ((1, 1),)  # (1, 2)
        }

        # parameters_slow is used here; switch to parameters_fast for quicker, unoptimized training
        grid_search = GridSearchCV(pipeline,
                                   parameters_slow,
                                   verbose=True,
                                   n_jobs=psutil.cpu_count(logical=False),
                                   cv=5)
        grid_search.fit(train_texts, train_labels)

        self.model = grid_search
        return grid_search

    def validate_model(self, test_texts, test_labels, dst_file, dst_csv, dst_validation=None):
        from sklearn.metrics import precision_score, recall_score, accuracy_score, plot_confusion_matrix
        import matplotlib.pyplot as plt

        test_predict = self.model.predict(test_texts)

        precision = str(
            round(
                precision_score(test_labels, test_predict, average='macro', zero_division=0), 2))
        recall = str(
            round(recall_score(test_labels, test_predict, average='macro'), 2))
        accuracy = str(round(accuracy_score(test_labels, test_predict), 2))

        # plot and save the confusion matrix
        plt.rcParams["figure.figsize"] = (30, 30)
        disp = plot_confusion_matrix(self.model,
                                     test_texts,
                                     test_labels,
                                     cmap=plt.cm.Blues,
                                     normalize=None,
                                     xticks_rotation='vertical')
        plt.savefig(dst_file)

        df2 = pd.DataFrame(disp.confusion_matrix, columns=disp.display_labels)
        df2.to_csv(dst_csv)

        # optionally write out the misclassified examples
        if dst_validation:
            with open(dst_validation, 'w') as csvfile:
                fieldnames = ['Text', 'predicted_category', 'actual_category']
                writer = csv.DictWriter(csvfile,
                                        fieldnames=fieldnames,
                                        quoting=csv.QUOTE_NONNUMERIC)
                writer.writeheader()
                for input, prediction, label in zip(test_texts, test_predict, test_labels):
                    if prediction != label:
                        writer.writerow({
                            'Text': re.sub("\\W", " ", input),
                            'predicted_category': prediction,
                            'actual_category': label
                        })

        return test_predict, precision, recall, accuracy
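# Example end-to-end use of TextClassifier (a sketch; the file names are hypothetical
# placeholders and the pandas/sklearn/nltk/joblib imports used by the class are assumed
# to be in scope):
if __name__ == '__main__':
    clf = TextClassifier()
    df = clf.load_data('labelled_texts.csv')
    texts, labels, train_texts, train_labels, test_texts, test_labels = clf.make_data_sets(df)
    clf.fit(train_texts, train_labels)
    predictions, precision, recall, accuracy = clf.validate_model(
        test_texts, test_labels, 'confusion_matrix.png', 'confusion_matrix.csv')
    clf.export_model('text_classifier.joblib')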
def __init__(self):
    from nltk.stem.snowball import DutchStemmer
    self.stemmer = DutchStemmer()
class PorterStemmer(object):
    def __init__(self):
        from nltk.stem.snowball import DutchStemmer
        self.stemmer = DutchStemmer()

    def stem(self, w):
        """Stem the word `w`.

        Parameters
        ----------
        w : str

        Returns
        -------
        str
            Stemmed version of `w`.

        Examples
        --------
        >>> from samenvattr.parsing.porter import PorterStemmer
        >>> p = PorterStemmer()
        >>> p.stem("ponies")
        'poni'

        """
        w = w.lower()

        if len(w) <= 2:
            # --DEPARTURE--
            # With this line, strings of length 1 or 2 don't go through the
            # stemming process, although no mention is made of this in the
            # published algorithm. Remove the line to match the published
            # algorithm.
            return w

        return self.stemmer.stem(w)

    def stem_sentence(self, txt):
        """Stem the sentence `txt`.

        Parameters
        ----------
        txt : str
            Input sentence.

        Returns
        -------
        str
            Stemmed sentence.

        Examples
        --------
        >>> from samenvattr.parsing.porter import PorterStemmer
        >>> p = PorterStemmer()
        >>> p.stem_sentence("Wow very nice woman with apple")
        'wow veri nice woman with appl'

        """
        return " ".join(self.stemmer.stem(x) for x in txt.split())

    def stem_documents(self, docs):
        """Stem documents.

        Parameters
        ----------
        docs : list of str
            Input documents.

        Returns
        -------
        list of str
            Stemmed documents.

        Examples
        --------
        >>> from samenvattr.parsing.porter import PorterStemmer
        >>> p = PorterStemmer()
        >>> p.stem_documents(["Have a very nice weekend", "Have a very nice weekend"])
        ['have a veri nice weekend', 'have a veri nice weekend']

        """
        return [self.stem_sentence(x) for x in docs]
####
import fasttext as ft
import pandas as pd
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import DutchStemmer

# uncomment if needed
# nltk.download("stopwords")

stop_words = set(stopwords.words("dutch"))

# load the trained word embedding model
word_emb = ft.load_model("model42.bin")

ds = DutchStemmer()

annot_dict = pickle.load(open("FinalAnnotations.p", "rb"))

counter_nested = list()


# This function will be made better accessible through booleans once a final application is
# needed; for now it is run several times with different pieces commented/uncommented.
def preprocess(sent):
    """
    Split text into tokens, ignore non-alphanumerical tokens, stem each token
    and drop it if it is in stop_words.

    Keyword argument:
    sent: the text to preprocess (string)
    """
    tokens = word_tokenize(sent)
DESCRIPTION: -
REQUIRES: -
USEFUL: -
Last Updated: 11-05-2018
"""
import gensim
import spacy
from nltk.stem.snowball import DutchStemmer
import json
import os
import re
from gensim import corpora
from nltk import ngrams as ng

stemmer = DutchStemmer()

BASE_URL = "http://api.genius.com"
file_path = os.getcwd() + "/../api_key_genius"
file = open(file_path, 'r', encoding='utf-8')
TOKEN = file.readline()
HEADERS = {'Authorization': "Bearer " + TOKEN}

# TODO implement http://anthology.aclweb.org/C/C14/C14-1059.pdf
# Fell, M., & Sporleder, C. (2014). Lyrics-based Analysis and Classification of Music.
# In Proceedings of COLING 2014, the 25th International Conference on Computational Linguistics:
# Technical Papers (pp. 620-631).


def generate_word_model():
    'do nothing'
def __init__(self, y_labels=None):
    self.y_labels = y_labels
    # split vectorizer and estimator to catch the zero-matching-tokens case
    self.vectorizer = None
    self.estimator = None
    self.stemmer = DutchStemmer()  # initialize stemmer
class PartyClassifier:
    def __init__(self, y_labels=None):
        self.y_labels = y_labels
        # split vectorizer and estimator to catch the zero-matching-tokens case
        self.vectorizer = None
        self.estimator = None
        self.stemmer = DutchStemmer()  # initialize stemmer

    def fit(self, X, y):
        self.vectorizer = TfidfVectorizer(input='content',
                                          encoding='utf-8',
                                          decode_error='strict',
                                          strip_accents='unicode',
                                          lowercase=True,
                                          preprocessor=None,
                                          tokenizer=self.__tokenize,
                                          analyzer='word',
                                          stop_words=stopwords.words('dutch'),
                                          ngram_range=(1, 3),
                                          max_df=0.5,
                                          min_df=1,
                                          max_features=None,
                                          vocabulary=None,
                                          binary=False,
                                          dtype=np.int64,
                                          norm='l2',
                                          use_idf=True,
                                          smooth_idf=True,
                                          sublinear_tf=False)
        self.estimator = Pipeline(
            steps=[('topic_model',
                    TruncatedSVD(n_components=100,
                                 algorithm='randomized',
                                 n_iter=10,
                                 random_state=12,
                                 tol=0.0)),
                   ('classifier',
                    LogisticRegression(multi_class='multinomial',
                                       class_weight='balanced',
                                       solver='lbfgs'))])
        self.estimator.fit(self.vectorizer.fit_transform(X), y)
        return self

    def predict(self, X):
        return self.predict_proba(X)

    def predict_proba(self, X):
        n_labels = len(self.y_labels)
        n_samples = len(X)
        X_vectorized = self.vectorizer.transform(X)
        # output equal probabilities in case of zero matching tokens
        if X_vectorized.getnnz() > 0:
            return self.estimator.predict_proba(X_vectorized)
        else:
            return np.ones([n_samples, n_labels]).astype(float) / n_labels

    def __tokenize(self, text):
        """Converts text to tokens."""
        tokens = word_tokenize(text, language='dutch')
        tokens = filter(lambda x: len(x) > 1, tokens)
        stemmed = []
        for item in tokens:
            stemmed.append(self.stemmer.stem(item))
        return stemmed
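# Example use of PartyClassifier (a sketch; 'speeches' and 'parties' are hypothetical lists of
# Dutch texts and party labels, large enough for the 100-component TruncatedSVD step, and the
# sklearn/nltk/numpy imports used by the class are assumed to be in scope):
#
#     clf = PartyClassifier(y_labels=sorted(set(parties)))
#     clf.fit(speeches, parties)
#     probabilities = clf.predict_proba(['wij verlagen de inkomstenbelasting'])
#     # returns per-party probabilities, or a uniform distribution if no tokens match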
    return text


vocabulary = dict()
inverse_vocabulary = ['<unk>']
# '<unk>' will never be used, it is only a placeholder for the [0, 0, ..., 0] embedding

print("loading word2vec")
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=False)
print("done loading word2vec")

questions_cols = ['question1', 'question2']

stemmer = DutchStemmer()

# Iterate over the questions only of both training and test datasets
for dataset in [train_df]:
    for index, row in dataset.iterrows():
        # Iterate through the text of both questions of the row
        for question in questions_cols:
            q2n = []  # q2n -> question numbers representation
            for word in text_to_word_list(row[question]):
                word = stemmer.stem(word)
                # Check for unwanted words
                if word in stops and word not in word2vec.vocab:
                    continue