import collections
import csv
import math
import os
from itertools import islice

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import constant

# Make sure the stop-word corpus is available before `stopwords.words` is
# called on the lines below.
nltk.download("stopwords")

stemmer = SnowballStemmer('russian')
# NOTE(review): this sets an attribute on the SnowballStemmer wrapper object.
# Whether `stemmer.stem()` consults it depends on NLTK internals -- if the
# intent is to leave stop words unstemmed, `ignore_stopwords=True` in the
# constructor is the documented switch. Confirm before relying on it.
stemmer.stopwords = stopwords.words("russian")


def get_data(*, max_rows: int = 800, message_column: int = 4) -> list:
    """Read one column of messages from every CSV file in ``constant.PATH``.

    Each file is opened with ``constant.ENCODING``, its first row is skipped
    as a header, and at most ``max_rows`` of the following rows are examined.

    Args:
        max_rows: Maximum number of data rows inspected per file.
        message_column: Zero-based index of the column to extract.

    Returns:
        A list with one entry per file (in ``os.listdir`` order); each entry
        is the list of ``str`` values found in ``message_column``.  Files
        that are empty or contain only a header contribute an empty list.

    Raises:
        OSError: If the directory or one of its files cannot be read.
    """
    # NOTE(review): the previous version fell off the end (returning None)
    # despite the `-> list` annotation, and initialised `label`, `labels`
    # and `messages` without using them.  Returning the collected groups is
    # the minimal fix -- confirm against callers.
    messageGroups = []
    for name in os.listdir(constant.PATH):
        # os.path.join works whether or not PATH ends with a separator.
        path = os.path.join(constant.PATH, name)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as CSV; skip them.
            continue
        with open(path, newline='', encoding=constant.ENCODING) as csvfile:
            reader = csv.reader(csvfile)
            # Skip the header; the default avoids StopIteration on an
            # empty file.
            next(reader, None)
            # islice stops after `max_rows` rows instead of loading the
            # whole file.  Rows too short to have the requested column
            # (e.g. blank lines) are ignored rather than crashing.
            messageGroup = [
                row[message_column]
                for row in islice(reader, max_rows)
                if len(row) > message_column
            ]
        messageGroups.append(messageGroup)
    return messageGroups
def get_stemmer():
    """Build a Spanish Snowball stemmer carrying the module's stop words.

    Returns:
        A new ``SnowballStemmer('spanish')`` whose ``stopwords`` attribute
        has been set to ``set(STOP_WORDS)``.
    """
    spanish_stemmer = SnowballStemmer('spanish')
    # A fresh set is built on every call, so callers never share one
    # mutable stop-word collection.
    stop_word_set = set(STOP_WORDS)
    spanish_stemmer.stopwords = stop_word_set
    return spanish_stemmer