Ejemplo n.º 1
0
import os
import csv
import math
import nltk
import constant
import numpy as np
import collections
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Fetch the NLTK "stopwords" resource. This sits at module top level, so it
# runs on every import of this module.
nltk.download("stopwords")
# Module-level Snowball stemmer for Russian.
stemmer = SnowballStemmer('russian')
# Store NLTK's Russian stop-word list on the stemmer object.
# NOTE(review): SnowballStemmer also accepts `ignore_stopwords=True` at
# construction; confirm whether this attribute is only meant as storage for
# later filtering or whether stem() is expected to honour it.
stemmer.stopwords = stopwords.words("russian")


def get_data() -> list:
    """Collect column 4 of every CSV file listed under ``constant.PATH``.

    For each directory entry the first CSV row is skipped, at most the next
    800 rows are kept, and the value at column index 4 of each kept row is
    gathered into one list per file (appended to ``messageGroups``).

    NOTE(review): the visible body ends inside the loop. There is no
    ``return`` statement although the signature is annotated ``-> list``,
    and ``label``, ``labels`` and ``messages`` are initialised but not used
    in the visible part. The rest of the function appears to be missing from
    this excerpt, so the return value is not described here.
    """
    label = 0
    labels = list()
    messages = list()
    messageGroups = list()
    for names in os.listdir(constant.PATH):
        # The path is built by plain string concatenation.
        # Assumes constant.PATH ends with a path separator -- TODO confirm.
        with open(constant.PATH + names,
                  newline='',
                  encoding=constant.ENCODING) as csvfile:
            reader = csv.reader(csvfile)
            # Discard the first row (presumably a header -- verify against the data).
            next(reader)
            # Keep at most 800 rows, then take column index 4 of each.
            # The `[:, 4]` indexing needs a 2-D array: assumes every kept row
            # has the same number of fields (at least 5) -- TODO confirm.
            messageGroup = list(np.array(list(reader)[:800])[:, 4])
            messageGroups.append(messageGroup)
Ejemplo n.º 2
0
def get_stemmer():
    """Return a new Spanish ``SnowballStemmer`` with ``STOP_WORDS`` attached.

    A fresh stemmer is created on every call. Its ``stopwords`` attribute is
    set to a ``set`` built from the module-level ``STOP_WORDS`` collection.
    """
    spanish_stemmer = SnowballStemmer('spanish')
    # Build the set after the stemmer exists, then attach it as an attribute.
    stop_word_set = set(STOP_WORDS)
    spanish_stemmer.stopwords = stop_word_set
    return spanish_stemmer