# -*- coding: utf-8 -*-
from utils.utils import Loader
from utils.preprocessing import Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from utils.oddsRatio import OddsRatioCloud
from time import time
import spacy
from nltk.corpus import stopwords

# Read the "learn" corpus file. Loader.load_pres returns a pair that is
# unpacked into train_x / train_y (presumably the documents and their
# labels — confirm in utils.utils).
fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
train_x, train_y = Loader.load_pres(fname)

# French stop-word list from NLTK, copied into a fresh list.
# Extra entries that were tried at some point: ['cet', 'cette', 'là']
stop = list(stopwords.words('french'))

# Options handed to Preprocessing.preprocessing by f() below.
# NOTE(review): the meaning of each key is defined in utils.preprocessing,
# which is not visible here — check there before changing a value.
params = {
    "lowercase": False,
    "punct": False,
    "marker": False,
    "number": False,
    "stemming": Preprocessing.lem,  # alternative: Preprocessing.stem
    "ligne": None,
    "strip_accents": False,
    "stopwords": stop,  # alternative: set(stop)
}


def f(x):
    """Return ``Preprocessing.preprocessing(x, params)``.

    This is a named function rather than a lambda bound to ``f`` so that it
    has a real ``__name__`` and can be pickled by reference along with any
    object that stores it (e.g. a vectorizer's ``preprocessor``).

    ``params`` is resolved from the module namespace at call time, exactly
    as the previous lambda did, so rebinding the module-level ``params``
    changes the options used by later calls.

    Args:
        x: The raw item to preprocess (presumably one document string —
            confirm against Preprocessing.preprocessing).

    Returns:
        Whatever ``Preprocessing.preprocessing`` returns for ``x``.
    """
    return Preprocessing.preprocessing(x, params)
#%%


# Count-based vectorizer wired to the project's preprocessing: f is supplied
# as the preprocessor and Preprocessing.token_pattern as the token regex.
# NOTE(review): scikit-learn documents that a custom preprocessor replaces its
# built-in lowercase/strip_accents stage — confirm lowercase=False is only
# kept here for explicitness.
vectorizer = CountVectorizer(
    preprocessor=f,
    lowercase=False,
    token_pattern=Preprocessing.token_pattern,
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model as lin
from sklearn import svm
import sklearn.naive_bayes as nb

from wordcloud import WordCloud
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
from time import time
import spacy
import numpy as np
import pickle
# Path of the "learn" corpus file read on the next line.
fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
# Loader.load_pres returns a pair, unpacked here into alltxts / alllabs
# (presumably the documents and their labels — confirm in utils.utils).
alltxts, alllabs = Loader.load_pres(fname)

params = {
    "lowercase": [False, True],
    "punct": [False, True],
    "marker": [False, True],
    "number": [False, True],
    "stemming": [False, Preprocessing.stem],
    "ligne": [None, -2, 0],
    "strip_accents": [False, True],
    "stopwords": [None, stop],  # set(STOPWORDS)],
    "Vectorizer": [CountVectorizer, TfidfVectorizer],
    "binary": [True, False],
    "class_weight": ["balanced", None],
    "max_features": [None, 10000, 7000],
    "ngram_range": [(1, 1), (1, 2)],
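# Example no. 3
# (listing header left over from the page these snippets were copied from,
#  followed there by a bare "0"; note that the `params` dict opened above is
#  never closed in this listing)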
from utils.utils import Loader

# Read the "learn" corpus file. Loader.load_pres returns a pair, unpacked
# into alltxts / alllabs (presumably documents and labels — confirm in
# utils.utils).
fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts,alllabs = Loader.load_pres(fname)


# fname is rebound to the "test" corpus file, so after this point it no
# longer refers to the learn file. The second pair is stored under the
# *_test names, leaving alltxts / alllabs untouched.
fname = "Data/AFDpresidentutf8/corpus.tache1.test.utf8"
alltxts_test,alllabs_test = Loader.load_pres(fname)

# Disabled scratch code kept inside a bare triple-quoted string: the string is
# a standalone expression statement, so nothing inside it is executed.
# It contains sanity-check prints (lengths, first and last loaded items) and
# an alternative load of the movies data through Loader.load_movies, which
# would rebind alltxts / alllabs if re-enabled.
'''
print(len(alltxts),len(alllabs))
print(alltxts[0])
print(alllabs[0])
print(alltxts[-1])
print(alllabs[-1])

path = "Data/AFDmovies/movies1000/"
alltxts,alllabs = Loader.load_movies(path)
'''