# -*- coding: utf-8 -*-
"""Build a bag-of-words vectorizer over the presidential-speech training corpus.

The script loads the training split, assembles the option dictionary handed
to ``Preprocessing.preprocessing`` and wires that preprocessing step into a
``CountVectorizer``.
"""
from utils.utils import Loader
from utils.preprocessing import Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from utils.oddsRatio import OddsRatioCloud
from time import time
import spacy
from nltk.corpus import stopwords

# Training split of the corpus; unpacked into two values
# (presumably texts and labels -- confirm against Loader.load_pres).
fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
train_x, train_y = Loader.load_pres(fname)

# French stop-word list from NLTK.
# Disabled variant: extend it with ['cet', 'cette', 'là'].
stop = list(stopwords.words('french'))

# Options forwarded verbatim to Preprocessing.preprocessing.
# NOTE(review): the meaning of each key is defined in utils.preprocessing.
params = {
    "lowercase": False,
    "punct": False,
    "marker": False,
    "number": False,
    "stemming": Preprocessing.lem,  # disabled alternative: Preprocessing.stem
    "ligne": None,
    "strip_accents": False,
    "stopwords": stop,  # disabled alternative: set(stop)
}


def f(x):
    """Run the project preprocessing on ``x`` with the module-level ``params``."""
    return Preprocessing.preprocessing(x, params)


#%%
# Casing is left to the custom preprocessor (lowercase=False); tokens are
# extracted with the project's own token pattern.
vectorizer = CountVectorizer(
    preprocessor=f,
    lowercase=False,
    token_pattern=Preprocessing.token_pattern,
)
from sklearn.feature_extraction.text import CountVectorizer from sklearn.model_selection import train_test_split from sklearn import linear_model as lin from sklearn import svm import sklearn.naive_bayes as nb from wordcloud import WordCloud from nltk.corpus import stopwords import matplotlib.pyplot as plt from time import time import spacy import numpy as np import pickle fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8" alltxts, alllabs = Loader.load_pres(fname) params = { "lowercase": [False, True], "punct": [False, True], "marker": [False, True], "number": [False, True], "stemming": [False, Preprocessing.stem], "ligne": [None, -2, 0], "strip_accents": [False, True], "stopwords": [None, stop], # set(STOPWORDS)], "Vectorizer": [CountVectorizer, TfidfVectorizer], "binary": [True, False], "class_weight": ["balanced", None], "max_features": [None, 10000, 7000], "ngram_range": [(1, 1), (1, 2)],
from utils.utils import Loader

# --- Training split ---------------------------------------------------------
# Loader.load_pres is unpacked into two values
# (presumably texts and labels -- confirm against utils.utils).
fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts, alllabs = Loader.load_pres(fname)

# --- Test split -------------------------------------------------------------
# `fname` is deliberately rebound, so it ends up pointing at the test file.
fname = "Data/AFDpresidentutf8/corpus.tache1.test.utf8"
alltxts_test, alllabs_test = Loader.load_pres(fname)

# Disabled debugging / alternative-dataset snippet, kept as a bare string
# expression so it has no effect at run time.
'''
print(len(alltxts),len(alllabs))
print(alltxts[0])
print(alllabs[0])
print(alltxts[-1])
print(alllabs[-1])
path = "Data/AFDmovies/movies1000/"
alltxts,alllabs = Loader.load_movies(path)
'''