Exemple #1
0
            if x == -1 or x == 0:
                f.write(str(classes[0]))
            else:
                f.write(str(classes[1]))
            f.write("\n")
    return res


"""      
X_train,Y_train = Loader.load_pres(fname)
X_test, _ = Loader.load_pres(tname)

result = predict(X_train, Y_train, X_test, save = "auteurs.txt", classes = ["M","C"], post_processing=True)

fig,ax = plt.subplots(figsize=(35,100)) 
ax.imshow(result.reshape(54,-1),interpolation="nearest")
"""
# plt.tight_layout()

X_train, Y_train = Loader.load_movies(fname_2)
X_test = Loader.load_movies_test(tname_2)

result_sent = predict(X_train,
                      Y_train,
                      X_test,
                      params=params_sentiments,
                      save="sentiments.txt",
                      classes=["-1", "1"],
                      post_processing=False,
                      equilibrage=False)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model as lin
from sklearn import svm
import sklearn.naive_bayes as nb

from wordcloud import WordCloud
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
from time import time
import spacy
import numpy as np
import pickle
fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts, alllabs = Loader.load_pres(fname)

params = {
    "lowercase": [False, True],
    "punct": [False, True],
    "marker": [False, True],
    "number": [False, True],
    "stemming": [False, Preprocessing.stem],
    "ligne": [None, -2, 0],
    "strip_accents": [False, True],
    "stopwords": [None, stop],  # set(STOPWORDS)],
    "Vectorizer": [CountVectorizer, TfidfVectorizer],
    "binary": [True, False],
    "class_weight": ["balanced", None],
    "max_features": [None, 10000, 7000],
    "ngram_range": [(1, 1), (1, 2)],
# -*- coding: utf-8 -*-
from utils.utils import Loader
from utils.preprocessing import Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from utils.oddsRatio import OddsRatioCloud
from time import time
import spacy
from nltk.corpus import stopwords

fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
train_x,train_y = Loader.load_pres(fname)

stop = list(stopwords.words('french')) # + ['cet', 'cette', 'là']
params = {
    "lowercase":False,
    "punct":False,
    "marker":False,
    "number":False,
    "stemming": Preprocessing.lem, # Preprocessing.stem,
    "ligne": None,
    "strip_accents":False,
    "stopwords": stop # set(stop)
}
f = lambda x: Preprocessing.preprocessing(x,params)
#%%


vectorizer = CountVectorizer(preprocessor = f,lowercase=False,token_pattern = Preprocessing.token_pattern)
Exemple #4
0
from utils.utils import Loader

fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts,alllabs = Loader.load_pres(fname)


fname = "Data/AFDpresidentutf8/corpus.tache1.test.utf8"
alltxts_test,alllabs_test = Loader.load_pres(fname)

'''
print(len(alltxts),len(alllabs))
print(alltxts[0])
print(alllabs[0])
print(alltxts[-1])
print(alllabs[-1])

path = "Data/AFDmovies/movies1000/"
alltxts,alllabs = Loader.load_movies(path)
'''


from sklearn.linear_model import LogisticRegression

stop = list(stopwords.words('english'))
stop = list(
    set(stop) - {
        "no", "not", "nor"
        'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
        'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
        "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
        "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
        "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
        'wouldn', "wouldn't", 'don', "don't", 'should', "should've"
    })

fname = "Data/AFDmovies/movies1000/"
alltxts, alllabs = Loader.load_movies(fname)
alltxts = np.array(alltxts)
alllabs = np.array(alllabs)

params = {
    # lowercase":[False,True],
    "punct": [False, True],
    # "marker":[False,True],
    # "number":[False,True],
    "stemming": [False, Preprocessing.stem_eng],  #,Preprocessing.stem],
    "ligne": [None, -2, 0],
    # "strip_accents":[False,True], #
    "stopwords": [None, stop],  # set(STOPWORDS)],
    "Vectorizer": [CountVectorizer, TfidfVectorizer],
    # "binary": [False,True],
    # "class_weight": [[0.1,1]],# ["balanced"],
Exemple #6
0
# -*- coding: utf-8 -*-
from utils.utils import Loader
from utils.preprocessing import Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from utils.oddsRatio import OddsRatioCloud
from time import time
from nltk.corpus import stopwords
from utils.scoring import get_vectorizer

fname = "Data/AFDmovies/movies1000/"
train_x, train_y = Loader.load_movies(fname)

stop = list(stopwords.words('english'))
stop = list(
    set(stop) - {
        "no", "not", "nor"
        'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
        'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
        "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
        "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn',
        "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
        'wouldn', "wouldn't", 'don', "don't", 'should', "should've"
    })

params = {
    "lowercase": False,
    "punct": False,
    # "marker":False,