Example #1
import numpy as np

def training_sample_adasyn(df, n=200, mincount=7):
    # vectorize the dataframe into a feature matrix X and label vector Y
    (X, Y) = rf_vectorizer(df)
    Xt = []
    Yt = []
    # oversample each category with ADASYN to get n synthetic rows per class
    for i, cat in enumerate(np.unique(Y)):
        print('adasyn:', i)
        Xt.append(adasyn_sample(X, Y, cat, K=5, n=n))
        Yt.append([cat] * Xt[-1].shape[0])
    Xt = np.vstack(Xt)
    Yt = np.concatenate(Yt)
    # shuffle samples and labels with the same permutation
    shuffle = np.random.permutation(len(Yt))
    Xt = Xt[shuffle, :]
    Yt = Yt[shuffle]
    return Xt, Yt
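The adasyn_sample helper itself is not shown in these examples. Below is a minimal, hypothetical sketch of what it might look like, assuming a dense feature matrix X and SMOTE-style interpolation between each class member and one of its K nearest neighbours (full ADASYN would additionally weight each seed point by the share of other-class points among its neighbours):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def adasyn_sample(X, Y, cat, K=5, n=200):
    # hypothetical sketch, not the original implementation
    Xc = X[Y == cat]                      # rows belonging to the target class
    if Xc.shape[0] <= K:
        return Xc                         # too few points to find K neighbours
    nn = NearestNeighbors(n_neighbors=K + 1).fit(Xc)
    _, idx = nn.kneighbors(Xc)            # idx[:, 0] is each point itself
    out = []
    for _ in range(n):
        i = np.random.randint(Xc.shape[0])         # pick a seed sample
        j = idx[i, np.random.randint(1, K + 1)]    # one of its K neighbours
        gap = np.random.rand()                     # interpolation factor
        out.append(Xc[i] + gap * (Xc[j] - Xc[i]))  # synthetic point on the segment
    return np.vstack(out)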
Example #2
import numpy as np

def training_sample_adasyn(df, vec, N=200, mincount=7):
    # transform the text with an already-fitted vectorizer
    X = vec.transform(df.txt)
    Y = df.Categorie3.values
    Xt = []
    Yt = []
    # oversample each category with ADASYN to get N synthetic rows per class
    for i, cat in enumerate(np.unique(Y)):
        print('adasyn:', i)
        Xt.append(adasyn_sample(X, Y, cat, K=5, n=N))
        Yt.append([cat] * Xt[-1].shape[0])
    Xt = np.vstack(Xt)
    Yt = np.concatenate(Yt)
    # shuffle samples and labels with the same permutation
    shuffle = np.random.permutation(len(Yt))
    Xt = Xt[shuffle, :]
    Yt = Yt[shuffle]
    return Xt, Yt
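A hypothetical call site for this second variant, assuming df has txt and Categorie3 columns and that a TF-IDF vectorizer is an acceptable stand-in for the one used in the original code:

from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer().fit(df.txt)              # fitted on the training text
Xt, Yt = training_sample_adasyn(df, vec, N=200)  # balanced, shuffled training set
print(Xt.shape, Yt.shape)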
Example #3
import numpy as np
from scipy import sparse
import joblib

dfsample.to_csv(ddir+'training_sup9.csv', sep=';', index=False, header=False)

Y = dfsample.Categorie3.values
ID = dfsample.Identifiant_Produit.values
print('vectorizing...')
vec, X = vectorizer(dfsample.txt)
print('dumping...')
joblib.dump((vec, ID, X, Y), ddir+'joblib/vecIDXY')
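# NOTE: the vectorizer() helper used above is not defined in this snippet.
# A plausible sketch, assuming a TF-IDF bag-of-words representation
# (the real feature extraction may differ), kept commented out so the
# script above remains unchanged:
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#
#   def vectorizer(txt):
#       vec = TfidfVectorizer(min_df=2)   # min_df value is an assumption
#       X = vec.fit_transform(txt)        # sparse document-term matrix
#       return vec, X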

# use adasyn to get synthetic balanced dataset

Xt = []
Yt = []
for i, cat in enumerate(np.unique(Y)):
    print('adasyn:', i)
    Xt.append(adasyn_sample(X, Y, cat, K=5, n=200))
    Yt.append([cat] * Xt[-1].shape[0])

Xt = sparse.vstack(Xt).tocsr()     # CSR so rows can be re-indexed below
Yt = np.concatenate(Yt)            # flatten the per-category label lists
assert Xt.shape[0] == len(Yt)
# shuffle samples and labels with the same permutation
rows = np.random.permutation(Xt.shape[0])
Xt = Xt[rows]
Yt = Yt[rows]
joblib.dump((vec, Xt, Yt), ddir+'joblib/vecXtYt')

#################################################
# TRAINING START HERE
#################################################


(vec,X,Y) = joblib.load(ddir+'joblib/vecXtYt_200')
# map level-3 categories to their level-1 parent category
Z = np.array([cat3tocat1[c] for c in Y])
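# The downstream training code is not included here; a minimal sketch,
# assuming a linear classifier fitted on the balanced matrix X against the
# level-1 labels Z (classifier choice and parameters are assumptions):
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X, Z)                                   # works with a sparse TF-IDF matrix
print('training accuracy:', clf.score(X, Z))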