Example #1
def index_learning(dftrain,rayon,cat1,F1score=0.8):
    # select Categorie1 from rayon and training set
    ry = rayon[rayon.Categorie1 == cat1].copy()
    df = dftrain[dftrain.Categorie1 == cat1].copy()
    ry['txt'] = map(normalize_txt,ry.Categorie3_Name)
    add_txt(df)
    # vectorize Categorie3_Name as index
    vec = TfidfVectorizer(stop_words=None, min_df=1, max_features=None,
                          smooth_idf=True, norm='l2', sublinear_tf=False,
                          use_idf=True, ngram_range=(1,3))
    Xr = vec.fit_transform(ry.txt)
    Xt = vec.transform(df.txt)
    # compute distance from sample to index
    D = pairwise_distances(Xt,Xr,metric='cosine')
    a = np.argmin(D,axis=1)
    df['D'] = D[range(len(a)),a]
    df['guess'] = ry.Categorie3.values[a]
    # for each Categorie3, record the smallest distance threshold d at which
    # the guesses reach an F1 score above F1score
    Dmin = {}
    for d in np.linspace(0,1,21):
        Yr = df[df.D<d].guess
        Yt = df[df.D<d].Categorie3
        fs = f1_score(Yt,Yr,labels=ry.Categorie3,average=None)
        for i in np.nonzero(fs > F1score)[0]:
            cat3 = ry.Categorie3.values[i]
            if cat3 in Dmin:
                continue
            Dmin[cat3] = d
    joblib.dump((vec,Dmin,F1score),ddir+'joblib/index_'+str(cat1))
    del ry,df
    return vec,Dmin,F1score
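For context, a minimal driver sketch, assuming dftrain and rayon frames loaded as in the training script further down (hypothetical loop, not from the source):

# hypothetical: build and persist one index per Categorie1
for cat1 in np.unique(dftrain.Categorie1):
    vec, Dmin, F1score = index_learning(dftrain, rayon, cat1)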
Example #2
def index_guessing(dfsample,rayon,cat1,vec,Dmin,default=None):
    if 'guess' not in dfsample.columns:
        dfsample['guess'] = None
    df = dfsample[dfsample.Categorie1 == cat1].copy()
    if len(df)==0:
        return []
    ry = rayon[rayon.Categorie1 == cat1].copy()
    add_txt(df)
    ry['txt'] = map(normalize_txt,ry.Categorie3_Name)
    Xr = vec.transform(ry.txt)
    Xt = vec.transform(df.txt)
    D = pairwise_distances(Xt,Xr,metric='cosine')
    a = np.argmin(D,axis=1)
    df['guess'] = ry.Categorie3.values[a]
    df['D'] = D[range(len(a)),a]
    return [r.guess if r.D<Dmin.get(r.guess,0) else default for i,r in df.iterrows()]
    # (tail of a separate per-category training helper; its header is not part
    # of this excerpt) log scores, persist the model, free memory
    print 'training',cat1,'\t\t(',i,') : training=',sct,'validation=',scv
    joblib.dump((labels,vec,cla),fname)
    del vec,cla
    return (sct,scv)
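And a matching guessing-side sketch (hypothetical; vec and Dmin are the artifacts persisted by index_learning above):

# hypothetical: restore the per-category index and guess on a sample frame
vec, Dmin, F1score = joblib.load(ddir+'joblib/index_'+str(cat1))
guesses = index_guessing(dfsample, rayon, cat1, vec, Dmin)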

#######################
# training
# stage1 : Categorie1 
# stage3 : Categorie3|Categorie1
#######################
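#
# the two stages factorize the target: stage1 scores P(Categorie1|x) and
# stage3 scores P(Categorie3|Categorie1,x); bayes_prediction (Example #9)
# recombines them in log space:
#   log P(Categorie3|x) = log P(Categorie3|Categorie1,x) + log P(Categorie1|x)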

dftrain = pd.read_csv(ddir+'training_sample.csv'+ext,sep=';',names = header()).fillna('')
dfvalid = pd.read_csv(ddir+'validation_sample.csv'+ext,sep=';',names = header()).fillna('')
dftest = pd.read_csv(ddir+'test_normed.csv',sep=';',names = header(test=True)).fillna('')

add_txt(dftrain)
add_txt(dfvalid)
add_txt(dftest)

dftrain = dftrain[['Categorie3','Categorie1','txt']]
dfvalid = dfvalid[['Categorie3','Categorie1','txt']]
dftest = dftest[['Identifiant_Produit','txt']]


# training stage1

dt = -time.time()
sct,scv = training_stage1(dftrain,dfvalid)
dt += time.time()

print '##################################'
Example #4
from sklearn.externals import joblib
from utils import itocat1,itocat2,itocat3
from utils import cat1toi,cat2toi,cat3toi
from utils import cat3tocat2,cat3tocat1,cat2tocat1
from utils import cat1count,cat2count,cat3count
import sys

ddir = '/home/ngaude/workspace/data/cdiscount/'

assert len(sys.argv) == 2  ##### usage guess.py $RESULTAT.CSV ####
rname  = sys.argv[1]
assert isfile(ddir+rname) ##### usage guess.py $RESULTAT.CSV ####


test_normed = pd.read_csv(ddir+'test_normed.csv',sep=';',names=header(True)).fillna('')
add_txt(test_normed)
test_num_word = map(lambda t:len(set(t.split())),test_normed.txt)

test_nn = pd.read_csv(ddir+'test_nn.csv',sep=';').fillna('')
test_nn['Marque'] = test_nn.Marque_nn
test_nn['Libelle'] = test_nn.Libelle_nn
test_nn['Description'] = test_nn.Description_nn
add_txt(test_nn)
nn_num_word = map(lambda t:len(set(t.split())),test_nn.txt)
test_nn.drop('Marque', axis=1, inplace=True)
test_nn.drop('Libelle', axis=1, inplace=True)
test_nn.drop('Description', axis=1, inplace=True)

best = pd.read_csv(ddir+rname,sep=';')
#best = pd.read_csv('proba.auto.merging.60.csv',sep=';')
#best.Id_Categorie = 1000015309
Example #6
"""
@author: ngaude
"""

from utils import header,add_txt
import numpy as np
import pandas as pd
from sklearn.externals import joblib

from utils import itocat1,itocat3
from utils import cat1count,cat2count,cat3count
import time

ddir = '/home/ngaude/workspace/data/cdiscount.proba/' 

dftest = pd.read_csv(ddir+'test_normed.csv',sep=';',names = header(test=True)).fillna('')
add_txt(dftest)
dftest = dftest[['Identifiant_Produit','txt']]

stage3_proba_test = np.full(shape=(len(dftest),cat3count),fill_value = 0.,dtype = float)
stage1_proba_test = np.full(shape=(len(dftest),cat1count),fill_value = 0.,dtype = float)

def submit(df,Y):
    # N (the merging-step id) is assumed to be defined elsewhere in the source file
    submit_file = ddir+'resultat.auto.merging.'+str(N)+'.csv'
    df['Id_Produit']=df['Identifiant_Produit']
    df['Id_Categorie'] = Y
    df= df[['Id_Produit','Id_Categorie']]
    df.to_csv(submit_file,sep=';',index=False)
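
A one-line usage sketch (hypothetical: Ypred is a Categorie3 prediction vector aligned with the rows of dftest):

# hypothetical call, writing resultat.auto.merging.<N>.csv
submit(dftest, Ypred)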

def save_proba(df,Y,p1,p3):
    submit_file = ddir+'proba.auto.merging.'+str(N)+'.csv'
    df['Id_Produit']=df['Identifiant_Produit']
Example #7
        scv = (-1,0)
    else:
        # performs a gridsearch
        Xvs = [ vec.transform(dfv.txt) for dfv in dfvs]
        Yvs = [ dfv['Categorie3'].values for dfv in dfvs]
        cla,scv = best_classifier(X,Y,Xvs,Yvs)
    print 'training',cat1,'\t\t(',i,') N=',len(dft),'K=',len(labels),': mean =',scv[0],'dev=',scv[1]
    joblib.dump((labels,vec,cla,scv),fname+ext)
    del vec,cla
    return scv

#################
# prepare train #
#################
dftrain = pd.read_csv(ddir+'training_random.csv'+ext,sep=';',names = header()).fillna('')
add_txt(dftrain)
dftrain = dftrain[['Categorie3','Categorie1','txt']]

#################
# prepare valid #
#################
dfvs = [pd.read_csv(ddir+'validation_random.csv.'+str(i),sep=';',names = header()).fillna('') for i in range(9)]
for i in range(9):
    add_txt(dfvs[i])
    dfvs[i] = dfvs[i][['Identifiant_Produit','Categorie3','Categorie1','txt']]

#################
# prepare test  #
#################

for i,cat1 in enumerate(np.unique(dftrain.Categorie1)):
Example #8
assert len(sys.argv) == 2  ##### usage guess.py $PROBA.CSV ####
assert isfile(sys.argv[1]) ##### usage guess.py $PROBA.CSV ####

pname  = sys.argv[1]
# pname = 'proba.auto.merging.15.csv'
pdir = dirname(pname)
##################
# FIXME : ensure that confidence levels are comparable between the logistic-regression probabilities and the guessing probabilities
# proba_score = 0.6768667
# <==>
# sum(proba.Proba_Categorie3)/len(df) # 0.7525964785959941
##################

rayon = pd.read_csv(ddir+'rayon.csv',sep=';')
test = pd.read_csv(ddir+'test_normed.csv',sep=';',names = header(True)).fillna('')
add_txt(test)
proba = pd.read_csv(pname,sep=';')
df = test.merge(proba,how='left',left_on='Identifiant_Produit',right_on='Id_Produit')
df = df.merge(rayon,how='left',left_on='Id_Categorie',right_on='Categorie3')

rg = pd.read_csv(ddir+'rayon_guessing.csv',sep=';')
g = rg.groupby('Categorie1')

guess_correction = 0
num_correction = 0

best_Categorie3 = df.Categorie3.values
#best_Categorie3 = [1000015309]*len(df)

for i,r in df.iterrows():
    rdf = g.get_group(r.Categorie1)
Example #9
def bayes_prediction(stage1_log_proba,stage3_log_proba):
    # chain rule in log space: add to each Categorie3 column the
    # log-probability of its parent Categorie1 column (in-place update)
    for i in range(stage3_log_proba.shape[1]):
        cat3 = itocat3[i]
        cat1 = cat3tocat1[cat3]
        j = cat1toi[cat1]
        stage3_log_proba[:,i] += stage1_log_proba[:,j]
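
For reference, an equivalent vectorized form of the same update, as a sketch (not executed here; it assumes the itocat3 / cat3tocat1 / cat1toi mappings from utils are in scope):

# hypothetical vectorized equivalent of bayes_prediction
parent_col = np.array([cat1toi[cat3tocat1[itocat3[i]]]
                       for i in range(stage3_log_proba_valid.shape[1])])
stage3_log_proba_valid += stage1_log_proba_valid[:, parent_col]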

bayes_prediction(stage1_log_proba_valid,stage3_log_proba_valid)

predict_cat3_valid = [itocat3[i] for i in np.argmax(stage3_log_proba_valid,axis=1)]
proba_cat3_valid =  np.exp(np.max(stage3_log_proba_valid,axis=1))

valid['Categorie3_lr'] = predict_cat3_valid
valid['proba_lr'] = proba_cat3_valid
add_txt(valid)

#############################################
# get results from a previously trained 
# logistic regression model
#############################################

# head = pd.read_csv(ddir+'training_head.csv',names=header(),sep=';').fillna('')
# head = head[head.Produit_Cdiscount == 1]
# add_txt(head)
# head.to_csv(ddir+'nn_train.csv',sep=';',index=False)

train = pd.read_csv(ddir+'nn_train.csv',sep=';')

#############################################
# vectorize the full text                   #
Example #10
            # sample all samples + oversample the remaining
            dfs.append(df)
            df = df.iloc[np.random.randint(0, len(df), size=sample_count-len(df))]
            dfs.append(df)
    dfsample = pd.concat(dfs)
    dfsample = dfsample.reset_index(drop=True)
    dfsample = dfsample.reindex(np.random.permutation(dfsample.index),copy=False)
    return dfsample
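
The branch above balances classes: when a class has fewer rows than sample_count, all of them are kept and the shortfall is resampled with replacement. A self-contained sketch of the same idea (hypothetical function and column names):

# hypothetical: balance a frame to sample_count rows per Categorie3
import numpy as np
import pandas as pd

def balanced_sample(df, sample_count):
    dfs = []
    for _, g in df.groupby('Categorie3'):
        if len(g) >= sample_count:
            # downsample with replacement to exactly sample_count rows
            dfs.append(g.iloc[np.random.randint(0, len(g), size=sample_count)])
        else:
            # keep all rows + oversample the remainder, as in the branch above
            dfs.append(g)
            dfs.append(g.iloc[np.random.randint(0, len(g), size=sample_count-len(g))])
    out = pd.concat(dfs).reset_index(drop=True)
    return out.reindex(np.random.permutation(out.index))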

##################
# VECTORIZING
##################

# vectorize dftest
dftest = pd.read_csv(ddir+'test_normed.csv',sep=';',names = header(test=True)).fillna('')
add_txt(dftest)
vec,Xtest = vectorizer(dftest.txt)

# vectorize dftrain
dftrain = pd.read_csv(ddir+'training_shuffled_normed.csv',sep=';',names = header()).fillna('')
add_txt(dftrain)
Ytrain = dftrain.Categorie3.values.copy()
IDtrain = dftrain.Identifiant_Produit.values.copy()


# NOTE : memory-error workaround...

# let's serialize.

joblib.dump((vec,IDtrain,Ytrain),'/tmp/vecIDYtrain')
joblib.dump(Xtest,ddir+'joblib/Xtest')
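
Later stages can then reload these artifacts instead of re-vectorizing; a minimal sketch:

# hypothetical: reload the serialized vectorizer and matrices in a later stage
vec, IDtrain, Ytrain = joblib.load('/tmp/vecIDYtrain')
Xtest = joblib.load(ddir+'joblib/Xtest')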