Example #1
import os
import time

import pip
import pandas as pd

import utilitaires as util


def init(param):
    start_time = time.time()

    print 'Lecture des fichiers'
    # pip.get_installed_distributions() only exists in older pip releases (< 10).
    installed_packages = pip.get_installed_distributions()
    installed_packages = [i.key for i in installed_packages]
    # Use feather for faster I/O when the package is available.
    if "feather" in installed_packages:
        import transfo_data
        import feather
        if not os.path.isfile("../Data/train_with_dest.feather"):
            print "Appel a transfo_data."
            transfo_data.transfo_data("../Data/train_with_dest")
        if not os.path.isfile("../Data/test_with_dest.feather"):
            print "Appel a transfo_data."
            transfo_data.transfo_data("../Data/test_with_dest")

        train = feather.read_dataframe('../Data/train_with_dest.feather')
        test  = feather.read_dataframe('../Data/test_with_dest.feather')

    else:
        train = pd.read_csv('../Data/train_with_dest.csv', header=0)
        test = pd.read_csv('../Data/test_with_dest.csv', header=0)
    util.print_time(start_time)
    
    print 'Feature engineering' 
    train,test,train_eval,test_eval = util.adaptData(train,test)
    util.print_time(start_time)
    print '{} lignes selectionnees'.format(len(train))
    print '{} variables : {}'.format(len(param),param)
    
    return start_time,train_eval,test_eval
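
A hedged usage sketch: `param` is the list of feature column names handed on to `util.adaptData`; the names below are hypothetical placeholders, not the project's actual features.

# Hypothetical feature list; replace with the real columns expected by util.adaptData.
features = ["feature_a", "feature_b", "feature_c"]
start_time, train_eval, test_eval = init(features)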
Example #2
def principal(chemin, start_time, liste_coefficient, fonction):
    data = ouverture_fichier('../Data/fichier_resultat.csv')
    ecriture = ecriture_fichier(chemin)
    print "fichier de resultat ouvert et fichier d'ecriture cree"
    util.print_time(start_time)
    print "traitement des lignes en cours"
    # For each row of the combined results file, compute a prediction and write it
    # out next to the row's first column (its identifier).
    for ligne in data:
        prediction = traitement_ligne(ligne, liste_coefficient, fonction)
        ecriture.writerow([ligne[0], prediction])
    print "termine"
    util.print_time(start_time)
Example #3
def concatenation_fichiers(liste_algo, start_time):
    # Read each prediction file into its own DataFrame (a new list, so the caller's
    # list of file paths is not overwritten as a side effect).
    liste_df = [pd.read_csv(chemin, header=0) for chemin in liste_algo]

    # Merge all prediction files pairwise on their shared 'id' column.
    final = liste_df[0]
    for df in liste_df[1:]:
        final = pd.merge(final, df, on='id')

    del final["id"]
    final.to_csv('../Data/fichier_resultat.csv')
    print "fichier recapitulatif cree"
    util.print_time(start_time)
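
A hedged usage sketch: '../Predictions/gradb.csv' appears in Example #6, while the second path is a hypothetical placeholder for another algorithm's prediction file.

import time

concatenation_fichiers(['../Predictions/gradb.csv',
                        '../Predictions/other_algo.csv'],  # second file is hypothetical
                       time.time())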
Example #4
def best_formula(param,model):
    
    start_time,train_eval,test_eval = init(param)
    
    # Upper bound used only for the progress message below.
    loop = float(len(param) * (len(param) - 1)) / 2.
    print "{} calculs au maximum".format(int(loop))
    
    print 'initialisation'
    best_subset = param
    best_score = util.temporal_validation(train_eval,test_eval,model,param)
    first_score = best_score
    util.print_time(start_time)
    print 'score obtenu : {}'.format(best_score)
    
    current = 1
    while len(best_subset)>1:
        print '\n-> etape {}'.format(current)
        current += 1
        subset_list = find_subset(best_subset)
        # Score every candidate subset in parallel on the temporal validation split.
        tem_val = Parallel(n_jobs=-1, verbose=50)(
            delayed(util.temporal_validation)(pd.DataFrame.copy(train_eval),
                                              pd.DataFrame.copy(test_eval),
                                              model, subset)
            for subset in subset_list)
        util.print_time(start_time)
        argmax = np.argmax(tem_val)
        # Stop as soon as dropping a feature no longer improves the validation score.
        if tem_val[argmax] < best_score:
            break
        best_subset = subset_list[argmax]
        diff = tem_val[argmax]-best_score
        best_score = tem_val[argmax]
        print 'score actuel : {}'.format(best_score)
        print 'amelioration de {}'.format(diff)
        print 'parametres actuels : {}'.format(best_subset)
        
    print("")
    util.print_time(start_time)
    print("Resultat :")
    print "Meilleur taux obtenu : {}".format(best_score)
    print 'amelioration totale de {}'.format(best_score-first_score)
    print("pour les parametres suivants :")
    print (best_subset)
    
    return
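
`find_subset` is defined elsewhere in the project and not shown in these examples. A plausible sketch, assuming it returns every subset obtained by dropping exactly one feature, which is what the backward-elimination loop above expects:

def find_subset(features):
    # All candidate subsets with exactly one feature removed.
    return [features[:i] + features[i + 1:] for i in range(len(features))]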
Example #5
def predictor(label, model, path, gen_csv=True):

    start_time = time.time()

    print 'Lecture des fichiers'
    installed_packages = pip.get_installed_distributions()
    installed_packages = [i.key for i in installed_packages]
    if "feather" in installed_packages == True:
        import transfo_data
        import feather
        if not os.path.isfile("../Data/train_with_dest.feather"):
            print "Appel a transfo_data."
            transfo_data.transfo_data("../Data/train_with_dest")
        if not os.path.isfile("../Data/test_with_dest.feather"):
            print "Appel a transfo_data."
            transfo_data.transfo_data("../Data/test_with_dest")
        if not os.path.isfile("../Data/test_data_leak.feather"):
            print "Appel a transfo_data."
            transfo_data.transfo_data("../Data/test_data_leak")

        train = feather.read_dataframe('../Data/train_with_dest.feather')
        test = feather.read_dataframe('../Data/test_with_dest.feather')
        leak = feather.read_dataframe('../Data/test_data_leak.feather')

    else:
        train = pd.read_csv('../Data/train_with_dest.csv', header=0)
        test = pd.read_csv('../Data/test_with_dest.csv', header=0)
        leak = pd.read_csv('../Data/test_data_leak.csv', header=0)
    util.print_time(start_time)

    print 'Feature engineering'
    train, test, train_eval, test_eval = util.adaptData(train, test, label)
    util.print_time(start_time)
    print '{} lignes selectionnees'.format(len(train))
    print '{} variables : {}'.format(len(label), label)

    print 'Validation temporelle'
    taux = util.temporal_validation(train_eval, test_eval, model, label)
    util.print_time(start_time)
    print "Taux de predictions correctes pour les parametres actuels : {0}".format(
        taux)

    if gen_csv:
        print 'Apprentissage & Prediction'
        prediction, modele = model(train, test, label)

        #         print 'Exportation du modele conserve'
        #         nom_export = "{0}.bin".format(modele).split("(")[0]
        #         joblib.dump(modele,nom_export)
        #         util.print_time(start_time)

        print 'Data leak'
        # Merge the leaked hotel_cluster values into the corresponding predictions.
        for i in range(len(leak)):
            prediction[leak['id'][i]] = util.leak_fusion(
                leak['hotel_cluster'][i], prediction[leak['id'][i]])

        test['hotel_cluster'] = util.to_string(prediction)
        util.print_time(start_time)

        print 'Generation du fichier csv'
        test[["id", "hotel_cluster"]].to_csv(path, index=False)
        util.print_time(start_time)
    print 'termine'

    return taux
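
A hedged sketch of the `model` callable that `predictor` expects: it receives `(train, test, label)` and returns `(prediction, fitted_model)`, with `prediction` indexable by test `id` (as the data-leak loop above assumes). It assumes `train` carries the `hotel_cluster` target column; the scikit-learn classifier and the top-5 cut-off are illustrative choices, not the project's actual model.

from sklearn.ensemble import RandomForestClassifier

def rf_model(train, test, label):
    # Hypothetical wrapper: fit on the selected feature columns and keep, for every
    # test row, the five most probable hotel_cluster values.
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf.fit(train[label], train['hotel_cluster'])
    proba = clf.predict_proba(test[label])
    top5 = proba.argsort(axis=1)[:, ::-1][:, :5]
    prediction = {row_id: list(clf.classes_[idx])
                  for row_id, idx in zip(test['id'], top5)}
    return prediction, clf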
Example #6
import pandas as pd
import utilitaires as util
import time

# TODO: needs debugging

name = 'gradb'

start_time = time.time()

print 'Lecture des fichiers'
pred = pd.read_csv('../Predictions/{}.csv'.format(name), header=0)
leak = pd.read_csv('../Data/test_data_leak.csv', header=0)
util.print_time(start_time)

print 'Data leak'
# Align on the 'id' column so the leaked clusters overwrite the matching rows,
# whatever the row order of the prediction file.
pred = pred.set_index('id')
for i in range(len(leak)):
    pred.loc[leak['id'][i], 'hotel_cluster'] = leak['hotel_cluster'][i]
pred = pred.reset_index()
util.print_time(start_time)

print 'Generation du fichier csv'
pred[["id",
      "hotel_cluster"]].to_csv('../Predictions/{}_data_leak.csv'.format(name),
                               index=False)
util.print_time(start_time)
print 'termine'