Esempio n. 1
0
def _expand_click_counts(df):
    """Split every aggregated row of *df* into a no-click and a click row.

    *df* must carry the columns ``clicks`` and ``instances``. The result has
    two rows per input row (all no-click rows first, then all click rows,
    original index labels kept), with:

    * ``clicks`` -- replaced by a 0/1 flag,
    * ``ais``    -- ``instances - clicks`` on the 0 rows and ``clicks`` on the
      1 rows (presumably a per-row weight -- confirm against SparseCat),
    * ``instances`` -- dropped.

    The input frame is not modified.
    """
    # Local import keeps this helper self-contained: the original code only
    # used DataFrame methods and never referenced the pandas module by name.
    import pandas as pd

    # Explicit copies (instead of ``df[:]`` slices) so that the column
    # assignments below never write through to the caller's frame.
    clicked = df.copy()
    clicked['click_flag'] = 1
    clicked['ais'] = df['clicks']

    not_clicked = df.copy()
    not_clicked['click_flag'] = 0
    not_clicked['ais'] = df['instances'] - df['clicks']

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    expanded = pd.concat([not_clicked, clicked])
    expanded = expanded.drop(['clicks', 'instances'], axis=1)
    expanded.rename(columns={'click_flag': 'clicks'}, inplace=True)
    return expanded


def make_sparse(test_df, train_df, factors, non_factors, cut_off, data_path):
    """Expand click counts, fit a SparseCat on train and write libsvm files.

    Both frames are expanded into 0/1 ``clicks`` rows with an ``ais`` column
    (see ``_expand_click_counts``). A ``gen_features.SparseCat`` is fitted on
    the expanded training frame via ``fit_weighted`` with ``ais`` as second
    argument, pickled, and used to transform both frames.

    Files written (names are formed by plain concatenation with
    ``data_path``, so it is expected to end with a path separator):

    * ``train_SC``                    -- pickled fitted SparseCat
    * ``train_ais.txt``/``test_ais.txt`` -- the ``ais`` columns, ``%d`` format
    * ``train_svm.txt``/``test_svm.txt`` -- output of
      ``gen_features.csr_write_libsvm``

    Args:
        test_df: frame with at least ``clicks`` and ``instances`` columns.
        train_df: frame with at least ``clicks`` and ``instances`` columns.
        factors: forwarded to ``SparseCat`` as first argument.
        non_factors: forwarded to ``SparseCat`` as second argument.
        cut_off: forwarded to ``SparseCat.set_params(count_cutoff=...)``.
        data_path: output location; created if it does not exist.

    Returns:
        Tuple ``(test_df, train_df)`` of the expanded frames -- note that the
        test frame comes first.
    """
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    # Expand both frames up front so a missing column fails before any file
    # is written.
    train_df = _expand_click_counts(train_df)
    test_df = _expand_click_counts(test_df)

    sc = gen_features.SparseCat(factors, non_factors)
    sc.set_params(count_cutoff=cut_off)
    sc.fit_weighted(train_df, train_df['ais'])

    # ``open`` + ``with`` replaces the Python-2-only ``file`` builtin and
    # closes the handle even when pickling raises.
    with open(data_path + 'train_SC', 'wb') as handle:
        pkl.dump(sc, handle, protocol=pkl.HIGHEST_PROTOCOL)

    n_columns = len(factors) + len(non_factors)
    for split, frame in (('train', train_df), ('test', test_df)):
        sparse = sc.transform(frame)
        np.savetxt(data_path + split + '_ais.txt',
                   np.array(frame['ais']), fmt='%d')
        gen_features.csr_write_libsvm(data_path + split + '_svm.txt',
                                      sparse, frame['clicks'], n_columns)

    return test_df, train_df
Esempio n. 2
0
    def make_sparse(df_train,W_train,df_val,W_val,df_test,W_test,file_name,cut_off,factors):
        """Fit SparseCat encoders and write libsvm files for each data split.

        An encoder fitted on ``df_train`` (via ``fit_weighted`` with
        ``W_train``) transforms the train, validation and test frames. For
        each split the ``samples`` column and the libsvm output are written.
        Additionally, a separate encoder is fitted on the validation frame
        and on the test frame, and each one is pickled.

        ``file_name`` is used as a plain string prefix for every output:
        ``<prefix>{train,val,test}_SC``, ``..._samples.txt``, ``..._svm.txt``.
        The validation outputs are skipped when ``df_val`` has no rows.

        Args:
            df_train, df_val, df_test: frames with ``clicks`` and ``samples``
                columns plus whatever ``factors`` refers to.
            W_train, W_val, W_test: second argument to ``fit_weighted`` for
                the matching frame (presumably per-row weights -- confirm).
            file_name: output path prefix.
            cut_off: forwarded to ``set_params(count_cutoff=...)``.
            factors: forwarded to ``SparseCat`` as first argument.

        Returns:
            None.
        """
        non_factors = []
        n_columns = len(factors) + len(non_factors)

        def _fit_encoder(df, weights):
            # Build a SparseCat with the shared settings and fit it on df.
            encoder = gen_features.SparseCat(factors, non_factors)
            encoder.set_params(count_cutoff=cut_off)
            encoder.fit_weighted(df, weights)
            return encoder

        def _pickle_to(obj, path):
            # ``open`` + ``with`` replaces the Python-2-only ``file`` builtin
            # and closes the handle even if pickling raises.
            with open(path, 'wb') as handle:
                pkl.dump(obj, handle, protocol=pkl.HIGHEST_PROTOCOL)

        def _write_split(encoder, df, split):
            # Write <prefix><split>_samples.txt and <prefix><split>_svm.txt.
            labels = df['clicks'].values.astype(np.int8).squeeze()
            sparse = encoder.transform(df)
            np.savetxt(file_name + split + '_samples.txt',
                       np.array(df['samples']), fmt='%d')
            gen_features.csr_write_libsvm(file_name + split + '_svm.txt',
                                          sparse, labels, n_columns)

        sc = _fit_encoder(df_train, W_train)
        _pickle_to(sc, file_name + 'train_SC')
        _write_split(sc, df_train, 'train')

        if df_val.shape[0] != 0:
            # The validation matrix uses the train-fitted encoder; the
            # pickled val_SC is a separate encoder fitted on df_val itself.
            _write_split(sc, df_val, 'val')
            _pickle_to(_fit_encoder(df_val, W_val), file_name + 'val_SC')

        # Same pattern for the test split: transform with the train encoder,
        # pickle an encoder fitted on the test frame.
        _write_split(sc, df_test, 'test')
        _pickle_to(_fit_encoder(df_test, W_test), file_name + 'test_SC')
Esempio n. 3
0
# Fit a SparseCat encoder on `mad`, then persist the encoder, the sparse
# feature matrix, the labels and the sample column under `data_path`.
factors = ['site_id']  # ,'ad*country','ad*device','site*country'
non_factors = ['views']

sc = gen_features.SparseCat(factors, non_factors)
sc.set_params(count_cutoff=25)
t1 = time.time()
sc.fit(mad, mad_y)
t2 = time.time()
# Elapsed fit time in minutes. The parenthesised form prints the value on
# both Python 2 and 3; `print (a)/60` divides print's return value on 3.
print((t2 - t1) / 60)

# can also do interactions
#factors_cross=['ad','campaign','account','site','country','device','ad*site','ad*country','ad*device','site*country']
#sc_cross=gen_features.SparseCat(factors_cross,non_factors)
#sc_cross.set_params(count_cutoff=25)

mad_sparse = sc.transform(mad)
# `open` + `with` replaces the Python-2-only `file` builtin and closes the
# handle even if pickling raises.
with open(data_path + 'sc', 'wb') as f:
    pkl.dump(sc, f, protocol=pkl.HIGHEST_PROTOCOL)

# different ways of saving the sparse matrix - using sklearn
joblib.dump(mad_sparse, data_path + output_file_name + '_sparse.pkl')
# gen_features.save_sparse("mad_sparse",mad_sparse) # or numpy as npz file

# save clicks
np.savez(data_path + output_file_name + '_y', mad_y=mad_y)

mad['samples'].to_csv(data_path + output_file_name + '_samples.csv', header=['sample'])

gen_features.csr_write_libsvm(data_path + output_file_name + '_svm.txt', mad_sparse, mad_y, len(factors) + len(non_factors))
#dump_svmlight_file(mad_sparse, mad_y, '../data/mad_svm.txt', zero_based=True, comment=None, query_id=None)