def process_train_data(train):
    """Aggregate item-level Walmart training rows into one row per VisitNumber.

    Features: one-hot 'DepartmentDescription=*' columns are summed per visit
    (item counts per department) and one-hot 'Weekday=*' columns are max'ed
    (a visit happens on a single weekday, so max yields a 0/1 flag).
    Targets: 'TripType' is stringified, one-hot encoded, renamed from
    'TripTypeStr=*' to 'TripType_*', and max'ed per visit.

    NOTE(review): this function is defined twice in this file; the later
    definition shadows this one.

    Returns a DataFrame indexed by VisitNumber with feature and target columns.
    """
    df = encode_onehot(train, cols=['Weekday', 'DepartmentDescription'])
    processed_df = DataFrame()
    # One-hot encoded features, aggregated per visit.
    for k in df.keys():
        if k.startswith('DepartmentDescription='):
            processed_df[k] = df.groupby(['VisitNumber'])[k].sum()
        if k.startswith('Weekday='):
            processed_df[k] = df.groupby(['VisitNumber'])[k].max()
    # Process the class label into categorical (one-hot) target columns.
    # Use assign() so the caller's DataFrame is not mutated — the original
    # code wrote a 'TripTypeStr' column straight into `train`.
    labeled = train.assign(TripTypeStr=train['TripType'].apply(str))
    processed_train = encode_onehot(labeled, cols=['TripTypeStr'])
    cols_rename = {}
    for k in processed_train.keys():
        if k.startswith('TripTypeStr='):
            cols_rename[k] = k.replace('Str=', '_')
    processed_train.rename(columns=cols_rename, inplace=True)
    for k in processed_train.keys():
        if k.startswith('TripType_'):
            processed_df[k] = processed_train.groupby('VisitNumber')[k].max()
    return processed_df
def process_train_data(train):
    """Collapse per-item training rows into per-visit feature/target rows.

    One visit (VisitNumber) spans many item rows. Department one-hot columns
    are summed (items bought per department); Weekday one-hot columns are
    max'ed (single 0/1 flag per visit). The TripType label is converted to a
    string, one-hot encoded, renamed 'TripTypeStr=X' -> 'TripType_X', and
    max'ed per visit to give one-hot target columns.

    NOTE(review): duplicate definition — an identical function appears
    earlier in this file; this one wins at import time.

    Returns a DataFrame indexed by VisitNumber.
    """
    df = encode_onehot(train, cols=['Weekday', 'DepartmentDescription'])
    processed_df = DataFrame()
    # One-hot encoded features, aggregated per visit.
    for k in df.keys():
        if k.startswith('DepartmentDescription='):
            processed_df[k] = df.groupby(['VisitNumber'])[k].sum()
        if k.startswith('Weekday='):
            processed_df[k] = df.groupby(['VisitNumber'])[k].max()
    # Build the label column on a copy so the caller's DataFrame is not
    # mutated (the original code added 'TripTypeStr' to `train` in place).
    labeled = train.copy()
    labeled['TripTypeStr'] = labeled['TripType'].apply(str)
    processed_train = encode_onehot(labeled, cols=['TripTypeStr'])
    cols_rename = {}
    for k in processed_train.keys():
        if k.startswith('TripTypeStr='):
            cols_rename[k] = k.replace('Str=', '_')
    processed_train.rename(columns=cols_rename, inplace=True)
    for k in processed_train.keys():
        if k.startswith('TripType_'):
            processed_df[k] = processed_train.groupby('VisitNumber')[k].max()
    return processed_df
def process_test_data(test_data):
    """Aggregate item-level test rows into one row per VisitNumber.

    Mirrors the feature half of process_train_data: one-hot
    'DepartmentDescription=*' columns are summed per visit and one-hot
    'Weekday=*' columns are max'ed. No target columns are produced.
    """
    encoded = encode_onehot(test_data, cols=['Weekday', 'DepartmentDescription'])
    by_visit = encoded.groupby(['VisitNumber'])
    aggregated = DataFrame()
    for col in encoded.keys():
        if col.startswith('DepartmentDescription='):
            aggregated[col] = by_visit[col].sum()
        if col.startswith('Weekday='):
            aggregated[col] = by_visit[col].max()
    return aggregated
def generate_features(name): data = pandas.read_csv('homesite/{}.csv'.format(name)) processed_data = pandas.DataFrame() onehot_encode_fields = [] dropped_fields = [ 'PersonalField16', 'PersonalField17', 'PersonalField18', 'PersonalField19', ] for k in data.keys(): if is_number(data[k][1]): processed_data[k] = data[k] elif k == 'Original_Quote_Date': months = [] years = [] for d in data[k]: dt = datetime.datetime.strptime(d, '%Y-%m-%d') months.append(dt.month) years.append(dt.year) processed_data['Month'] = pandas.Series(months) processed_data['Year'] = pandas.Series(years) else: if k in dropped_fields: continue onehot_encode_fields.append(k) df = encode_onehot(data, cols=onehot_encode_fields) for k in df.keys(): if is_number(df[k][1]): print k processed_data[k] = df[k].apply( lambda x: locale.atof(x) if isinstance(x, basestring) else x) elif k == 'Original_Quote_Date': months = [] years = [] for d in df[k]: dt = datetime.datetime.strptime(d, '%Y-%m-%d') months.append(dt.month) years.append(dt.year) processed_data['Month'] = pandas.Series(months) processed_data['Year'] = pandas.Series(years) processed_data.to_csv('homesite/processed_features{}.csv'.format(name), quotechar='"')
def generate_features(name): data = pandas.read_csv("homesite/{}.csv".format(name)) processed_data = pandas.DataFrame() onehot_encode_fields = [] dropped_fields = ["PersonalField16", "PersonalField17", "PersonalField18", "PersonalField19"] for k in data.keys(): if is_number(data[k][1]): processed_data[k] = data[k] elif k == "Original_Quote_Date": months = [] years = [] for d in data[k]: dt = datetime.datetime.strptime(d, "%Y-%m-%d") months.append(dt.month) years.append(dt.year) processed_data["Month"] = pandas.Series(months) processed_data["Year"] = pandas.Series(years) else: if k in dropped_fields: continue onehot_encode_fields.append(k) df = encode_onehot(data, cols=onehot_encode_fields) for k in df.keys(): if is_number(df[k][1]): print k processed_data[k] = df[k].apply(lambda x: locale.atof(x) if isinstance(x, basestring) else x) elif k == "Original_Quote_Date": months = [] years = [] for d in df[k]: dt = datetime.datetime.strptime(d, "%Y-%m-%d") months.append(dt.month) years.append(dt.year) processed_data["Month"] = pandas.Series(months) processed_data["Year"] = pandas.Series(years) processed_data.to_csv("homesite/processed_features{}.csv".format(name), quotechar='"')
# NOTE(review): this chunk begins mid-way through a function (presumably
# `establish_net(net_cfg, ...)` — it is called by name below); the first line
# closes a call whose opening parenthesis is outside the visible source.
# Indentation here is reconstructed.
                                    net_cfg['optimizer'][2])
    net = Net(layer_list, cost_func, optimizer, one_hot=net_cfg['one_hot'])
    return net


if __name__ == '__main__':
    # Load config, data, and the test grid used for plotting.
    cfg = util.config('config.1.json')
    one_hot = cfg['net']['one_hot']
    x = util.read_data(cfg['dataset']['x_path'])
    y = util.read_data(cfg['dataset']['y_path'])
    # x, y = x[38:44], y[38:44]
    x_test_mg, x_test = util.generate_test_data(x)
    # NOTE(review): this rebinds the module name `draw` to a Draw instance,
    # making the `draw` module itself unreachable afterwards.
    draw = draw.Draw(x, y, x_test_mg, one_hot=one_hot, C=cfg['class'])
    net = establish_net(cfg['net'], x)
    if one_hot:
        y = util.encode_onehot(y, cfg['class'])
    sample_num = x.shape[0]
    batch_size = cfg['batch_size']
    # Mini-batch training loop (epoch cap is a large fixed constant).
    for epoch in range(100000):
        cost_avg = 0
        for it in range(sample_num // batch_size):
            beg_idx = it * batch_size
            # Clamp the final batch to the end of the dataset.
            end_idx = sample_num if beg_idx + batch_size > sample_num else beg_idx + batch_size
            x_train = x[beg_idx:end_idx]
            y_train = y[beg_idx:end_idx]
            cost = net.train(x_train, y_train)
            # if (epoch * (sample_num // batch_size) + it) % 50 == 0:
            # draw.drawLoss(epoch * (sample_num // batch_size) + it, cost)
            # Weight the batch cost by batch size for a dataset-wide average.
            cost_avg += cost * (end_idx - beg_idx)
            y_train_ = net.predict(x_train)
            # print(y_train_.T)
            # NOTE(review): the loop body appears to continue past the end of
            # this visible chunk (cost_avg is accumulated but not yet used).
# Plot a 20-second excerpt (starting 90 s in) of the input features and the
# model's label-conditioned reconstruction for qualitative comparison.
# NOTE(review): offsets assume C.SR is a sample rate and C.H a hop size, so
# these are frame indices — confirm against the const module.
start_idx = int(C.SR * 90) // C.H
plot_length = C.SR * 20 // C.H
feat, labs, aligns = dset[0]
#_,feat_un,_,_ = dset_semi[0]
plt.figure(model_name)
plt.subplot(5, 1, 1)
# Show only the first 24 feature dimensions (two stacked 12-bin octaves,
# judging by the C/F/G tick labels below).
specshow(feat[start_idx:start_idx + plot_length, :24].T)
plt.yticks(np.arange(24) + 0.5,
           ["C", "", "", "", "", "F", "", "G", "", "", "", "",
            "C", "", "", "", "", "F", "", "G", "", "", "", ""],
           fontname="STIXGeneral")
#plt.text(-25,6,"(a)",fontname="STIXGeneral",fontsize=15)
# Bernoulli distribution over the original excerpt; only referenced by the
# commented-out log-probability scoring line further down.
dist_orig = dist.Bernoulli(feat[start_idx:start_idx + plot_length])
#plt.subplot(8,1,2)
#labs_onehot = [U.encode_onehot(labs[aligns[:512],i],cat) for i,cat in zip(list(range(6)),[C.N_VOCABULARY_TRIADS,13,4,4,3,3])]
# One-hot triad labels aligned frame-by-frame to the plotted excerpt.
labs_onehot = U.encode_onehot(labs[aligns[start_idx:start_idx + plot_length]],
                              C.N_VOCABULARY_TRIADS)
# Reconstruct the excerpt conditioned on its ground-truth labels.
generated = model.generator.reconstr_dist(
    [feat[start_idx:start_idx + plot_length]], [labs_onehot])[0]
#generated = (generated.a/(generated.a+generated.b)).data
#specshow(generated[:,:24].T)
#plt.yticks(np.arange(24)+0.5,["C","","","","","F","","G","","","","","C","","","","","F","","G","","","",""],fontname="STIXGeneral")
#plt.text(-25,6,"(c)",fontname="STIXGeneral",fontsize=15)
#plt.subplot(8,1,3)
#generated,lab_estimated = model.reconst(feat[start_idx:start_idx+plot_length])
#print("P_proposed= %.5f" % dist_orig.log_prob(generated).data.sum(-1).mean())
# Model-estimated labels for the same excerpt (estimation runs on the full
# feature sequence, then the excerpt is sliced out).
lab_estimated = model.estimate(feat)[start_idx:start_idx + plot_length]
#generated = (generated.a/(generated.a+generated.b)).data
#specshow(generated[:,:24].T)
#plt.yticks(np.arange(24)+0.5,["C","","","","","F","","G","","","","","C","","","","","F","","G","","","",""],fontname="STIXGeneral")
#plt.text(-25,6,"(c)",fontname="STIXGeneral",fontsize=15)
# --- Springleaf train/test feature preparation -------------------------------
# NOTE(review): this chunk relies on names defined outside the visible source
# (features_uniq_count, features_uniq_twice_removed, date_columns_to_encode,
# datecolumns, numerical_f, high_cardinal_categ_f, low_cardinal_categ_f, util).

# Drop the row id and the pre-computed low-information feature lists.
train.drop('ID', axis=1, inplace=True)
train.drop(features_uniq_count, axis=1, inplace=True)
train.drop(features_uniq_twice_removed, axis=1, inplace=True)
# Derive date-based features, then drop the raw date columns.
train, date_columns_to_encode_ = util.createDateFeatures(train, date_columns_to_encode)
train.drop(datecolumns, axis=1, inplace=True)
# Bin numerical and high-cardinality categorical features.
train = util.binning(train, numerical_f, 0.5)
train = util.binning(train, high_cardinal_categ_f, 0.5)
# All features to one-hot encode.
all_f = numerical_f + high_cardinal_categ_f + low_cardinal_categ_f + date_columns_to_encode_
train = util.encode_onehot(train, cols=all_f)
# Feature column order (excluding the target column).
# BUG FIX: set('target') builds a set of the CHARACTERS {'t','a','r','g','e'},
# so the target column was never actually excluded; use a one-element set.
train_cols = list(set(train.columns) - set(['target']))
# Read and preprocess the test split the same way; persist the test ids first.
test = pd.read_csv("/mnt/data/Springleaf/test.csv")
id_df = pd.DataFrame({'ID': test.ID})
id_df.to_csv("/mnt/data/Springleaf/test_ids.processed")
test.drop('ID', axis=1, inplace=True)
test.drop(features_uniq_count, axis=1, inplace=True)
test.drop(features_uniq_twice_removed, axis=1, inplace=True)
# Per-category chord label codes (12 entries each).
# NOTE(review): list_triad, list_bass and list_seventh are defined above this
# visible chunk; only list_triad and the 24-entry list_name are indexed by the
# loop below, and list_labs is referenced only by commented-out code.
list_ninth = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
list_eleventh = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
list_thirteenth = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
list_labs = [
    list_triad, list_bass, list_seventh, list_ninth, list_eleventh,
    list_thirteenth
]
# 24 plot titles: 8 chord qualities x 3 roots (C, C#, D).
list_name = [
    "C:maj", "C#:maj", "D:maj", "C:min", "C#:min", "D:min", "C:aug", "C#:aug",
    "D:aug", "C:dim", "C#:dim", "D:dim", "C:sus4", "C#:sus4", "D:sus4",
    "C:sus2", "C#:sus2", "D:sus2", "C:1", "C#:1", "D:1", "C:5", "C#:5", "D:5"
]
# Generate and plot one conditioned reconstruction per chord label, in a
# 4x6 grid of subplots.
for i in range(24):
    C.VAE_SHIFT_REGULAR = False
    #labs_onehot = [U.encode_onehot(np.ones(128,dtype=np.int32)*list_labs[j][i],cat) for j,cat in zip(list(range(6)),[C.N_VOCABULARY_TRIADS,13,4,4,3,3])]
    # 10 frames of a single constant triad label, one-hot encoded.
    labs_onehot = U.encode_onehot(
        np.ones(10, dtype=np.int32) * list_triad[i], C.N_VOCABULARY_TRIADS)
    # Reconstruct a fixed 10-frame feature excerpt (frames 689-698)
    # conditioned on the constant chord label.
    generated = model.generator.reconstr_dist([feat[689:699, :]],
                                              [labs_onehot])[0]
    #generated = (generated.a/(generated.a+generated.b)).data
    plt.subplot(4, 6, i + 1)
    plt.title(list_name[i], fontname="STIXGeneral")
    # Show the first 24 feature dimensions (two 12-bin octaves, per ticks).
    specshow(generated[:, :24].T)
    # Label the pitch axis only on the left-most column of the grid.
    if i % 6 == 0:
        plt.yticks(np.arange(24) + 0.5, [
            "C", "", "", "", "", "F", "", "G", "", "", "", "", "C", "", "",
            "", "", "F", "", "G", "", "", "", ""
        ],
                   fontname="STIXGeneral")
import numpy as np
import const as C
import dataset as D
import chainer
from librosa.display import specshow
import matplotlib.pyplot as plt
import util as U

# Load one semi-supervised STFT chord dataset item and reconstruct it with a
# trained VAE for visual side-by-side comparison.
dset = D.ChordDatasetSTFTSemisupervised([300])
#model = gen.GenerativeChordnet()
# NOTE(review): `vae` is not imported in this visible chunk — presumably
# imported elsewhere in the file; verify.
model = vae.VUNetSTFT()
model.load("chromavae.model")
feat, labs, align = dset[0]
labs_onehot = U.encode_onehot(labs)
# Soften the one-hot labels and renormalise each row to sum to 1.
# NOTE(review): labs_onehot_blur and phase are unused within this chunk —
# possibly consumed by code past the visible end.
labs_onehot_blur = labs_onehot + 0.2
labs_onehot_blur /= labs_onehot_blur.sum(axis=1, keepdims=True)
phase = dset.getstftphase(0)
# Inference mode: disable training behaviour and gradient bookkeeping.
chainer.config.train = False
chainer.config.enable_backprop = False
# Reconstruct with a batch dimension added and labels aligned per frame.
generated = model.generate_encode_condition(feat[None, ...],
                                            labs_onehot[None, align, :])[0]
#generated = model.reconst(feat)
plt.subplot(3, 1, 1)
specshow(feat[:1024, :64].T)
plt.subplot(3, 1, 2)