def train_convolution_network(x):
    prediction = cnn(x)
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    hm_epochs = 10
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        for epoch in range(hm_epochs):
            epoch_loss = 0
            for i in range(int(len(train_x) / batch_size)):
                epoch_x, epoch_y = next_batch(batch_size)
                _, c = sess.run([optimizer, cost],
                                feed_dict={x: epoch_x, y: epoch_y})
                epoch_loss += c
            print('Epoch', epoch, 'completed out of', hm_epochs,
                  'loss:', epoch_loss)
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy:', accuracy.eval({
            x: test_x,
            y: categorical(test_y[:, 0], drop=True)
        }))  # categorical
def load():
    """
    Loads the Grunfeld data and returns a Dataset class.

    Returns
    -------
    Dataset instance:
        See DATASET_PROPOSAL.txt for more information.

    Notes
    -----
    raw_data has the firm variable expanded to dummy variables for each
    firm (i.e., there is no reference dummy)
    """
    filepath = dirname(abspath(__file__))
    data = recfromtxt(open(filepath + '/grunfeld.csv', 'rb'), delimiter=",",
                      names=True, dtype="f8,f8,f8,a17,f8")
    names = list(data.dtype.names)
    endog = array(data[names[0]], dtype=float)
    endog_name = names[0]
    exog = data[list(names[1:])]
    exog_name = list(names[1:])
    dataset = Dataset(data=data, names=names, endog=endog, exog=exog,
                      endog_name=endog_name, exog_name=exog_name)
    raw_data = categorical(data, col='firm', drop=True)
    dataset.raw_data = raw_data
    return dataset
def next_batch(batch_size):
    global test_counter
    batch_counter = test_counter
    test_counter = batch_counter + 1
    return train_x[batch_counter:(batch_counter + batch_size), :], categorical(
        train_y[batch_counter:(batch_counter + batch_size)][:, 0], drop=True)
def testScale():
    x = np.array([[1, 2, 1, 0, 2]])  # values of var1 of all samples
    print x.shape
    y_arr = np.array([['T', 'F2', 'F2', 'T', 'F1']])
    z = np.array([[1, 2, 2, 1, 2]])
    min_max_scaler = preprocessing.MinMaxScaler().fit(z)
    print z
    x_train_minmax = min_max_scaler.fit_transform(z)
    cat_arr = categorical(y_arr, drop=True)  ## each row belongs to one sample
    cat_arr_trans = np.transpose(cat_arr)  ## each row = 1 var
    data_temp = np.array([x_train_minmax[0]])
    print x_train_minmax[0]
def load():
    """
    Loads the Grunfeld data and returns a Dataset class.

    Returns
    -------
    Dataset instance:
        See DATASET_PROPOSAL.txt for more information.

    Notes
    -----
    raw_data has the firm variable expanded to dummy variables for each
    firm (i.e., there is no reference dummy)
    """
    data = _get_data()
    raw_data = categorical(data, col='firm', drop=True)
    ds = du.process_recarray(data, endog_idx=0, stack=False)
    ds.raw_data = raw_data
    return ds
def imputeMissingValue(path, fname, att_name_list, category_arr_list):
    imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
    att_value_hash = dict()
    for name in att_name_list:
        att_value_hash[name] = []
    f_r = open(path + fname, "r")
    for line in f_r.readlines()[1:]:
        att_arr = line.strip().split(",")
        for val, att_name in zip(att_arr, att_name_list):
            val = val.strip()
            if val == "None" or val == "NA":
                val = np.nan
            att_value_hash[att_name].append(val)
    b = att_value_hash[att_name_list[1]]
    imp.fit(b)
    impb = imp.transform(b)
    data = impb
    for att_name, val_arr in att_value_hash.items():
        if att_name == "ID":
            continue
        b = np.array([att_value_hash[att_name]])
        if att_name in category_arr_list:
            cat_matrix, cat_dict = categorical(b, drop=True, dictnames=True)
            inv_dict = {v: k for k, v in cat_dict.items()}
            newb = np.array([[inv_dict[d] for d in b[0]]])
        else:
            newb = b
        imp.fit(newb)
        impb = imp.transform(newb)
        data = np.concatenate((data, impb), axis=0)
    return data, att_value_hash
def test_structarray1d(self):
    instr = self.structdes['instrument'].view(dtype=[('var1', 'f4')])
    dum = tools.categorical(instr)
    test_dum = np.column_stack(([dum[_] for _ in dum.dtype.names[-5:]]))
    assert_array_equal(test_dum, self.dummy)
    assert_equal(len(dum.dtype.names), 6)
def test_recarray1d_drop(self):
    instr = self.structdes['instrument'].view(np.recarray)
    dum = tools.categorical(instr, drop=True)
    test_dum = np.column_stack(([dum[_] for _ in dum.dtype.names]))
    assert_array_equal(test_dum, self.dummy)
    assert_equal(len(dum.dtype.names), 5)
def test_array1d(self):
    des = tools.categorical(self.instr)
    assert_array_equal(des[:, -5:], self.dummy)
    assert_equal(des.shape[1], 6)
def test_structarray2d_drop(self):
    des = tools.categorical(self.structdes, col='str_instr', drop=True)
    test_des = np.column_stack(([des[_] for _ in des.dtype.names[-5:]]))
    assert_array_equal(test_des, self.dummy)
    assert_equal(len(des.dtype.names), 8)
def test_recarray1d(self):
    instr = self.structdes['str_instr'].view(np.recarray)
    dum = tools.categorical(instr)
    test_dum = np.column_stack(([dum[_] for _ in dum.dtype.names[-5:]]))
    assert_array_equal(test_dum, self.dummy)
    assert_equal(len(dum.dtype.names), 6)
# df_train = pd.read_csv('matrix_nn_train.csv', sep=',')
# df_test = pd.read_csv('matrix_nn_test.csv', sep=',')
df_train = pd.read_csv("train.csv", sep=",")
df_test = pd.read_csv("test.csv", sep=",")
ind_train = len(df_train.index)
col = len(df_train.columns)
ind_test = len(df_test.index)
input_train = np.array(df_train.iloc[:, 1:col])
tmp1 = np.array(df_train.iloc[:, 0])
target_train = categorical(tmp1, drop=True).argmax(1)
print input_train.shape
input_test = np.array(df_test.iloc[:, 1:col])
tmp2 = np.array(df_test.iloc[:, 0])
target_test = categorical(tmp2, drop=True).argmax(1)
print input_test.shape
print "...building the instances..."
train = [Instance(input_train[i], [target_train[i]]) for i in range(ind_train)]
test = [Instance(input_test[i], [target_test[i]]) for i in range(ind_test)]
n_inputs = col - 1
n_outputs = 1
n_hiddens = 300
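A small illustration, not part of the original script, of how the target columns above are built, assuming the older statsmodels API where tools.categorical is still available (it has since been deprecated): drop=True one-hot encodes the label column, and argmax(1) maps each row back to an integer class index.

import numpy as np
from statsmodels.tools import categorical

labels = np.array(['cat', 'dog', 'cat', 'bird'])  # hypothetical label column
onehot = categorical(labels, drop=True)           # shape (4, 3): one column per distinct label
targets = onehot.argmax(1)                        # e.g. array([1, 2, 1, 0]) with sorted labels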
def test_array1d_drop(self):
    des = tools.categorical(self.string_var, drop=True)
    assert_array_equal(des, self.dummy)
    assert_equal(des.shape[1], 5)
def test_recarray2d(self):
    des = tools.categorical(self.recdes, col='str_instr')
    # better way to do this?
    test_des = np.column_stack(([des[_] for _ in des.dtype.names[-5:]]))
    assert_array_equal(test_des, self.dummy)
    assert_equal(len(des.dtype.names), 9)
ccc['droprate_uc'] = aaa['drop']  # number of dropouts per username_count
dtrain = dtrain.reset_index().merge(ccc, on='username_count',
                                    how='left').sort('index').drop('index', axis=1)
dtest = dtest.reset_index().merge(ccc, on='username_count',
                                  how='left').sort('index').drop('index', axis=1)
#dtrain = dtrain.sort('enrollment_id')
#dtest = dtest.sort('enrollment_id')
dtrain.index = range(0, len(dtrain))
dtest.index = range(0, len(dtest))
del dtrain['drop']
X = pd.concat([dtrain, dtest])
X.index = range(0, len(X))
# dummy-encode course_id
b = categorical(np.array(X['course_id']), drop=True)
b = pd.DataFrame(b)
#aaa = pd.factorize(X['username'])  # factorize username
#dtrain['username'] = aaa[0][:len(dtrain)]
#dtrain[dtrain['username_count'] <= 5]['username'] = -999
#bbb = pd.factorize(X['course_id'])  # factorize course_id
#dtrain['course_id'] = bbb[0][:len(dtrain)]
#del dtrain['enrollment_id'], dtrain['username'], dtrain['course_id']
dtrain = pd.concat([b[:len(dtrain)], dtrain], axis=1)
#label = dtrain['drop']
#del dtrain['drop']
#dtrain.iloc[:, 39:] = dtrain.iloc[:, 39:].applymap(f)  # log transform
def test_structarray2dint(self):
    des = tools.categorical(self.structdes, col=3)
    test_des = np.column_stack(([des[_] for _ in des.dtype.names[-5:]]))
    assert_array_equal(test_des, self.dummy)
    assert_equal(len(des.dtype.names), 9)
def normalize(att_value_hash, boolean_arr_list, integer_arr_list,
              ignore_arr_list, select_features, category_arr_list):
    min_max_scaler = preprocessing.MinMaxScaler()
    newname_arr = []
    norm_value_hash = dict()
    ## keep original boolean attributes
    for att_name in boolean_arr_list:
        if att_name in ignore_arr_list:
            continue
        if att_name in select_features:
            norm_value_hash[att_name] = np.array([att_value_hash[att_name]])
            newname_arr.append(att_name)
    ## normalize integer attributes: rescale to [0, 1] with the min-max scaler
    minmax_hash = dict()
    for att_name in integer_arr_list:
        if att_name in ignore_arr_list:
            continue
        if att_name in select_features:
            val_arr = np.array([[float(val.strip())
                                 for val in att_value_hash[att_name]]]).transpose()
            minmax_hash[att_name] = (min(val_arr), max(val_arr))
            norm_val_arr = min_max_scaler.fit_transform(val_arr)
            norm_value_hash[att_name] = norm_val_arr.transpose()
            newname_arr.append(att_name)
    ## normalize categorical features: 1-to-k encode categorical data
    newcatname_arr = []
    for att_name in category_arr_list:
        if att_name in ignore_arr_list:
            continue
        if att_name in select_features:
            val_arr = np.array([att_value_hash[att_name]])
            ## pass the first dimension (val_arr[0]) so that each row = 1 sample
            cat_matrix = categorical(val_arr[0], drop=True, dictnames=True)
            cat_matrix_trans = np.transpose(cat_matrix[0])  ## transpose so that each row = 1 var
            for index, ori_val in cat_matrix[1].items():
                new_att_name = att_name + "-" + ori_val
                norm_value_hash[new_att_name] = cat_matrix_trans[index]
                newname_arr.append(new_att_name)
                newcatname_arr.append(new_att_name)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
    b = norm_value_hash[select_features[0]][0]
    imp.fit(b)
    b = imp.transform(b)
    data = np.array(b)
    for newname in newname_arr:
        if newname == select_features[0]:
            continue
        if newname not in newcatname_arr:
            b = norm_value_hash[newname][0]
            imp.fit(b)
            b = imp.transform(b)
            data = np.concatenate((data, b), axis=0)
        else:
            b = np.array([norm_value_hash[newname]])
            data = np.concatenate((data, b), axis=0)
    return data, newname_arr, newcatname_arr, minmax_hash
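For reference, a minimal sketch (not from the original code, assuming the older statsmodels.tools.categorical API) of the 1-to-k encoding used above: with dictnames=True the function returns the dummy matrix together with a {column index: original label} mapping, which is what drives the "att-label" column names built in the loop.

import numpy as np
from statsmodels.tools import categorical

labels = np.array(['T', 'F2', 'F2', 'T', 'F1'])
dummies, names = categorical(labels, drop=True, dictnames=True)
# dummies has shape (5, 3): one row per sample, one column per distinct label
# names maps column index to label, e.g. {0: 'F1', 1: 'F2', 2: 'T'}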
def test_structarray1d_drop(self):
    instr = self.structdes['str_instr'].view(dtype=[('var1', 'a10')])
    dum = tools.categorical(instr, drop=True)
    test_dum = np.column_stack(([dum[_] for _ in dum.dtype.names]))
    assert_array_equal(test_dum, self.dummy)
    assert_equal(len(dum.dtype.names), 5)
# log transform
f = lambda x: np.log(1 + x**2) if x > 1 else x
dtrain = pd.read_csv(
    '/Users/IkkiTanaka/Documents/KDDCup/fe_train/shuffle_enr_train_depth.csv')
dtest = pd.read_csv(
    '/Users/IkkiTanaka/Documents/KDDCup/fe_test/new_enr_test_depth.csv')
label = dtrain['drop']
del dtrain['drop']
X = pd.concat([dtrain, dtest])
X.index = range(0, len(X))
# dummy-encode course_id
b = categorical(np.array(X['course_id']), drop=True)
b = pd.DataFrame(b)
#aaa = pd.factorize(X['username'])  # factorize username
#dtrain['username'] = aaa[0][:len(dtrain)]
#dtrain[dtrain['username_count'] <= 5]['username'] = -999
#bbb = pd.factorize(X['course_id'])  # factorize course_id
#dtrain['course_id'] = bbb[0][:len(dtrain)]
del dtrain['enrollment_id'], dtrain['username'], dtrain['course_id']
dtrain = pd.concat([b[:len(dtrain)], dtrain], axis=1)
#label = dtrain['drop']
#del dtrain['drop']
#dtrain.iloc[:, 39:] = dtrain.iloc[:, 39:].applymap(f)  # log transform
def test_array2d_drop(self):
    des = np.column_stack((self.des, self.instr, self.des))
    des = tools.categorical(des, col=2, drop=True)
    assert_array_equal(des[:, -5:], self.dummy)
    assert_equal(des.shape[1], 9)
def normalize():
    machine = "aws"
    aws = preprocessing.MinMaxScaler()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
    min_max_scaler = preprocessing.MinMaxScaler()
    if machine == "amm":
        prefix = "/home/amm/"
    else:
        prefix = "/home/ubuntu/Desktop/sna_utcc/"
    path = prefix + "upwork/data/"
    f_r = open(path + "appt_dump_transformed.csv", "r")
    f_w = open(path + "appt_dump_normMinMax.csv", "w")
    ## new attribute names in the transformed file
    att_name_list = [
        "ID", "AllowPush", "AdOptedIn", "NumCampaignMatch", "Carrier",
        "AppVersion", "AllowiBeacon", "AllowGeo", "AllowFeaturePush",
        "ScreenHeight", "AllowBT", "HaveUniqueGlobalID", "NumCrash",
        "DailyUsage", "Country", "LastUpdateDays", "DeviceModel",
        "BlockPushTF", "BlockPushSameday", "BlockPushAfterDays", "OS",
        "OSVersion", "RevokePushTF", "RevokePushBefore", "RevokePushSameday",
        "RevokePushAfterDays", "SignIn", "UninstalledTF", "UninstalledSameday",
        "UninstalledAfter", "ScreenWidth", "EmailExist", "EmailAddress",
        "InstallDays", "PushCount", "Timezone", "UserType", "Questions",
        "CorrectQuestion"
    ]
    print len(att_name_list)
    boolean_arr_list = [
        "AllowPush", "AdOptedIn", "AllowiBeacon", "AllowGeo",
        "AllowFeaturePush", "AllowBT", "HaveUniqueGlobalID", "SignIn",
        "EmailExist", "EmailAddress", "BlockPushTF", "BlockPushSameday",
        "RevokePushTF", "RevokePushBefore", "RevokePushSameday",
        "UninstalledTF", "UninstalledSameday"
    ]
    category_arr_list = [
        "Carrier", "AppVersion", "DeviceModel", "OS", "UserType", "OSVersion",
        "Timezone", "ScreenWidth", "ScreenHeight", "Country"
    ]
    integer_arr_list = [
        "NumCampaignMatch", "NumCrash", "DailyUsage", "InstallDays",
        "PushCount", "Questions", "CorrectQuestion", "BlockPushAfterDays",
        "RevokePushAfterDays", "UninstalledAfter", "LastUpdateDays"
    ]
    ignore_arr_list = [
        "ID", "Carrier", "AdOptedIn", "AllowiBeacon", "HaveUniqueGlobalID",
        "OS", "SignIn", "EmailExist", "UserType"
    ]
    att_value_hash = dict()
    norm_value_hash = dict()
    newname_arr = []
    for att_name in att_name_list:
        att_value_hash[att_name] = []
    for line in f_r.readlines()[1:]:
        att_arr = line.strip().split(",")
        if len(att_arr) > 39:
            print att_arr
        for val, att_name in zip(att_arr, att_name_list):
            val = val.strip()
            if val == "None":
                val = np.nan
            att_value_hash[att_name].append(val)
    b = att_value_hash["AllowPush"]
    imp.fit(b)
    impb = imp.transform(b)
    original_data = impb
    for att_name, val_arr in att_value_hash.items():
        if att_name == "ID":
            continue
        b = np.array([att_value_hash[att_name]])
        if att_name in category_arr_list:
            cat_matrix, cat_dict = categorical(b, drop=True, dictnames=True)
            inv_dict = {v: k for k, v in cat_dict.items()}
            newb = np.array([[inv_dict[d] for d in b[0]]])
        else:
            newb = b
        imp.fit(newb)
        impb = imp.transform(newb)
        original_data = np.concatenate((original_data, impb), axis=0)

    ## select the top-k features with PCA
    k = 30
    pca = PCA(n_components=1)
    transpose_data = original_data.transpose()
    pca.fit(transpose_data)
    #new_data = pca.fit_transform(original_data)
    topk_arr = np.abs(pca.components_[0]).argsort()[::-1][:k]
    select_features = list(
        set([att_name_list[i] for i in topk_arr]).difference(set(ignore_arr_list)))

    ## normalize feature values
    #norm_value_hash["ID"] = np.array([att_value_hash["ID"]])
    ## keep original boolean attributes
    for att_name in boolean_arr_list:
        if att_name in ignore_arr_list:
            continue
        if att_name in select_features:
            norm_value_hash[att_name] = np.array([att_value_hash[att_name]])
            newname_arr.append(att_name)
    ## normalize integer attributes
    minmax_hash = dict()
    for att_name in integer_arr_list:
        if att_name in ignore_arr_list:
            continue
        if att_name in select_features:
            val_arr = np.array([[float(val.strip())
                                 for val in att_value_hash[att_name]]]).transpose()
            minmax_hash[att_name] = (min(val_arr), max(val_arr))
            norm_val_arr = min_max_scaler.fit_transform(val_arr)
            norm_value_hash[att_name] = norm_val_arr.transpose()
            newname_arr.append(att_name)
    ## 1-to-k encode categorical data
    newcatname_arr = []
    for att_name in category_arr_list:
        if att_name in ignore_arr_list:
            continue
        if att_name in select_features:
            val_arr = np.array([att_value_hash[att_name]])
            ## pass the first dimension (val_arr[0]) so that each row = 1 sample
            cat_matrix = categorical(val_arr[0], drop=True, dictnames=True)
            cat_matrix_trans = np.transpose(cat_matrix[0])  ## each row = 1 var
            for index, ori_val in cat_matrix[1].items():
                new_att_name = att_name + "-" + ori_val
                norm_value_hash[new_att_name] = cat_matrix_trans[index]
                newname_arr.append(new_att_name)
                newcatname_arr.append(new_att_name)
    b = norm_value_hash[select_features[0]][0]
    imp.fit(b)
    b = imp.transform(b)
    data = np.array(b)
    for newname in newname_arr:
        if newname == select_features[0]:
            continue
        if newname not in newcatname_arr:
            b = norm_value_hash[newname][0]
            imp.fit(b)
            b = imp.transform(b)
            data = np.concatenate((data, b), axis=0)
        else:
            b = np.array([norm_value_hash[newname]])
            data = np.concatenate((data, b), axis=0)
    data = np.transpose(data)  ## row = one sample

    ## perform k-means and select the best k by silhouette score
    maxsilh = float('-inf')
    centroid_best = []
    kbest = 0
    for ncluster in range(4, 5):
        centroid_arr, silhouette_avg = cluster(data, ncluster)
        if silhouette_avg > maxsilh:
            maxsilh = silhouette_avg
            centroid_best = centroid_arr
            kbest = ncluster
    #noncatname_arr = list(set(newname_arr).difference(newcatname_arr))
    print "Best k = " + str(kbest)

    ## convert dummy variables back to the original categories
    cat_hash_cluster = dict()
    for centroid, no in zip(centroid_best, range(0, kbest)):
        cat_hash_cluster[no] = dict()
        max_hash = dict()
        max_val = dict()
        for attname in newcatname_arr:
            max_hash[attname] = 0
            max_val[attname] = ""
        for cval, attname in zip(centroid, newname_arr):
            if attname in newcatname_arr:
                if attname.startswith("Time"):
                    mainname = "Timezone"
                    catname = attname.replace("Timezone-", "")
                else:
                    mainname, catname = attname.split("-")
                if cval > max_hash[attname]:
                    max_hash[attname] = cval
                    cat_hash_cluster[no][mainname] = catname
    '''
    for cno in cat_hash_cluster.keys():
        print cno
        cat_hash = cat_hash_cluster[cno]
        for name, val in cat_hash.items():
            print (name, val)
    '''
    for centroid, no in zip(centroid_best, range(0, kbest)):
        noncat_cenarr_val = []
        noncat_cenarr_name = []
        cat_cenarr_val = []
        cat_cenarr_name = []
        for cval, attname in zip(centroid, newname_arr):
            if attname in newcatname_arr:
                if attname.startswith("Time"):
                    mainname = "Timezone"
                    catname = attname.replace("Timezone-", "")
                else:
                    mainname, catname = attname.split("-")
                cval = cat_hash_cluster[no][mainname]
                cat_cenarr_val.append(cval)
                cat_cenarr_name.append(mainname)
            else:
                noncat_cenarr_val.append(cval)
                noncat_cenarr_name.append(attname)
        for name in noncat_cenarr_name:
            print name
        for name in cat_hash_cluster[no].keys():
            print name
        print ""
        print "\nCluster no " + str(no)
        for val, name in zip(noncat_cenarr_val, noncat_cenarr_name):
            if name in integer_arr_list:
                ## undo the min-max scaling for integer attributes
                ori = val * (minmax_hash[name][1][0]
                             - minmax_hash[name][0][0]) + minmax_hash[name][0][0]
                print ori
            elif name in boolean_arr_list:
                #print str(bool(val >= 0.5))
                print val
            else:
                print val
        cat_hash = cat_hash_cluster[no]
        for name, val in cat_hash.items():
            print val
        print ""