import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.preprocessing import MinMaxScaler

import utils


def generator():
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)

    # Fill missing signal-strength values with 0
    train_data = train_data.fillna(0)
    train_data.to_csv("X.csv")

    # Preprocessing for the CNN
    train_scaled = DataFrame()
    # Normalisation: scale each group of columns to [0, 1]
    labels = [
        'RNCID_', 'CellID_', 'AsuLevel_', 'SignalLevel_',
        'RSSI_', 'Latitude_', 'Longitude_'
    ]
    for label in labels:
        tmp = DataFrame()
        for i in range(1, 8):
            tmp = pd.concat([tmp, train_data[label + str(i)]], axis=1)
        tmp_index = tmp.columns.tolist()
        tmp = tmp.values  # as_matrix() is deprecated; use .values
        # tmp_scaled = scale(tmp)
        min_max_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
        tmp_scaled = min_max_scaler.fit_transform(tmp)
        tmp_scaled = DataFrame(tmp_scaled, columns=tmp_index)
        train_scaled = pd.concat([train_scaled, tmp_scaled], axis=1)
    # train_scaled.to_csv('X_scaled.csv')

    train_scaled = pd.concat([
        train_scaled,
        train_data[['IMSI', 'MRTime', 'Longitude', 'Latitude', 'grid_num']]
    ], axis=1)

    X_ = []
    y_ = []
    for index, row in train_scaled.iterrows():
        y_.append(row['grid_num'])
        x_ = []
        for i in range(1, 8):
            tmp = []
            for label in labels:
                tmp.append(row[label + str(i)])
            x_.append(tmp)
        X_.append(x_)

    # X is the assembled 7x7 array used as the feature
    # y_ is the assembled label
    X = np.array(X_)
    y_ = np.array(y_)

    # One-hot encode the labels (13 * 17 grid cells)
    y = np.zeros(shape=(y_.shape[0], 13 * 17))
    # print(y.shape)
    for i in range(y_.shape[0]):
        y[i][int(y_[i])] = 1
    return X, y
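
# --- Usage sketch (assumption, not part of the original script): one way the
# --- (N, 7, 7) features and one-hot grid labels returned by generator() could
# --- feed a small CNN. The Keras architecture and hyper-parameters below are
# --- illustrative placeholders only.
import tensorflow as tf

X, y = generator()
X = X.reshape(-1, 7, 7, 1)  # add a channel axis for Conv2D

model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(7, 7, 1)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(13 * 17, activation='softmax'),  # one unit per grid cell
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=10, batch_size=32)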
# test_index_label_sampled_np = test_index_label_sampled.values
test_index_sampled = test_index_label_sampled['id']
test_index_sampled_np = test_index_sampled.values  # as_matrix() is deprecated; use .values
# test_label_sampled = test_index_label_sampled['type']
# test_label_sampled_np = test_label_sampled.values

count1 = 0
predict_data_pd = pd.DataFrame(np.zeros((test_index_sampled_np.shape[0], 2)),
                               columns=['id', 'type'])

for test_index, test_filename_str in enumerate(test_index_sampled_np):
    # Read one comma-separated spectrum file into a single-row frame
    test_filename = test_data_path + str(test_filename_str) + '.txt'
    with open(test_filename) as test_file:
        data_test = test_file.read()
    data_test = data_test.split(',')
    data_test = DataFrame(data_test).T
    data_test_np = data_test.values.astype(float)  # values are read as strings; cast before prediction
    print(data_test_np)

    # y_predict = bgc.predict(data_test)
    y_predict = bst.predict(data_test_np)
    # bst returns one probability per class; take the index of the largest one
    y_predict = y_predict.tolist()
    y_predict = y_predict[0].index(max(y_predict[0]))
    print(y_predict)

    predict_data_pd.iloc[count1, 0] = test_filename_str
    predict_data_pd.iloc[count1, 1] = y_predict
    # test_data_pd.iloc[count1, 0:2600] = data_test.iloc[0, :]
    # test_data_pd.iloc[count1, 2600] = test_filename_str
    count1 = count1 + 1
    print(count1)

print(predict_data_pd.head())
map_result = {0: 'star', 1: 'unknown', 2: 'galaxy', 3: 'qso'}
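
# --- Assumed training setup (not in the original snippet): the argmax over
# --- y_predict[0] above only works if bst.predict() accepts a NumPy array and
# --- returns one probability per class. One plausible setup is a LightGBM
# --- multiclass booster; X_train, y_train and the parameters are placeholders.
import lightgbm as lgb

train_set = lgb.Dataset(X_train, label=y_train)  # y_train: integer labels 0-3
params = {
    'objective': 'multiclass',   # emit per-class probabilities
    'num_class': 4,              # star / unknown / galaxy / qso
    'learning_rate': 0.1,
}
bst = lgb.train(params, train_set, num_boost_round=100)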
# test_index_label_sampled = test_index_label.sample(frac=0.01, replace=False, axis=0)
test_index_label_sampled = test_index_label
# test_index_label_sampled_np = test_index_label_sampled.values
test_index_sampled = test_index_label_sampled['id']
test_index_sampled_np = test_index_sampled.values  # as_matrix() is deprecated; use .values
# test_label_sampled = test_index_label_sampled['type']
# test_label_sampled_np = test_label_sampled.values

count1 = 0
predict_data_pd = pd.DataFrame(np.zeros((test_index_sampled_np.shape[0], 2)),
                               columns=['id', 'type'])

for test_index, test_filename_str in enumerate(test_index_sampled_np):
    # Read one comma-separated spectrum file into a single-row frame
    test_filename = test_data_path + str(test_filename_str) + '.txt'
    with open(test_filename) as test_file:
        data_test = test_file.read()
    data_test = data_test.split(',')
    data_test = DataFrame(data_test).T
    data_test_np = data_test.values.astype(float)  # values are read as strings; cast before prediction

    # y_predict = bgc.predict(data_test)
    y_predict = xgb1.predict(data_test_np)  # returns an array holding one class label
    predict_data_pd.iloc[count1, 0] = test_filename_str
    predict_data_pd.iloc[count1, 1] = y_predict[0]
    print(y_predict)
    # test_data_pd.iloc[count1, 0:2600] = data_test.iloc[0, :]
    # test_data_pd.iloc[count1, 2600] = test_filename_str
    count1 = count1 + 1
    print(count1)

print(predict_data_pd.head())
map_result = {0: 'star', 1: 'unknown', 2: 'galaxy', 3: 'qso'}
predict_data_pd['type'] = predict_data_pd['type'].map(map_result)
predict_data_pd.to_csv('G:/Python/Tianwen/predict_data_0.16_XGBoost.csv')
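
# --- Assumed training setup (not in the original snippet): here xgb1.predict()
# --- is used as if it returns integer class labels directly, which matches the
# --- scikit-learn style XGBClassifier. X_train, y_train and the
# --- hyper-parameters below are placeholders.
from xgboost import XGBClassifier

xgb1 = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1)
xgb1.fit(X_train, y_train)  # y_train: integer labels 0-3 (see map_result)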