def setUp(self):  # pylint: disable=g-missing-super-call
    """Test fixture: load the complete dataset and record its size.

    Reads ./data/complete_data twice: once through read_data() into
    self.data, and once raw to count how many lines (records) the file
    holds, stored in self.num.

    Attributes set:
      self.data: whatever read_data() returns for the complete dataset.
      self.num_dic: per-label counters (wing/ring/slope/negative), all
        zeroed here — presumably incremented by the tests; TODO confirm.
      self.num: number of lines in ./data/complete_data.
    """
    self.data = read_data("./data/complete_data")
    self.num_dic = {"wing": 0, "ring": 0, "slope": 0, "negative": 0}
    # Count raw lines directly; the original kept the full line list only
    # to take its length.
    with open("./data/complete_data", "r") as f:
        self.num = len(f.readlines())
# NOTE(review): this chunk is whitespace-mangled — several definitions are
# collapsed onto a single physical line.  It contains, in order:
#   (1) the tail of a data-splitting function (prints the train/valid/test
#       sizes and returns the three splits; its `def` line is above this
#       chunk — presumably person_split, judging by the caller below; verify),
#   (2) write_data(data_to_write, path): serializes each record with
#       json.dumps(ensure_ascii=False) and writes one JSON object per line,
#   (3) the script entry point: reads ./data/complete_data, splits it by
#       person name into train/valid/test via person_split, creates
#       ./person_split/ if missing, and writes the three splits there.
# The line is preserved byte-identical below; reformatting is unsafe while
# the function header for fragment (1) is outside this view.
print("train_length:" + str(len(train_data))) print("valid_length:" + str(len(valid_data))) print("test_length:" + str(len(test_data))) return train_data, valid_data, test_data # Write data to file def write_data(data_to_write, path): with open(path, "w") as f: for idx, item in enumerate(data_to_write): # pylint: disable=unused-variable,redefined-outer-name dic = json.dumps(item, ensure_ascii=False) f.write(dic) f.write("\n") if __name__ == "__main__": data = read_data("./data/complete_data") train_names = [ "hyw", "shiyun", "tangsy", "dengyl", "jiangyh", "xunkai", "negative3", "negative4", "negative5", "negative6" ] valid_names = ["lsj", "pengxl", "negative2", "negative7"] test_names = ["liucx", "zhangxy", "negative1", "negative8"] train_data, valid_data, test_data = person_split(data, train_names, valid_names, test_names) if not os.path.exists("./person_split"): os.makedirs("./person_split") write_data(train_data, "./person_split/train") write_data(valid_data, "./person_split/valid") write_data(test_data, "./person_split/test")
# NOTE(review): this chunk is whitespace-mangled and incomplete at both ends.
# It contains:
#   (1) the tail of a list literal of feature column names (lexical URL
#       features: hostname/pathname/search statistics, TLD id, digit ratios,
#       entropy, etc.) — the opening bracket and the variable it is assigned
#       to are above this view,
#   (2) the start of a script entry point: loads train/cv/test frames via
#       data_split.read_data(), maps label 'good'->0 / anything else ->1,
#       drops the URL and label columns, and runs
#       use_sklearn.vote_to_predict on the cv and test sets; the final
#       "combine" phase begins with reading fishtank_features.csv and
#       continues past the end of this chunk.
# Preserved byte-identical below; reformatting is unsafe while both
# fragments are cut off outside this view.
'at_flag', 'dig_ratio', 'special_ch', 'special_ch_kind', 'TLD_id', 'hash_token_n', 'hostname_a', 'hostname_b', 'hostname_c', 'hostname_ch_n', 'hostname_d', 'hostname_dig_ratio', 'hostname_e', 'hostname_entropy', 'hostname_f', 'hostname_g', 'hostname_h', 'hostname_i', 'hostname_is_ip', 'hostname_j', 'hostname_k', 'hostname_l', 'hostname_len', 'hostname_letter_ratio', 'hostname_m', 'hostname_n', 'hostname_o', 'hostname_p', 'hostname_point_n', 'hostname_q', 'hostname_r', 'hostname_s', 'hostname_std', 'hostname_t', 'hostname_token_n', 'hostname_u', 'hostname_v', 'hostname_w', 'hostname_x', 'hostname_y', 'hostname_z', 'pathname_ch_kind', 'pathname_depth', 'pathname_len', 'pathname_longest_token', 'pathname_std', 'pathname_token_n', 'search_and_n', 'search_len', 'search_std', 'search_token_n' ] if __name__ == '__main__': data_train, data_cv, data_test = data_split.read_data() # use_sklearn.multi_machine_learing_models(data_train,data_cv) print('---------------cv') y_cv = data_cv['label'].apply(lambda x: 0 if x == 'good' else 1) x_cv = data_cv.drop(['URL', 'label'], axis=1) use_sklearn.vote_to_predict(x_cv, y_cv) print('---------------test') y_test = data_test['label'].apply(lambda x: 0 if x == 'good' else 1) x_test = data_test.drop(['URL', 'label'], axis=1) use_sklearn.vote_to_predict(x_test, y_test) print('---------------combine') x_fish = pd.read_csv("fishtank_features.csv")