import os

import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# basic_function demonstrably provides get_root_path, load_dict, load_df and
# save_dict; the remaining helpers are assumed to live there as well.
from basic_function import (get_root_path, load_dict, load_df, save_dict,
                            get_file_list_in_dir, extract_id_from_file_name,
                            to_str, sigmoid, sigmoid_ver)


def blend(file_name_list, time_str, use_sum=False):
    """Blend the 'prob' columns of several prediction CSVs into blending.csv.

    Every file is stable-sorted by seller_id and then by user_id so the rows
    line up across models before averaging.
    """
    prob = [
        pd.read_csv(os.path.join(get_root_path(), "prediction", time_str,
                                 name))
        .sort_values(by="seller_id", axis=0, kind="mergesort")
        .sort_values(by="user_id", axis=0, kind="mergesort")["prob"]
        for name in file_name_list
    ]
    if use_sum:
        # Arithmetic mean of the raw probabilities.
        final_prob = np.sum(prob, axis=0) / len(prob)
    else:
        # Mean in logit space, mapped back through the sigmoid.
        logits = [sigmoid_ver(p) for p in prob]
        final_prob = sigmoid(np.sum(logits, axis=0) / len(logits))
    us_df = pd.read_csv(
        os.path.join(get_root_path(), "prediction", time_str,
                     file_name_list[0])).sort_values(
            by="seller_id", axis=0, kind="mergesort").sort_values(
            by="user_id", axis=0, kind="mergesort")
    us_df["prob"] = final_prob
    us_df.to_csv(os.path.join(get_root_path(), "prediction", time_str,
                              "blending.csv"),
                 index=False, float_format="%.16f")
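

# A minimal sketch of the two blending modes on synthetic probabilities,
# assuming sigmoid(x) = 1 / (1 + exp(-x)) and sigmoid_ver as its inverse
# (the logit); it does not touch the prediction CSVs above.
def example_blend_modes():
    p1 = np.array([0.9, 0.2, 0.6])
    p2 = np.array([0.8, 0.4, 0.5])
    mean_prob = (p1 + p2) / 2  # what use_sum=True computes
    raw = (np.log(p1 / (1 - p1)) + np.log(p2 / (1 - p2))) / 2
    mean_logit = 1 / (1 + np.exp(-raw))  # what use_sum=False computes
    print(mean_prob, mean_logit)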


def get_outside_train_features():
    """Join the externally computed features (outside.csv) onto the labelled
    training ids and the test id list."""
    train = pd.read_csv(
        os.path.join(get_root_path(), "features", "safe_type_train.csv"))
    train.rename(columns={"id": "file_name"}, inplace=True)
    full_features = pd.read_csv(
        os.path.join(get_root_path(), "features", "outside.csv"), index_col=0)
    full_features["file_name"] = full_features["file_name"].map(
        extract_id_from_file_name)
    test_name_list = load_dict(
        os.path.join(get_root_path(), "features", "test_name_list"))
    test_data = pd.DataFrame(columns=["file_name"],
                             data=np.array(test_name_list))
    test_data["file_name"] = test_data["file_name"].map(
        extract_id_from_file_name)
    # Merge the outside features onto both splits.
    train_data = pd.merge(train, full_features, "left", on="file_name")
    test_data = pd.merge(test_data, full_features, "left", on="file_name")
    label = train_data["safe_type"]
    train_data.drop(columns=["safe_type"], inplace=True)
    return train_data, label, test_data


def load_ft_features(feature_files=None):
    """Load the hand-made train/test feature CSVs and attach labels."""
    if feature_files is None:
        feature_files = {
            "black": "black_features.csv",
            "white": "white_features.csv",
            "test": "test_features.csv",
        }
    train = pd.read_csv(
        os.path.join(get_root_path(), "features", "safe_type_train.csv"))
    train.rename(columns={"id": "file_name"}, inplace=True)
    black_features = pd.read_csv(
        os.path.join(get_root_path(), "features", feature_files["black"]))
    white_features = pd.read_csv(
        os.path.join(get_root_path(), "features", feature_files["white"]))
    full_features = pd.concat([black_features, white_features])
    full_features["file_name"] = full_features["file_name"].map(
        extract_id_from_file_name)
    # Load the test split.
    test_data = pd.read_csv(
        os.path.join(get_root_path(), "features", feature_files["test"]))
    test_data["file_name"] = test_data["file_name"].map(
        extract_id_from_file_name)
    # Keep only ids that have both a label and features.
    train_dat = pd.merge(train, full_features, "inner", on="file_name")
    label = train_dat["safe_type"]
    train_dat.drop(columns=["safe_type"], inplace=True)
    return train_dat, label, test_data
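

# Hedged usage sketch: the loaders keep the "file_name" key column, so a
# downstream model has to drop it before fitting. LGBMClassifier is only an
# illustrative choice here, not necessarily the project's actual model.
def example_train_on_ft_features():
    import lightgbm as lgb
    train, label, test = load_ft_features()
    clf = lgb.LGBMClassifier(n_estimators=100)
    clf.fit(train.drop(columns=["file_name"]), label)
    return clf.predict_proba(test.drop(columns=["file_name"]))[:, 1]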


def stage2_api_new(feature_num=500):
    """Fit a word n-gram tf-idf on the stage-2 API strings and save the
    transformed matrix plus the matching file-name list."""
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents="unicode",
                              use_idf=True,
                              smooth_idf=True,
                              sublinear_tf=True,
                              max_features=feature_num)
    stage2 = load_df(os.path.join("features", "stage2"))
    black_output, name_list = to_str(stage2, mode=1)
    print("fitting tf-idf ...")
    api_vec.fit(black_output)
    print("fit done, transforming ...")
    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "stage2_name_list" + str(feature_num)))
    black_output = api_vec.transform(black_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + str(feature_num) + ".npz"), black_output)


def stage_2_attribute(suffix="_dll",
                      use_less_value=False,
                      type_name="",
                      map_func=None,
                      max_feature=1000):
    """Vectorise one stage-2 attribute column (e.g. DLL paths) with tf-idf."""
    stage2 = load_df(os.path.join("features", "stage2" + suffix), mode=1)
    if use_less_value:
        if map_func is None:
            # Default reduction: keep only the last path component.
            stage2["value"] = stage2["value"].map(lambda x: x.split("\\")[-1])
        else:
            stage2["value"] = stage2["value"].map(map_func)
    stage2_output, name_list = to_str(stage2, mode=1, column_name="value")
    # Forward the caller's suffix instead of hard-coding "_dll".
    api_vec, _ = train_tf_idf(suffix=suffix,
                              use_less_value=use_less_value,
                              map_func=map_func,
                              data=stage2_output,
                              max_feature=max_feature)
    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "stage2_name_list" + suffix + type_name))
    stage2_output = api_vec.transform(stage2_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + suffix + type_name + ".npz"), stage2_output)
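

# Hedged example call: vectorise the DLL attribute keeping only the
# lower-cased base name of each path. The "_base" type_name and the lambda
# are illustrative values, not ones used elsewhere in the project.
def example_stage2_dll():
    stage_2_attribute(suffix="_dll",
                      use_less_value=True,
                      type_name="_base",
                      map_func=lambda x: x.split("\\")[-1].lower(),
                      max_feature=1000)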


def load_clustering_statics_files():
    """Load the stage-2 clustering statistics from outside_stage2.csv."""
    full_features = pd.read_csv(
        os.path.join(get_root_path(), "features", "outside_stage2.csv"),
        index_col=0)
    full_features["file_name"] = full_features["file_name"].map(
        extract_id_from_file_name)
    return full_features


def load_stage2_tf_idf(suffix, type_name=""):
    """Load a saved stage-2 tf-idf matrix back into a DataFrame keyed by
    file_name."""
    stage2 = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + suffix + type_name + ".npz")).toarray()
    train_data = pd.DataFrame(stage2)
    stage2_name_list = load_dict(
        os.path.join(get_root_path(), "features",
                     "stage2_name_list" + suffix + type_name))
    train_data["file_name"] = stage2_name_list
    train_data["file_name"] = train_data["file_name"].map(
        extract_id_from_file_name)
    return train_data


def train_tf_idf(suffix="_dll",
                 use_less_value=False,
                 map_func=None,
                 max_feature=2000,
                 data=None):
    """Fit the shared tf-idf vectoriser.

    If `data` is None the white/black/test attribute tables are loaded and
    concatenated into the corpus; otherwise the caller supplies the corpus
    directly and only the fitted vectoriser is returned.
    """
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents="unicode",
                              use_idf=True,
                              smooth_idf=True,
                              sublinear_tf=True,
                              max_features=max_feature)
    if data is None:
        white = load_df(os.path.join(get_root_path(), "features",
                                     "white" + suffix), mode=1)
        black = load_df(os.path.join(get_root_path(), "features",
                                     "black" + suffix), mode=1)
        test = load_df(os.path.join(get_root_path(), "features",
                                    "test" + suffix), mode=1)
        if use_less_value:
            if map_func is None:
                # Default reduction: keep only the last path component.
                for i in [white, black, test]:
                    i["value"] = i["value"].map(lambda x: x.split("\\")[-1])
            else:
                for i in [white, black, test]:
                    i["value"] = i["value"].map(map_func)
        full = pd.concat([white, black, test])
        full_str = to_str(full, column_name="value")
    else:
        full_str = data
    print("fitting tf-idf ...")
    api_vec.fit(full_str)
    print("fit done")
    if data is None:
        return api_vec, [white, black, test]
    return api_vec, None
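

# Illustration of what the vectoriser configuration means: to_str is assumed
# to join each sample's tokens into one space-separated string, so word
# n-grams of length 1-5 capture short call subsequences. Synthetic corpus;
# the default min_df is used so nothing is filtered out of this tiny example.
def example_api_ngrams():
    docs = ["LoadLibrary GetProcAddress VirtualAlloc",
            "CreateFile WriteFile CloseHandle"]
    vec = TfidfVectorizer(ngram_range=(1, 5))
    matrix = vec.fit_transform(docs)
    print(sorted(vec.vocabulary_)[:10], matrix.shape)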


def load_tianchi_tf_idf():
    """Load the Tianchi tf-idf matrix and join its labels from
    security_train.csv."""
    stage2 = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features", "tianchi.npz")).toarray()
    train_data = pd.DataFrame(stage2)
    stage2_name_list = load_dict(
        os.path.join(get_root_path(), "features", "tianchi_name_list"))
    train_data["file_name"] = stage2_name_list
    train_data["file_name"] = train_data["file_name"].map(
        extract_id_from_file_name)
    tianchi = pd.read_csv("security_train.csv")[["label",
                                                 "file_id"]].drop_duplicates()
    tianchi = tianchi.rename(columns={"file_id": "file_name"})
    full = pd.merge(train_data, tianchi, how="left", on="file_name")
    label = full["label"]
    return train_data, label


def load_tfidf_sparse_features(suffix):
    """Load the saved tf-idf matrices without densifying them."""
    black = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "black" + suffix + ".npz"))
    white = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "white" + suffix + ".npz"))
    test = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "test" + suffix + ".npz"))
    white_file_id = load_dict("white_name_list")
    black_file_id = load_dict("black_name_list")
    # Black samples are labelled 1, white samples 0.
    black_l = np.ones((black.shape[0], ))
    white_l = np.zeros((white.shape[0], ))
    train_data = scipy.sparse.vstack([black, white])
    label = pd.DataFrame(np.concatenate((black_l, white_l), axis=0))
    test_df = test
    file_id = load_dict(
        os.path.join(get_root_path(), "test_name_list" + suffix))
    return (train_data, label, test_df, file_id,
            np.array(black_file_id + white_file_id))
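

# Hedged note in code: this loader keeps scipy sparse matrices, which e.g.
# sklearn's LogisticRegression accepts directly, avoiding the dense
# .toarray() blow-up of load_tfidf_features below.
def example_sparse_fit(suffix="_dll"):
    from sklearn.linear_model import LogisticRegression
    train, label, test, file_id, _ = load_tfidf_sparse_features(suffix)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train, label.values.ravel())
    return clf.predict_proba(test)[:, 1], file_id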


def load_autoencoder_features():
    """Load the autoencoder embeddings saved as .npy files."""
    features = np.load("train_nn.npy")
    print(features.shape)
    name = np.load("file_name_list_stage2.npy")
    label = np.load("label_nn.npy")
    features = pd.DataFrame(data=features)
    features["file_name"] = name
    features["file_name"] = features["file_name"].map(
        extract_id_from_file_name)
    # The test features are not used here; they are loaded only so the
    # return signature matches the other loaders.
    test = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features", "test.npz")).toarray()
    test_df = pd.DataFrame(test)
    test_name_list = load_dict(
        os.path.join(get_root_path(), "features", "test_name_list"))
    test_df["file_name"] = test_name_list
    test_df["file_name"] = test_df["file_name"].map(
        extract_id_from_file_name)
    return features, label, test_df


def load_nn_features():
    """Load the neural-network feature CSVs and attach labels."""
    train = pd.read_csv(
        os.path.join(get_root_path(), "features", "safe_type_train.csv"))
    train.rename(columns={"id": "file_name"}, inplace=True)
    train_features = pd.read_csv(
        os.path.join(get_root_path(), "features", "train_nn.csv"))
    train_features["file_name"] = train_features["file_name"].map(
        extract_id_from_file_name)
    # Load the test split.
    test_data = pd.read_csv(
        os.path.join(get_root_path(), "features", "test_nn.csv"))
    test_data["file_name"] = test_data["file_name"].map(
        extract_id_from_file_name)
    # Keep only ids that have both a label and features.
    train_dat = pd.merge(train, train_features, "inner", on="file_name")
    label = train_dat["safe_type"]
    train_dat.drop(columns=["safe_type"], inplace=True)
    return train_dat, label, test_data


def load_tfidf_features(suffix, type_name=""):
    """Load the black/white/test tf-idf matrices as dense DataFrames keyed
    by file_name."""
    black = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features",
                     "black" + suffix + type_name + ".npz")).toarray()
    white = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features",
                     "white" + suffix + type_name + ".npz")).toarray()
    test = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features",
                     "test" + suffix + type_name + ".npz")).toarray()
    # Black samples are labelled 1, white samples 0.
    black_l = np.ones((black.shape[0], ))
    white_l = np.zeros((white.shape[0], ))
    train_data = pd.DataFrame(np.concatenate((black, white), axis=0))
    label = pd.DataFrame(np.concatenate((black_l, white_l), axis=0))
    test_df = pd.DataFrame(test)
    black_name_list = load_dict(
        os.path.join(get_root_path(), "features",
                     "black_name_list" + suffix + type_name))
    white_name_list = load_dict(
        os.path.join(get_root_path(), "features",
                     "white_name_list" + suffix + type_name))
    train_name_list = np.concatenate((black_name_list, white_name_list),
                                     axis=0)
    test_name_list = load_dict(
        os.path.join(get_root_path(), "features",
                     "test_name_list" + suffix + type_name))
    train_data["file_name"] = train_name_list
    train_data["file_name"] = train_data["file_name"].map(
        extract_id_from_file_name)
    test_df["file_name"] = test_name_list
    test_df["file_name"] = test_df["file_name"].map(
        extract_id_from_file_name)
    return train_data, label, test_df
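

# Hedged sketch: every loader exposes a normalised "file_name" column, so
# feature blocks built with different suffixes can be joined on it. "_dll"
# is used above; "_reg" is a hypothetical second suffix. The left join keeps
# the row order of the first block, so `label` stays aligned.
def example_combine_blocks():
    dll_train, label, dll_test = load_tfidf_features("_dll")
    reg_train, _, reg_test = load_tfidf_features("_reg")
    train = pd.merge(dll_train, reg_train, how="left", on="file_name",
                     suffixes=("_dll", "_reg"))
    test = pd.merge(dll_test, reg_test, how="left", on="file_name",
                    suffixes=("_dll", "_reg"))
    return train, label, test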


def attribution(suffix="_dll",
                use_less_value=False,
                type_name="",
                map_func=None,
                max_feature=2000):
    """Fit tf-idf on one attribute type and save the transformed
    black/white/test matrices plus their file-name lists."""
    # Forward the caller's suffix so non-"_dll" attributes load their own
    # white/black/test tables.
    api_vec, data = train_tf_idf(suffix=suffix,
                                 use_less_value=use_less_value,
                                 map_func=map_func,
                                 max_feature=max_feature)
    white, black, test = data
    black_output, name_list = to_str(black, mode=1, column_name="value")
    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "black_name_list" + suffix + type_name))
    black_output = api_vec.transform(black_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "black" + suffix + type_name + ".npz"), black_output)
    white_output, name_list = to_str(white, mode=1, column_name="value")
    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "white_name_list" + suffix + type_name))
    white_output = api_vec.transform(white_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "white" + suffix + type_name + ".npz"), white_output)
    test_str, name_list = to_str(test, mode=1, column_name="value")
    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "test_name_list" + suffix + type_name))
    test_output = api_vec.transform(test_str)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "test" + suffix + type_name + ".npz"), test_output)
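

# Hedged end-to-end sketch for one attribute type: build the tf-idf matrices
# on disk, then load them back as DataFrames for modelling.
def example_dll_pipeline():
    attribution(suffix="_dll", use_less_value=True, max_feature=2000)
    return load_tfidf_features("_dll")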


if __name__ == "__main__":
    name = "new"
    blend(file_name_list=get_file_list_in_dir(
        os.path.join(get_root_path(), "prediction", name)),
        time_str=name,
        use_sum=False)


import os
import pickle

import numpy as np
import pandas as pd
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from basic_function import load_dict, load_df, get_root_path, save_dict
from metrics import com_acc
from model import get_model
from shorten_api_list import delete_repeat_pattern, delete_same_pattern

# Assumed meaning: (max API-sequence length, embedding size).
shape = (512, 64)
# Number of distinct API names plus one padding index (inferred from the +1).
input_dim = 92 + 1
batch_size = 32
epochs = 50
class_num = 2

api_dict = load_dict(os.path.join(get_root_path(), "features", "api_dict.txt"))
white = load_df(os.path.join(get_root_path(), "features", "white"),
                mode=1)[["file_name", "api_name", "call_time"]]
black = load_df(os.path.join(get_root_path(), "features", "black"),
                mode=1)[["file_name", "api_name", "call_time"]]
white_label = np.zeros(white.shape[0])
black_label = np.ones(black.shape[0])
full = pd.concat([white, black], sort=False)
label = np.concatenate((white_label, black_label))
full["label"] = label
# Map every API name to its integer index.
full["api_name"] = full["api_name"].map(api_dict)
# full = load_df(os.path.join(get_root_path(), "features", "stage2"),
#                mode=1)[['file_name', 'api_name', 'call_time']]
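

# Hedged sketch of the padding step implied by `shape` and pad_sequences:
# group the mapped api_name indices per file and pad/truncate each sequence
# to shape[0]. The grouping is one plausible reading; the downstream
# training code is not shown here.
def example_build_sequences(df=full):
    seqs = df.groupby("file_name")["api_name"].apply(list).tolist()
    return pad_sequences(seqs, maxlen=shape[0], padding="post",
                         truncating="post")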