Example #1
def blend(file_name_list, time_str, use_sum=False):
    """Average the 'prob' column across several prediction files.

    With use_sum=True the probabilities are averaged directly; otherwise
    they are averaged in logit space and mapped back through sigmoid.
    """

    def read_sorted(name):
        # Stable multi-column sort so the rows of every file line up.
        return pd.read_csv(
            os.path.join(get_root_path(), "prediction", time_str,
                         name)).sort_values(by=["user_id", "seller_id"],
                                            kind="mergesort")

    prob = [read_sorted(i)['prob'] for i in file_name_list]
    if use_sum:
        final_prob = np.sum(prob, axis=0) / len(prob)
    else:
        exp = [sigmoid_ver(i) for i in prob]
        final_prob = sigmoid(np.sum(exp, axis=0) / len(exp))
    us_df = read_sorted(file_name_list[0])
    us_df['prob'] = final_prob
    us_df.to_csv(os.path.join(get_root_path(), "prediction", time_str,
                              "blending.csv"),
                 index=False,
                 float_format='%.16f')
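
sigmoid and sigmoid_ver are not defined anywhere in this listing; under the usual reading, sigmoid_ver is the inverse sigmoid (the logit), so the blending happens in logit space. A minimal sketch of the assumed pair:

def sigmoid(x):
    # Logistic function, applied elementwise to an array or Series.
    return 1.0 / (1.0 + np.exp(-x))


def sigmoid_ver(p):
    # Assumed inverse of sigmoid (the logit); requires 0 < p < 1.
    return np.log(p / (1.0 - p))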
Example #2
def get_outside_train_features():
    """Load the 'outside' feature table split into train/test frames."""
    train = pd.read_csv(
        os.path.join(get_root_path(), "features", "safe_type_train.csv"))
    train.rename(columns={"id": "file_name"}, inplace=True)
    full_features = pd.read_csv(os.path.join(get_root_path(), "features",
                                             "outside.csv"),
                                index_col=0)
    full_features["file_name"] = full_features["file_name"].map(
        lambda x: extract_id_from_file_name(x))

    test_name_list = load_dict(
        os.path.join(get_root_path(), "features", "test_name_list"))
    test_data = pd.DataFrame(columns=["file_name"],
                             data=np.array(test_name_list))
    test_data["file_name"] = test_data["file_name"].map(
        lambda x: extract_id_from_file_name(x))

    # merge
    train_data = pd.merge(train, full_features, how="left", on="file_name")
    test_data = pd.merge(test_data, full_features, how="left", on="file_name")

    label = train_data["safe_type"]

    train_data.drop(columns=["safe_type"], inplace=True)

    return train_data, label, test_data
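
The returned triple plugs straight into a scikit-learn style estimator. A minimal sketch, assuming the merged columns are numeric; HistGradientBoostingClassifier is chosen here only because it tolerates the NaNs a left merge can introduce:

from sklearn.ensemble import HistGradientBoostingClassifier

train_data, label, test_data = get_outside_train_features()

# file_name is an identifier, not a feature.
clf = HistGradientBoostingClassifier()
clf.fit(train_data.drop(columns=["file_name"]), label)
test_prob = clf.predict_proba(test_data.drop(columns=["file_name"]))[:, 1]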
Example #3
def load_ft_features(feature_files=None):
    """Load black/white/test feature CSVs and attach the training labels."""
    if feature_files is None:
        feature_files = {
            "black": "black_features.csv",
            "white": "white_features.csv",
            "test": "test_features.csv"
        }
    train = pd.read_csv(
        os.path.join(get_root_path(), "features", "safe_type_train.csv"))
    train.rename(columns={"id": "file_name"}, inplace=True)

    black_features = pd.read_csv(
        os.path.join(get_root_path(), "features", feature_files["black"]))
    white_features = pd.read_csv(
        os.path.join(get_root_path(), "features", feature_files["white"]))
    full_features = pd.concat([black_features, white_features])
    full_features["file_name"] = full_features["file_name"].map(
        lambda x: extract_id_from_file_name(x))

    # load test data
    test_data = pd.read_csv(
        os.path.join(get_root_path(), "features", feature_files["test"]))
    test_data["file_name"] = test_data["file_name"].map(
        lambda x: extract_id_from_file_name(x))

    # merge
    train_dat = pd.merge(train, full_features, how="inner", on="file_name")

    label = train_dat["safe_type"]
    train_dat.drop(columns=["safe_type"], inplace=True)
    return train_dat, label, test_data
Example #4
def stage2_api_new(feature_num=500):
    """Fit tf-idf on the stage2 API strings; save matrix and name list."""
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=True,
                              smooth_idf=True,
                              sublinear_tf=True,
                              max_features=feature_num)

    stage2 = load_df(os.path.join("features", "stage2"))

    stage2_output, name_list = to_str(stage2, mode=1)

    print("fitting tf-idf on stage2 ...")
    api_vec.fit(stage2_output)
    print("tf-idf fit done")

    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "stage2_name_list" + str(feature_num)))
    stage2_output = api_vec.transform(stage2_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + str(feature_num) + ".npz"), stage2_output)
Example #5
def stage_2_attribute(suffix="_dll",
                      use_less_value=False,
                      type_name="",
                      map_func=None,
                      max_feature=1000):
    """Vectorize one stage2 attribute column with tf-idf and save it."""
    stage2 = load_df(os.path.join("features", "stage2" + suffix), mode=1)

    if use_less_value:
        if map_func is None:
            # Default: keep only the basename of a backslash path.
            stage2["value"] = stage2["value"].map(lambda x: x.split("\\")[-1])
        else:
            stage2["value"] = stage2["value"].map(map_func)
    stage2_output, name_list = to_str(stage2, mode=1, column_name="value")
    api_vec, _ = train_tf_idf(suffix=suffix,
                              use_less_value=use_less_value,
                              map_func=map_func,
                              data=stage2_output,
                              max_feature=max_feature)

    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "stage2_name_list" + suffix + type_name))
    stage2_output = api_vec.transform(stage2_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + suffix + type_name + ".npz"), stage2_output)
Example #6
def load_clustering_statics_files():
    """Load the precomputed stage2 statistics keyed by file_name."""
    full_features = pd.read_csv(os.path.join(get_root_path(), "features",
                                             "outside_stage2.csv"),
                                index_col=0)
    full_features["file_name"] = full_features["file_name"].map(
        lambda x: extract_id_from_file_name(x))
    return full_features
Example #7
def load_stage2_tf_idf(suffix, type_name=""):
    """Load a saved stage2 tf-idf matrix as a DataFrame with file ids."""
    stage2 = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + suffix + type_name + ".npz")).toarray()

    train_data = pd.DataFrame(stage2)

    stage2_name_list = load_dict(
        os.path.join(get_root_path(), "features",
                     "stage2_name_list" + suffix + type_name))

    train_data["file_name"] = stage2_name_list
    train_data["file_name"] = train_data["file_name"].map(
        lambda x: extract_id_from_file_name(x))

    return train_data
Example #8
def train_tf_idf(suffix="_dll",
                 use_less_value=False,
                 map_func=None,
                 max_feature=2000,
                 data=None):
    """Fit a tf-idf vectorizer on `data` or on the white/black/test dumps.

    Returns (vectorizer, [white, black, test]) when the frames were
    loaded here, and (vectorizer, None) when a prebuilt corpus came in.
    """
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=True,
                              smooth_idf=True,
                              sublinear_tf=True,
                              max_features=max_feature)

    if data is None:
        white = load_df(os.path.join(get_root_path(), "features",
                                     "white" + suffix),
                        mode=1)
        black = load_df(os.path.join(get_root_path(), "features",
                                     "black" + suffix),
                        mode=1)
        test = load_df(os.path.join(get_root_path(), "features",
                                    "test" + suffix),
                       mode=1)

        if use_less_value:
            if map_func is None:
                # Default: keep only the basename of a backslash path.
                for i in [white, black, test]:
                    i["value"] = i["value"].map(lambda x: x.split("\\")[-1])
            else:
                for i in [white, black, test]:
                    i["value"] = i["value"].map(map_func)

        full = pd.concat([white, black, test])
        full_str = to_str(full, column_name="value")
    else:
        full_str = data

    print("fitting tf-idf ...")
    api_vec.fit(full_str)
    print("tf-idf fit done")
    if data is None:
        return api_vec, [white, black, test]
    return api_vec, None
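
train_tf_idf has two call modes, both visible elsewhere in this listing: attribution (Example #14) lets it load the white/black/test dumps itself, while stage_2_attribute (Example #5) hands it a prebuilt corpus. A sketch of both, where existing_corpus is a hypothetical iterable of strings:

# Mode 1: frames are loaded inside and returned with the vectorizer.
api_vec, (white, black, test) = train_tf_idf(suffix="_dll",
                                             max_feature=2000)

# Mode 2: fit on a prebuilt corpus; no frames come back.
api_vec, _ = train_tf_idf(data=existing_corpus, max_feature=1000)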
Example #9
def load_tianchi_tf_idf():
    """Load the Tianchi tf-idf matrix and its labels."""
    stage2 = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features", "tianchi.npz")).toarray()

    train_data = pd.DataFrame(stage2)

    stage2_name_list = load_dict(
        os.path.join(get_root_path(), "features", "tianchi_name_list"))

    train_data["file_name"] = stage2_name_list
    train_data["file_name"] = train_data["file_name"].map(
        lambda x: extract_id_from_file_name(x))
    tianchi = pd.read_csv("security_train.csv")[["label",
                                                 "file_id"]].drop_duplicates()
    tianchi = tianchi.rename(columns={"file_id": "file_name"})
    full = pd.merge(train_data, tianchi, how="left", on="file_name")
    label = full["label"]

    return train_data, label
Example #10
def load_tfidf_sparse_features(suffix):
    """Load sparse tf-idf matrices plus labels and file-id lists."""
    black = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "black" + suffix + ".npz"))
    white = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "white" + suffix + ".npz"))
    test = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "test" + suffix + ".npz"))

    white_file_id = load_dict("white_name_list")
    black_file_id = load_dict("black_name_list")

    # Black samples are the positive class.
    black_l = np.ones((black.shape[0], ))
    white_l = np.zeros((white.shape[0], ))
    train_data = scipy.sparse.vstack([black, white])
    label = pd.DataFrame(np.concatenate((black_l, white_l), axis=0))

    file_id = load_dict(
        os.path.join(get_root_path(), "test_name_list" + suffix))
    train_file_id = np.array(black_file_id + white_file_id)
    return train_data, label, test, file_id, train_file_id
Example #11
def load_autoencoder_features():
    """Load autoencoder features saved as .npy files, keyed by file_name."""
    features = np.load("train_nn.npy")
    print(features.shape)
    name = np.load("file_name_list_stage2.npy")
    label = np.load("label_nn.npy")
    features = pd.DataFrame(data=features)
    features["file_name"] = name
    features["file_name"] = features["file_name"].map(
        extract_id_from_file_name)

    # The test frame is not used downstream; it is loaded only so the
    # return signature matches the other loaders.
    test = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features", "test.npz")).toarray()
    test_df = pd.DataFrame(test)
    test_name_list = load_dict(
        os.path.join(get_root_path(), "features", "test_name_list"))

    test_df["file_name"] = test_name_list
    test_df["file_name"] = test_df["file_name"].map(
        extract_id_from_file_name)
    return features, label, test_df
Example #12
def load_nn_features():
    """Load the NN-derived train/test features and attach labels."""
    train = pd.read_csv(
        os.path.join(get_root_path(), "features", "safe_type_train.csv"))
    train.rename(columns={"id": "file_name"}, inplace=True)

    train_features = pd.read_csv(
        os.path.join(get_root_path(), "features", "train_nn.csv"))
    train_features["file_name"] = train_features["file_name"].map(
        lambda x: extract_id_from_file_name(x))

    # load test data
    test_data = pd.read_csv(
        os.path.join(get_root_path(), "features", "test_nn.csv"))
    test_data["file_name"] = test_data["file_name"].map(
        lambda x: extract_id_from_file_name(x))

    # merge
    train_dat = pd.merge(train, train_features, how="inner", on="file_name")

    label = train_dat["safe_type"]
    train_dat.drop(columns=["safe_type"], inplace=True)
    return train_dat, label, test_data
Example #13
def load_tfidf_features(suffix, type_name=""):
    """Load dense tf-idf matrices for black/white/test with file ids."""
    black = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features",
                     "black" + suffix + type_name + ".npz")).toarray()
    white = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features",
                     "white" + suffix + type_name + ".npz")).toarray()
    test = scipy.sparse.load_npz(
        os.path.join(get_root_path(), "features",
                     "test" + suffix + type_name + ".npz")).toarray()

    black_l = np.ones((black.shape[0], ))
    white_l = np.zeros((white.shape[0], ))
    train_data = pd.DataFrame(np.concatenate((black, white), axis=0))

    label = pd.DataFrame(np.concatenate((black_l, white_l), axis=0))

    test_df = pd.DataFrame(test)

    black_name_list = load_dict(
        os.path.join(get_root_path(), "features",
                     "black_name_list" + suffix + type_name))
    white_name_list = load_dict(
        os.path.join(get_root_path(), "features",
                     "white_name_list" + suffix + type_name))
    train_name_list = np.concatenate((black_name_list, white_name_list),
                                     axis=0)

    test_name_list = load_dict(
        os.path.join(get_root_path(), "features",
                     "test_name_list" + suffix + type_name))

    train_data["file_name"] = train_name_list
    train_data["file_name"] = train_data["file_name"].map(
        lambda x: extract_id_from_file_name(x))

    test_df["file_name"] = test_name_list
    test_df["file_name"] = test_df["file_name"].map(
        lambda x: extract_id_from_file_name(x))

    return train_data, label, test_df
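
Every loader keys its frames on file_name, so feature families can be joined. A hedged sketch combining these tf-idf features with the statistics table from Example #6:

tfidf_train, label, tfidf_test = load_tfidf_features("_dll")
outside = load_clustering_statics_files()

# Left join keeps the tf-idf rows and attaches the extra statistics.
combined_train = pd.merge(tfidf_train, outside, how="left", on="file_name")
combined_test = pd.merge(tfidf_test, outside, how="left", on="file_name")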
Example #14
def attribution(suffix="_dll",
                use_less_value=False,
                type_name="",
                map_func=None,
                max_feature=2000):
    """Fit tf-idf on one attribute and save black/white/test matrices."""
    # Pass the caller's suffix through so the loaded dumps match the
    # names the matrices are saved under.
    api_vec, data = train_tf_idf(suffix=suffix,
                                 use_less_value=use_less_value,
                                 map_func=map_func,
                                 max_feature=max_feature)

    white, black, test = data

    black_output, name_list = to_str(black, mode=1, column_name="value")
    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "black_name_list" + suffix + type_name))
    black_output = api_vec.transform(black_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "black" + suffix + type_name + ".npz"), black_output)

    white_output, name_list = to_str(white, mode=1, column_name="value")
    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "white_name_list" + suffix + type_name))
    white_output = api_vec.transform(white_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "white" + suffix + type_name + ".npz"), white_output)

    test_str, name_list = to_str(test, mode=1, column_name="value")
    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "test_name_list" + suffix + type_name))
    test_output = api_vec.transform(test_str)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "test" + suffix + type_name + ".npz"), test_output)
Example #15
def blend(file_name_list, time_str, use_sum=False):
    """Average the 'prob' column across several prediction files."""

    def read_sorted(name):
        # Stable multi-column sort so the rows of every file line up.
        return pd.read_csv(
            os.path.join(get_root_path(), "prediction", time_str,
                         name)).sort_values(by=["user_id", "seller_id"],
                                            kind="mergesort")

    prob = [read_sorted(i)['prob'] for i in file_name_list]
    if use_sum:
        final_prob = np.sum(prob, axis=0) / len(prob)
    else:
        # Average in logit space, then map back through sigmoid.
        exp = [sigmoid_ver(i) for i in prob]
        final_prob = sigmoid(np.sum(exp, axis=0) / len(exp))
    us_df = read_sorted(file_name_list[0])
    us_df['prob'] = final_prob
    us_df.to_csv(os.path.join(get_root_path(), "prediction", time_str,
                              "blending.csv"),
                 index=False,
                 float_format='%.16f')


if __name__ == '__main__':
    name = "new"
    blend(file_name_list=get_file_list_in_dir(
        os.path.join(get_root_path(), "prediction", name)),
          time_str=name,
          use_sum=False)
Example #16
import os
import pickle

import numpy as np
import pandas as pd
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from basic_function import load_dict, load_df, get_root_path, save_dict
from metrics import com_acc
from model import get_model
from shorten_api_list import delete_repeat_pattern, delete_same_pattern

shape = (512, 64)  # assumed (sequence length, embedding width)
input_dim = 92 + 1  # API vocabulary size plus one index for padding
batch_size = 32
epochs = 50
class_num = 2

api_dict = load_dict(os.path.join(get_root_path(), "features", "api_dict.txt"))
white = load_df(os.path.join(get_root_path(), "features", "white"),
                mode=1)[['file_name', 'api_name', 'call_time']]
black = load_df(os.path.join(get_root_path(), "features", "black"),
                mode=1)[['file_name', 'api_name', 'call_time']]

white_label = np.zeros(white.shape[0])  # white = benign = 0
black_label = np.ones(black.shape[0])  # black = malicious = 1

full = pd.concat([white, black], sort=False)
label = np.concatenate((white_label, black_label))
full['label'] = label

full['api_name'] = full['api_name'].map(api_dict)  # API name -> integer id

# full = load_df(os.path.join(get_root_path(), "features", "stage2"), mode=1)[['file_name', 'api_name', 'call_time']]
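
The snippet ends before the sequences are built. A hypothetical continuation, showing how the imported pad_sequences and train_test_split would typically come together here (the per-file grouping and the 0.2 split are assumptions, not the author's code):

# Hypothetical continuation: one API-index sequence per file, padded or
# truncated to shape[0] steps, then a stratified train/validation split.
sequences = full.groupby("file_name")["api_name"].apply(list)
x = pad_sequences(sequences.to_list(), maxlen=shape[0], padding="post")
y = full.groupby("file_name")["label"].first().values
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, stratify=y)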