コード例 #1
0
def main():
    """Vectorize raw features and train a LightGBM model from the command line.

    Command-line arguments:
        -d/--datadir: existing directory that contains ``features.jsonl``.
        -o/--output: directory that receives ``model.txt``; created when missing.

    Exits through ``parser.error`` when the data directory or the feature
    file is missing.
    """
    parser = argparse.ArgumentParser()
    # Both options are required: without them the path checks below would
    # receive None and fail with an unhelpful TypeError.
    parser.add_argument("-d", "--datadir", help="Features Directory", type=str,
                        required=True)
    parser.add_argument("-o", "--output", help="output Directory", type=str,
                        required=True)
    args = parser.parse_args()

    # isdir (not exists) so that a plain file is rejected, matching the message.
    if not os.path.isdir(args.datadir):
        parser.error("{} is not a directory".format(args.datadir))

    features_path = os.path.join(args.datadir, 'features.jsonl')
    if not os.path.isfile(features_path):
        parser.error("{} does not exist".format(features_path))

    # makedirs creates missing parent directories and tolerates an existing one.
    os.makedirs(args.output, exist_ok=True)

    # Count the valid JSON objects in features.jsonl; invalid lines are skipped.
    with jsonlines.open(features_path) as reader:
        rows = sum(1 for _ in reader.iter(type=dict, skip_invalid=True))

    # NOTE(review): `clear` is defined outside this view; presumably it removes
    # stale vectorized files from the data directory - confirm.
    clear(args.datadir)
    ember.create_vectorized_features(args.datadir, rows)

    # Train and save model
    print("Training LightGBM model")
    lgbm_model = ember.train_model(args.datadir, rows)
    lgbm_model.save_model(os.path.join(args.output, "model.txt"))
コード例 #2
0
ファイル: init_ember.py プロジェクト: zangobot/ember
def main():
    """Command-line entry point: vectorize, describe and optionally train.

    Positional argument DATADIR must be an existing directory. Vectorized
    features are created when ``X_train.dat`` / ``y_train.dat`` are not both
    present. ``--metadata`` additionally creates the metadata CSVs and
    ``--train`` trains a LightGBM model that is saved as ``model.txt`` inside
    DATADIR. ``--optimize`` only has an effect together with ``--train``.
    """
    parser = argparse.ArgumentParser(
        prog="train_ember",
        description="Train an ember model from a directory with raw feature files",
    )
    parser.add_argument("-v", "--featureversion", type=int, default=2,
                        help="EMBER feature version")
    parser.add_argument("-m", "--metadata", action="store_true",
                        help="Create metadata CSVs")
    parser.add_argument("-t", "--train", action="store_true",
                        help="Train an EMBER model")
    parser.add_argument("datadir", metavar="DATADIR", type=str,
                        help="Directory with raw features")
    parser.add_argument("--optimize", action="store_true",
                        help="gridsearch to find best parameters")
    args = parser.parse_args()

    data_dir = args.datadir
    version = args.featureversion

    # isdir is already False for a path that does not exist.
    if not os.path.isdir(data_dir):
        parser.error(
            "{} is not a directory with raw feature files".format(data_dir))

    # Vectorize only when at least one of the two training files is missing.
    vectorized_files = [
        os.path.join(data_dir, file_name)
        for file_name in ("X_train.dat", "y_train.dat")
    ]
    if not all(os.path.exists(path) for path in vectorized_files):
        print("Creating vectorized features")
        ember.create_vectorized_features(data_dir, version)

    if args.metadata:
        ember.create_metadata(data_dir)

    if not args.train:
        return

    if args.optimize:
        # The grid-search result replaces the default parameters entirely.
        params = ember.optimize_model(data_dir)
        print("Best parameters: ")
        print(json.dumps(params, indent=2))
    else:
        params = {
            "boosting": "gbdt",
            "objective": "binary",
            "num_iterations": 1000,
            "learning_rate": 0.05,
            "num_leaves": 2048,
            "max_depth": 15,
            "min_data_in_leaf": 50,
            "feature_fraction": 0.5,
        }

    print("Training LightGBM model")
    model = ember.train_model(data_dir, params, version)
    model.save_model(os.path.join(data_dir, "model.txt"))
コード例 #3
0
ファイル: train_ember.1.py プロジェクト: sherplus/ember
def main(datadir='/home/mira/research/dataset/ember.2'):
    """Vectorize (when needed) and train a LightGBM model for one dataset.

    Args:
        datadir: Directory holding the raw features. Defaults to the dataset
            location this script was originally written for.

    Raises:
        SystemExit: With exit status 1 when ``datadir`` is not an existing
            directory.
    """
    if not os.path.isdir(datadir):
        print("not a path")
        # Stop here: continuing would only fail later inside ember with a
        # less helpful error.
        raise SystemExit(1)

    X_train_path = os.path.join(datadir, "X_train.dat")
    y_train_path = os.path.join(datadir, "y_train.dat")
    # Vectorize only when at least one of the two training files is missing.
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("[{}] Creating vectorized features".format(
            datetime.datetime.now()))
        ember.create_vectorized_features(datadir)

    print("[{}] Training LightGBM model".format(datetime.datetime.now()))
    lgbm_model = ember.train_model(datadir)
    lgbm_model.save_model(os.path.join(datadir, "model.txt"))
    print("[{}] Done".format(datetime.datetime.now()))
コード例 #4
0
def train_multiple(data_dir):
    """Train ten models with the same settings and save each one.

    Every model is written to ``data_dir`` as
    ``ember_model_2018_random<i>.txt`` with ``i`` running from 0 to 9.

    NOTE(review): all runs receive identical parameters and no explicit seed;
    whether the resulting models actually differ depends on nondeterminism
    inside ``ember.train_model`` - confirm.
    """
    params = dict(
        boosting="gbdt",
        objective="binary",
        num_iterations=1000,
        learning_rate=0.05,
        num_leaves=2048,
        feature_fraction=0.5,
        bagging_fraction=1.0,
        max_depth=15,
        min_data_in_leaf=50,
    )
    run_count = 10
    for run_index in range(run_count):
        model_path = os.path.join(
            data_dir, f"ember_model_2018_random{run_index}.txt")
        trained = ember.train_model(data_dir, params, 2)
        trained.save_model(model_path)
コード例 #5
0
def main():
    """Command-line entry point: vectorize when needed, then train and save.

    Positional argument DATADIR must be an existing directory. Vectorized
    features are created when ``X_train.dat`` / ``y_train.dat`` are not both
    present; the trained model is saved as ``model.txt`` inside DATADIR.
    """
    parser = argparse.ArgumentParser(
        prog="train_ember",
        description="Train an ember model from a directory with raw feature files",
    )
    parser.add_argument("datadir", metavar="DATADIR", type=str,
                        help="Directory with raw features")
    data_dir = parser.parse_args().datadir

    # isdir is already False for a path that does not exist.
    if not os.path.isdir(data_dir):
        parser.error(
            "{} is not a directory with raw feature files".format(data_dir))

    required_files = ("X_train.dat", "y_train.dat")
    have_vectors = all(
        os.path.exists(os.path.join(data_dir, file_name))
        for file_name in required_files
    )
    if not have_vectors:
        print("Creating vectorized features")
        ember.create_vectorized_features(data_dir)

    print("Training LightGBM model")
    model = ember.train_model(data_dir)
    model.save_model(os.path.join(data_dir, "model.txt"))
コード例 #6
0
 def trainModel(self, vectorizedDataDir):
     """Train a model via ``ember.train_model`` and keep it on the instance.

     The result is stored in ``self.lgbm_model`` and also returned.
     """
     trained = ember.train_model(vectorizedDataDir)
     self.lgbm_model = trained
     return self.lgbm_model
コード例 #7
0
ファイル: main.py プロジェクト: SYHPARK/maldetect
                         y_train[train_rows],
                         epochs=3,
                         verbose=2,
                         validation_data=(x_test, y_test))

# In[ ]:

# Convert the test labels with to_categorical and show the resulting shape.
# NOTE(review): to_categorical is imported outside this view; presumably the
# Keras one-hot helper - confirm.
y_binary = to_categorical(y_test)
print(y_binary.shape)

# In[ ]:

# EMBER model

# Parameter dictionary handed unchanged to ember.train_model below.
params = {
    "boosting": "gbdt",
    "objective": "binary",
    "num_iterations": 1000,
    "learning_rate": 0.05,
    "num_leaves": 2048,
    "max_depth": 15,
    "min_data_in_leaf": 50,
    "feature_fraction": 0.5
}

print("training lightGBM model")
# NOTE(review): the literal 2 is presumably the EMBER feature version - confirm
# against ember.train_model's signature.
lgbm_model = ember.train_model(datadir, params, 2)
# The trained model is written to model.txt inside datadir.
lgbm_model.save_model(os.path.join(datadir, "model.txt"))

# In[ ]:
コード例 #8
0
def main():
    """Train a model on EMBER features, save it and plot feature importances.

    Command-line arguments:
        DATADIR: existing directory with the raw features.
        -v/--featureversion: EMBER feature version (default 2); also part of
            the vectorized file names, the model file name and the plot name.
        --optimize: replace the default parameters with a grid-search result.
        --modelname: accepted for compatibility.
            NOTE(review): the value is currently not used anywhere below.

    The model is written to ``model_<version>.txt`` inside DATADIR and the
    importance plot to ``lgbm_importances-0<version>.png`` in the current
    working directory. The plot is opened with ``xdg-open`` when available.
    """
    # Local imports: only the viewer launch at the end needs them.
    import shutil
    import subprocess

    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("--modelname",
                        type=str,
                        default="SGD",
                        help="Model name")
    parser.add_argument("-v",
                        "--featureversion",
                        type=int,
                        default=2,
                        help="EMBER feature version")
    parser.add_argument("datadir",
                        metavar="DATADIR",
                        type=str,
                        help="Directory with raw features")
    parser.add_argument("--optimize",
                        help="gridsearch to find best parameters",
                        action="store_true")
    args = parser.parse_args()

    # isdir is already False for a path that does not exist.
    if not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(
            args.datadir))

    X_train_path = os.path.join(args.datadir,
                                f"X_train_{args.featureversion}.dat")
    y_train_path = os.path.join(args.datadir,
                                f"y_train_{args.featureversion}.dat")
    # if they don't exist, compute them.
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir, args.featureversion)

    # NOTE(review): the objective is "regression" although the log message
    # below announces a classifier - confirm this is intended.
    params = {
        "boosting": "gbdt",
        "objective": "regression",
        "num_iterations": 1000,
        "learning_rate": 0.05,
        "num_leaves": 2048,
        "max_depth": 15,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.5,
        "num_threads": 2,
    }
    if args.optimize:
        # The grid-search result replaces the default parameters entirely.
        params = ember.optimize_model(args.datadir)
        print("Best parameters: ")
        print(json.dumps(params, indent=2))

    print("Training Classifier model")
    lgbm_model = ember.train_model(args.datadir, params, args.featureversion)

    # Report the real target path, and only after the file has been written.
    model_path = os.path.join(args.datadir,
                              f"model_{args.featureversion}.txt")
    lgbm_model.save_model(model_path)
    print(f"file dumped into {model_path} .... ")

    print('Plotting feature importances...')
    lgb.plot_importance(lgbm_model, max_num_features=10)
    plot_path = f'lgbm_importances-0{args.featureversion}.png'
    plt.savefig(plot_path)

    # Open the plot with the desktop viewer. An argument list avoids the
    # shell, and the lookup keeps systems without xdg-open from failing.
    viewer = shutil.which("xdg-open")
    if viewer is None:
        print(f"xdg-open not found; plot saved to {plot_path}")
    else:
        subprocess.run([viewer, plot_path], check=False)