Example #1
0
def main(argv=None):
    """Train a classifier from a CSV file, report its scores and save it.

    Steps performed:

    1. Parse *argv* (defaults to ``sys.argv[1:]``) with ``__parse_arguments``;
       the result must expose ``data_location``, ``config`` and ``output``.
    2. Load the CSV at ``args.data_location``. Every column except the last
       is used as a feature and the ``"class"`` column is the target
       (NOTE(review): this presumes ``"class"`` is the last column - confirm
       against the data files).
    3. Hold out 30% of the rows as a stratified test set.
    4. Train on the remaining rows using either the built-in SVC grid or the
       JSON document at ``args.config`` (a list of grids triggers a grid
       search in ``train_model``, a single dict is used as plain estimator
       keyword arguments).
    5. Print the model's ``score`` and the macro-averaged F1 score on the
       test set.
    6. Refit on the full data set with the selected parameters and dump the
       resulting model to ``args.output`` with ``joblib``.
    """
    logger = log.get_logger(__name__)
    if argv is None:
        argv = sys.argv[1:]
    args = __parse_arguments(argv)

    # ``DataFrame.from_csv`` has been removed from pandas; this is its
    # documented equivalent (first column as index, dates parsed).
    df = pd.read_csv(args.data_location, index_col=0, parse_dates=True)
    x = df[df.columns[:-1]]
    y = pd.DataFrame(df["class"], columns=["class"])

    x_train, x_test, y_train, y_test = cv.train_test_split(
        x,
        y,
        test_size=0.3,
        stratify=y.values,
    )

    # Default search space: one grid per SVC kernel, all sharing the same
    # candidate values for the regularisation parameter C.
    c_values = [0.1, 0.5, 1, 2, 4, 8, 10, 100]
    config = [
        {
            "C": c_values,
            "kernel": ["linear"],
            "probability": [True],
        },
        {
            "C": c_values,
            "kernel": ["poly"],
            "degree": [2, 3, 4, 5],
            "coef0": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 2, 4, 8, 16],
            "probability": [True],
        },
        {
            "C": c_values,
            "kernel": ["sigmoid"],
            "probability": [True],
        },
        {
            "C": c_values,
            "kernel": ["rbf"],
            "probability": [True],
        },
    ]

    logger.info("training")
    if args.config is not None:
        # A user-supplied configuration replaces the default grid entirely.
        with open(args.config) as config_file:
            config = json.load(config_file)
    model = train_model(x_train, y_train, config)
    logger.info("trained")

    y_predict = model.predict(x_test)
    logger.info("true: %s predicted: %s",
                str(y_test["class"].values), str(y_predict))
    score = metrics.f1_score(y_test, y_predict, average="macro")
    print("Simple score {0}".format(model.score(x_test, y_test)))
    print("F1 score {0}".format(score))

    # Only a grid search result carries ``best_params_``. When the config is
    # a single parameter dict, ``train_model`` returns a plain estimator, so
    # the very same dict is reused for the final fit on the full data set.
    if isinstance(config, list):
        final_params = model.best_params_
    else:
        final_params = config
    model = train_model(x, y, final_params)
    joblib.dump(model, args.output)
Example #2
0
def train_model(x, y, model_parameters, model_cls=svm.SVC):
    """Fit a classifier, optionally selecting its parameters by grid search.

    Parameters:
    *x* should be a dataframe-like object holding the features; its
    ``.values`` are passed to ``fit``.
    *y* should be a dataframe-like object with a ``"class"`` column holding
    the target labels.
    *model_parameters* should either be a dict, or a list of dicts. If a
    dict is provided, it is used as keyword arguments for initializing a
    single *model_cls* instance. If a list is provided, a 3-fold grid
    search scored with macro-averaged F1 is run over those grids using
    4 parallel jobs.
    *model_cls* should be a sklearn class for a desired classifier. Default
    is SVC.

    Returns the fitted object: the ``GridSearchCV`` instance when a list of
    grids was given, otherwise the fitted *model_cls* instance.
    """
    use_grid_search = isinstance(model_parameters, list)
    if use_grid_search:
        model = grid_search.GridSearchCV(
            model_cls(),
            model_parameters,
            cv=3,
            n_jobs=4,
            scoring="f1_macro",
        )
    else:
        model = model_cls(**model_parameters)

    model.fit(x.values, y["class"].values)

    if use_grid_search:
        # best_params_ only exists on the grid search wrapper.
        print("Ideal parameters {0}".format(model.best_params_))
    return model