Example #1
0
# Location of the pre-trained mol2vec embedding used by the RNN pipeline.
_MOL2VEC_MODEL_PATH = 'models/model_300dim.pkl'


def _train_keras_classifier(clf, hp, train_data, test_data, model_path):
    """Compile, fit and evaluate a Keras classifier.

    Args:
        clf: An uncompiled Keras model.
        hp: Hyperparameter mapping; reads "learning_rate", "batch_size"
            and "epochs".
        train_data: ``(x_train, y_train)`` tuple.
        test_data: ``(x_test, y_test)`` tuple, used both as validation data
            during fitting and for the final evaluation.
        model_path: Forwarded to ``get_callbacks``.

    Returns:
        The second value returned by ``clf.evaluate``, i.e. the "accuracy"
        metric requested at compile time (the first is the loss).
    """
    x_train, y_train = train_data
    x_test, y_test = test_data
    clf.compile(optimizer=tf.keras.optimizers.Adam(
        learning_rate=hp["learning_rate"]),
                loss="categorical_crossentropy",
                metrics=['accuracy'])
    clf.fit(x_train,
            y_train,
            batch_size=hp["batch_size"],
            epochs=hp["epochs"],
            validation_data=(x_test, y_test),
            callbacks=get_callbacks(model_path))
    return clf.evaluate(x_test, y_test, batch_size=hp["batch_size"])[1]


def main():
    """Command-line entry point.

    Parses the arguments and dispatches on the requested task:

    * ``train``: vectorize the dataset, hold out 10% as a test split, fit
      the selected model (``dummy``, ``mlp`` or ``rnn``) and write the test
      arrays plus a ``resultats.json`` summary to ``args.output_dir``.
    * ``predict``: load a trained model and print its prediction for a
      single SMILES string.
    * ``evaluate``: load a trained model and print its score on a dataset.
    * ``server-start``: load a trained model and run the app built by
      ``get_app``.

    Raises:
        ValueError: If the ``train`` task is requested with a model type
            other than ``dummy``, ``mlp`` or ``rnn``.
    """
    parser = get_parser()
    args = parser.parse_args()
    checked_args = check_argument_parsing(args)

    multi = args.objective == "multi-metrics"
    is_rnn = args.model_type == "rnn"

    if checked_args.task == "train":
        df = load_dataset(args.fname)

        # vectorization: the RNN consumes mol2vec embeddings of the SMILES
        # strings, every other model consumes the tabular features.
        y = vectorizes_label(df, multi)
        if is_rnn:
            mol2vec = word2vec.Word2Vec.load(_MOL2VEC_MODEL_PATH)
            x = vec_mol2vec_smile(df["smiles"].tolist(), mol2vec)
        else:
            x = vectorizes_features(df)

        # train-test-split (stratification is only requested for the
        # single-label objective)
        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            stratify=y if not multi else None,
            train_size=0.9,
            random_state=args.random_state)

        # exist_ok avoids the check-then-create race and handles nested paths
        os.makedirs(args.output_dir, exist_ok=True)
        # useful for retesting on the same arrays
        for arr, arr_name in ((x_test, "x_test.npy"), (y_test, "y_test.npy")):
            np.save(os.path.join(args.output_dir, arr_name), arr)

        if args.model_type == "dummy":
            clf = DummyClassifier(strategy="most_frequent")
            clf.fit(x_train, y_train)
            score = clf.score(x_test, y_test)

            with open(args.model, "wb") as f:
                pickle.dump(clf, f)

        elif args.model_type in ("mlp", "rnn"):
            with open(args.hyperparameters) as f:
                hp = json.load(f)
            n_outputs = 1 if args.objective == "single-metrics" else 9
            if is_rnn:
                clf = get_rnn(n_outputs, hp["neurons"], hp["dropout_rate"])
            else:
                clf = get_mlp(n_outputs, hp["neurons"], hp["dropout_rate"],
                              hp["activation"])
            # NOTE(review): no explicit save here; presumably the callbacks
            # built from args.model persist the network -- confirm.
            score = _train_keras_classifier(clf, hp, (x_train, y_train),
                                            (x_test, y_test), args.model)

        else:
            # Previously this fell through to a NameError on `score`.
            raise ValueError(
                "unsupported model type: {!r}".format(args.model_type))

        results = {
            "model-type": args.model_type,
            "random_state": args.random_state,
            "score": float(score)
        }
        with open(os.path.join(args.output_dir, "resultats.json"), "w") as f:
            json.dump(results, f)

    else:
        if args.model_type == "dummy":
            # NOTE: pickle.load executes arbitrary code; only load model
            # files that come from a trusted source.
            with open(args.model, "rb") as f:
                clf = pickle.load(f)
        elif args.model_type == "mlp":
            clf = tf.keras.models.load_model(args.model)
        elif args.model_type == "rnn":
            clf = tf.keras.models.load_model(args.model)
            mol2vec = word2vec.Word2Vec.load(_MOL2VEC_MODEL_PATH)

        if checked_args.task == "predict":
            # Best-effort: vectorization/prediction failures (e.g. an
            # unparsable SMILES) are reported instead of crashing the CLI.
            try:
                if args.model_type in ["dummy", "mlp"]:
                    x = vectorizes_smile(checked_args.smile)
                else:
                    x = vec_mol2vec_smile([args.smile], mol2vec)
                print("mol: {}, {}".format(args.smile, clf.predict(x)))
            except Exception as e:
                print(e)

        elif checked_args.task == "evaluate":
            df = load_dataset(args.fname)
            if is_rnn:
                x_test = vec_mol2vec_smile(df["smiles"].tolist(), mol2vec)
            else:
                x_test = vectorizes_features(df)
            y_test = vectorizes_label(df, multi)
            if args.model_type == "dummy":
                score = clf.score(x_test, y_test)
            else:
                # Keras models expose evaluate(), not score(); index 1 is
                # the first compiled metric. Assumes the saved model was
                # compiled with metrics=['accuracy'] as in training --
                # TODO confirm.
                score = clf.evaluate(x_test, y_test)[1]
            print(score)

        # API
        elif checked_args.task == "server-start":
            # NOTE(review): this passes args.mol2vec (from the parsed
            # arguments), not the local `mol2vec` loaded above -- confirm
            # that is what get_app expects.
            app = get_app(clf, args.model_type, args.mol2vec)
            app.run()