Esempio n. 1
0
def main():
    """Fit a bag-of-words vectorizer on a sentence corpus and pickle it.

    Configuration comes from the module-level ``args`` object, from which
    ``seed``, ``model``, ``sentences_file`` and ``output_dir`` are read, and
    from ``params.yaml`` in the current working directory, where the entry
    ``train -> <args.model> -> init_kwargs`` supplies the keyword arguments
    for the vectorizer constructor.

    The fitted vectorizer is written to ``<args.output_dir>/model.pkl``.

    Raises:
        ValueError: if ``args.model`` is neither ``"tf_idf"`` nor ``"count"``.
    """
    np.random.seed(args.seed)

    print("Reading params.yaml...")
    # Use a context manager so the params file handle is closed promptly.
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["train"][args.model]

    print("Reading training set...")
    # One document per line; trailing newlines are kept, as readlines() does.
    with open(args.sentences_file, "r") as f:
        corpus = f.readlines()

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Both supported models share the same fit / normalise / pickle steps,
    # so only the class to instantiate depends on the requested model.
    vectorizer_classes = {
        "tf_idf": TfidfVectorizer,
        "count": CountVectorizer,
    }
    vectorizer_cls = vectorizer_classes.get(args.model)
    if vectorizer_cls is None:
        raise ValueError(f"Training not available for model {args.model}!")

    model = vectorizer_cls(**params["init_kwargs"])
    print("Training model...")
    model.fit(corpus)

    # hack: https://github.com/scikit-learn/scikit-learn/issues/18669
    # Re-order the vocabulary by feature index and reset the private
    # ``_stop_words_id`` attribute before pickling.
    # NOTE(review): presumably this makes the pickled file reproducible
    # across runs -- confirm against the linked issue.
    model.vocabulary_ = OrderedDict(
        sorted(model.vocabulary_.items(), key=lambda kv: kv[1]))
    model._stop_words_id = 0

    print("Saving model to disk...")
    with (out_dir / "model.pkl").open("wb") as f:
        pickle.dump(model, f)

    print("Training completed!")