Ejemplo n.º 1
0
def test_create_hashing_vectorizer_nb():
    """Check the structure built for the 'hashing_vectorizer-nb' approach.

    The result must be a Pipeline whose first step is a HashingVectorizer
    and whose second step is a OneVsRestClassifier.
    """
    pipeline = create_model('hashing_vectorizer-nb')
    vectorizer = pipeline.steps[0][1]
    classifier = pipeline.steps[1][1]
    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, HashingVectorizer)
    assert isinstance(classifier, OneVsRestClassifier)
Ejemplo n.º 2
0
def test_create_binaryrelevance_tfidf_knn():
    """Check the structure built for the 'binaryrelevance-tfidf-knn' approach.

    The result must be a Pipeline whose first step is a TfidfVectorizer
    and whose second step is a BinaryRelevance wrapper.
    """
    pipeline = create_model('binaryrelevance-tfidf-knn')
    vectorizer = pipeline.steps[0][1]
    relevance = pipeline.steps[1][1]
    # The classifier wrapped by the second step is not inspected here.
    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, TfidfVectorizer)
    assert isinstance(relevance, BinaryRelevance)
Ejemplo n.º 3
0
def test_create_labelpowerset_tfidf_svm():
    """Check the structure built for the 'labelpowerset-tfidf-svm' approach.

    The result must be a Pipeline of a TfidfVectorizer followed by a
    LabelPowerset whose 'classifier' parameter is an SVC.
    """
    pipeline = create_model('labelpowerset-tfidf-svm')
    vectorizer = pipeline.steps[0][1]
    powerset = pipeline.steps[1][1]
    base_classifier = powerset.get_params()['classifier']
    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, TfidfVectorizer)
    assert isinstance(powerset, LabelPowerset)
    assert isinstance(base_classifier, SVC)
Ejemplo n.º 4
0
def test_create_hashing_vectorizer_svm():
    """Check the structure built for the 'hashing_vectorizer-svm' approach.

    The result must be a Pipeline of a HashingVectorizer followed by a
    OneVsRestClassifier whose 'estimator' parameter is an SGDClassifier.
    """
    pipeline = create_model('hashing_vectorizer-svm')
    vectorizer = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]
    base_estimator = one_vs_rest.get_params()['estimator']
    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, HashingVectorizer)
    assert isinstance(one_vs_rest, OneVsRestClassifier)
    assert isinstance(base_estimator, SGDClassifier)
Ejemplo n.º 5
0
def test_create_sent2vec_sgd():
    """Check the structure built for the 'sent2vec-sgd' approach.

    The result must be a Pipeline of a Sent2VecVectorizer followed by a
    OneVsRestClassifier whose 'estimator' parameter is an SGDClassifier.
    """
    pipeline = create_model('sent2vec-sgd')
    vectorizer = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]
    base_estimator = one_vs_rest.get_params()['estimator']
    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, Sent2VecVectorizer)
    assert isinstance(one_vs_rest, OneVsRestClassifier)
    assert isinstance(base_estimator, SGDClassifier)
Ejemplo n.º 6
0
def test_create_tfidf_gboost():
    """Check the structure built for the 'tfidf-gboost' approach.

    The result must be a Pipeline of a TfidfVectorizer followed by a
    OneVsRestClassifier whose 'estimator' parameter is a
    GradientBoostingClassifier.
    """
    pipeline = create_model('tfidf-gboost')
    vectorizer = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]
    base_estimator = one_vs_rest.get_params()['estimator']
    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, TfidfVectorizer)
    assert isinstance(one_vs_rest, OneVsRestClassifier)
    assert isinstance(base_estimator, GradientBoostingClassifier)
Ejemplo n.º 7
0
def test_create_classifierchain_tfidf_svm():
    """Check the structure built for the 'classifierchain-tfidf-svm' approach.

    The result must be a Pipeline of a TfidfVectorizer followed by a
    ClassifierChain whose 'classifier' parameter is an SVC.
    """
    pipeline = create_model('classifierchain-tfidf-svm')
    vectorizer = pipeline.steps[0][1]
    chain = pipeline.steps[1][1]
    base_classifier = chain.get_params()['classifier']
    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, TfidfVectorizer)
    assert isinstance(chain, ClassifierChain)
    assert isinstance(base_classifier, SVC)
Ejemplo n.º 8
0
def model_path(tmp_path, label_binarizer_path):
    """Fit a "mesh-cnn" model and save it under ``tmp_path``.

    The model is fitted on the names ``X`` and ``Y`` taken from the
    enclosing scope, with ``Y`` transformed by the label binarizer loaded
    via ``load_pickle`` from ``label_binarizer_path``.

    Returns the path ``<tmp_path>/model`` that the model was saved to.
    """
    cnn = create_model("mesh-cnn")
    binarizer = load_pickle(label_binarizer_path)

    cnn.fit(X, binarizer.transform(Y))

    # Use a distinct local name so the function's own name is not shadowed.
    save_location = os.path.join(tmp_path, "model")
    cnn.save(save_location)
    return save_location
Ejemplo n.º 9
0
def test_create_scibert_svm():
    """Check the structure built for the 'scibert-svm' approach.

    The result must be a Pipeline of a BertVectorizer (with ``pretrained``
    equal to 'scibert') followed by a OneVsRestClassifier whose 'estimator'
    parameter is an SVC.
    """
    pipeline = create_model('scibert-svm')
    vectorizer = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]
    base_estimator = one_vs_rest.get_params()['estimator']
    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, BertVectorizer)
    assert isinstance(one_vs_rest, OneVsRestClassifier)
    assert isinstance(base_estimator, SVC)
    assert vectorizer.pretrained == 'scibert'
Ejemplo n.º 10
0
def test_create_sent2vec_tfidf_sgd():
    """Check the structure built for the 'sent2vec-tfidf-sgd' approach.

    The first pipeline step exposes a ``transformer_list`` of two
    sub-pipelines: the first starts with a Sent2VecVectorizer and the
    second with a TfidfVectorizer. The second pipeline step wraps an
    SGDClassifier as its 'estimator' parameter.
    """
    pipeline = create_model('sent2vec-tfidf-sgd')
    features = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]
    sent2vec_step = features.transformer_list[0][1].steps[0][1]
    tfidf_step = features.transformer_list[1][1].steps[0][1]
    base_estimator = one_vs_rest.get_params()['estimator']
    assert isinstance(pipeline, Pipeline)
    assert isinstance(sent2vec_step, Sent2VecVectorizer)
    assert isinstance(tfidf_step, TfidfVectorizer)
    assert isinstance(base_estimator, SGDClassifier)
Ejemplo n.º 11
0
def test_create_tfidf_onehot_scheme_svm():
    """Check the structure built for the 'tfidf+onehot_team-svm' approach.

    The first pipeline step exposes a ``transformer_list`` of two
    sub-pipelines; the second step of each is inspected: a TfidfVectorizer
    in the first and a OneHotEncoder in the second. The second pipeline
    step wraps an SVC as its 'estimator' parameter.
    """
    pipeline = create_model('tfidf+onehot_team-svm')
    features = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]
    tfidf_step = features.transformer_list[0][1].steps[1][1]
    onehot_step = features.transformer_list[1][1].steps[1][1]
    svm = one_vs_rest.get_params()['estimator']
    assert isinstance(pipeline, Pipeline)
    assert isinstance(onehot_step, OneHotEncoder)
    assert isinstance(tfidf_step, TfidfVectorizer)
    assert isinstance(svm, SVC)
Ejemplo n.º 12
0
def train(
    train_data_path,
    label_binarizer_path,
    approach,
    parameters=None,
    model_path=None,
    threshold=None,
    sparse_labels=False,
    cache_path=None,
    data_format="list",
    verbose=True,
):
    """
    Fit a model on the training data and optionally save it to disk.

    train_data_path: path. path to JSONL data that contains "text" and "tags" fields.
    label_binarizer_path: path. path to load or store label_binarizer.
    approach: str. approach to use for modelling e.g. tfidf-svm or bert.
    parameters: str. a stringified dict that contains params that get passed to the model.
    model_path: path, default None. path to save the model. A path ending in
        "pkl" or "pickle" is written as a single pickle file; any other path
        is created as a directory if it does not exist and handed to
        model.save. Nothing is saved when this is falsy.
    threshold: float, default None. Probability on top of which a tag is assigned.
        Not read by this function's body.
    sparse_labels: bool, default False. whether tags (labels) would be sparse for memory efficiency.
    cache_path: path, default None. path to use for caching data transformations for speed.
        Not read by this function's body.
    data_format: str, default list. one of list, generator. generator used for memory efficiency.
    verbose: bool, default True. Not read by this function's body.

    Returns None.
    """

    if os.path.exists(label_binarizer_path):
        print(f"{label_binarizer_path} exists. Loading existing")
        # NOTE(security): unpickling can execute arbitrary code, so
        # label_binarizer_path must only ever point at a trusted file.
        with open(label_binarizer_path, "rb") as f:
            label_binarizer = pickle.load(f)
    else:
        label_binarizer = create_label_binarizer(
            train_data_path, label_binarizer_path, sparse_labels
        )

    model = create_model(approach, parameters)

    # X can be (numpy arrays, lists) or generators
    X_train, _, Y_train, _ = load_train_test_data(
        train_data_path, label_binarizer, data_format=data_format
    )
    model.fit(X_train, Y_train)

    if model_path:
        # str() so that both plain strings and path objects are accepted.
        if str(model_path).endswith(("pkl", "pickle")):
            with open(model_path, "wb") as f:
                pickle.dump(model, f)
        else:
            # Only create the directory when nothing exists at that path yet;
            # an existing path is passed to model.save untouched.
            if not os.path.exists(model_path):
                Path(model_path).mkdir(parents=True, exist_ok=True)
            model.save(model_path)
Ejemplo n.º 13
0
def test_create_mesh_xlinear():
    """Check that the 'mesh-xlinear' approach yields a MeshXLinear instance."""
    built = create_model('mesh-xlinear')
    assert isinstance(built, MeshXLinear)
Ejemplo n.º 14
0
def test_create_scibert():
    """Check that 'scibert' yields a BertClassifier with pretrained == 'scibert'."""
    classifier = create_model('scibert')
    assert isinstance(classifier, BertClassifier)
    assert classifier.pretrained == 'scibert'
Ejemplo n.º 15
0
def test_create_tfidf_transformers_svm():
    """Check that 'tfidf-transformers-svm' yields a TfidfTransformersSVM instance."""
    built = create_model('tfidf-transformers-svm')
    assert isinstance(built, TfidfTransformersSVM)
Ejemplo n.º 16
0
def test_create_spacy_textclassifier():
    """Check that 'spacy-textclassifier' yields a SpacyClassifier instance."""
    built = create_model("spacy-textclassifier")
    assert isinstance(built, SpacyClassifier)
Ejemplo n.º 17
0
def test_create_mesh_cnn():
    """Check that the 'mesh-cnn' approach yields a MeshCNN instance."""
    built = create_model('mesh-cnn')
    assert isinstance(built, MeshCNN)
Ejemplo n.º 18
0
def test_create_cnn():
    """Check the structure built for the 'cnn' approach.

    The first step must be a KerasVectorizer and the second a CNNClassifier.
    """
    pipeline = create_model('cnn')
    vectorizer = pipeline.steps[0][1]
    classifier = pipeline.steps[1][1]
    assert isinstance(vectorizer, KerasVectorizer)
    assert isinstance(classifier, CNNClassifier)
Ejemplo n.º 19
0
def test_create_bilstm():
    """Check the structure built for the 'bilstm' approach.

    The first step must be a KerasVectorizer and the second a BiLSTMClassifier.
    """
    pipeline = create_model('bilstm')
    vectorizer = pipeline.steps[0][1]
    classifier = pipeline.steps[1][1]
    assert isinstance(vectorizer, KerasVectorizer)
    assert isinstance(classifier, BiLSTMClassifier)
Ejemplo n.º 20
0
def test_create_mesh_tfidf_svm():
    """Check that the 'mesh-tfidf-svm' approach yields a MeshTfidfSVM instance."""
    built = create_model('mesh-tfidf-svm')
    assert isinstance(built, MeshTfidfSVM)
Ejemplo n.º 21
0
def test_create_bert():
    """Check that 'bert' yields a BertClassifier with pretrained == 'bert-base-uncased'."""
    classifier = create_model('bert')
    assert isinstance(classifier, BertClassifier)
    assert classifier.pretrained == 'bert-base-uncased'