def test_create_hashing_vectorizer_nb():
    """'hashing_vectorizer-nb' yields a Pipeline of HashingVectorizer + OneVsRestClassifier."""
    pipeline = create_model('hashing_vectorizer-nb')
    # Each pipeline step is indexed as (name, estimator); keep only the estimator.
    vectorizer = pipeline.steps[0][1]
    classifier = pipeline.steps[1][1]

    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, HashingVectorizer)
    assert isinstance(classifier, OneVsRestClassifier)
def test_create_binaryrelevance_tfidf_knn():
    """'binaryrelevance-tfidf-knn' yields a Pipeline of TfidfVectorizer + BinaryRelevance."""
    pipeline = create_model('binaryrelevance-tfidf-knn')
    vectorizer = pipeline.steps[0][1]
    binary_relevance = pipeline.steps[1][1]
    # NOTE: the classifier wrapped by BinaryRelevance
    # (get_params()['classifier']) is intentionally not inspected here.

    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, TfidfVectorizer)
    assert isinstance(binary_relevance, BinaryRelevance)
def test_create_labelpowerset_tfidf_svm():
    """'labelpowerset-tfidf-svm' yields TfidfVectorizer + LabelPowerset wrapping an SVC."""
    pipeline = create_model('labelpowerset-tfidf-svm')
    vectorizer = pipeline.steps[0][1]
    label_powerset = pipeline.steps[1][1]
    # The wrapped estimator is exposed under the 'classifier' parameter.
    wrapped = label_powerset.get_params()['classifier']

    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, TfidfVectorizer)
    assert isinstance(label_powerset, LabelPowerset)
    assert isinstance(wrapped, SVC)
def test_create_hashing_vectorizer_svm():
    """'hashing_vectorizer-svm' yields HashingVectorizer + OneVsRest over an SGDClassifier."""
    pipeline = create_model('hashing_vectorizer-svm')
    vectorizer = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]
    # The per-label base estimator is exposed under the 'estimator' parameter.
    base_estimator = one_vs_rest.get_params()['estimator']

    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, HashingVectorizer)
    assert isinstance(one_vs_rest, OneVsRestClassifier)
    assert isinstance(base_estimator, SGDClassifier)
def test_create_sent2vec_sgd():
    """'sent2vec-sgd' yields Sent2VecVectorizer + OneVsRest over an SGDClassifier."""
    pipeline = create_model('sent2vec-sgd')
    vectorizer = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]
    base_estimator = one_vs_rest.get_params()['estimator']

    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, Sent2VecVectorizer)
    assert isinstance(one_vs_rest, OneVsRestClassifier)
    assert isinstance(base_estimator, SGDClassifier)
def test_create_tfidf_gboost():
    """'tfidf-gboost' yields TfidfVectorizer + OneVsRest over a GradientBoostingClassifier."""
    pipeline = create_model('tfidf-gboost')
    vectorizer = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]
    base_estimator = one_vs_rest.get_params()['estimator']

    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, TfidfVectorizer)
    assert isinstance(one_vs_rest, OneVsRestClassifier)
    assert isinstance(base_estimator, GradientBoostingClassifier)
def test_create_classifierchain_tfidf_svm():
    """'classifierchain-tfidf-svm' yields TfidfVectorizer + ClassifierChain wrapping an SVC."""
    pipeline = create_model('classifierchain-tfidf-svm')
    vectorizer = pipeline.steps[0][1]
    chain = pipeline.steps[1][1]
    # The wrapped estimator is exposed under the 'classifier' parameter.
    wrapped = chain.get_params()['classifier']

    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, TfidfVectorizer)
    assert isinstance(chain, ClassifierChain)
    assert isinstance(wrapped, SVC)
def model_path(tmp_path, label_binarizer_path):
    """Fit a "mesh-cnn" model on the module-level X / Y and save it under tmp_path.

    The labels in Y are transformed with the label binarizer unpickled from
    ``label_binarizer_path`` before fitting.

    NOTE(review): presumably registered as a pytest fixture; no decorator is
    visible in this view -- confirm.

    Returns:
        The path ``<tmp_path>/model`` that the fitted model was saved to.
    """
    cnn = create_model("mesh-cnn")
    binarizer = load_pickle(label_binarizer_path)
    cnn.fit(X, binarizer.transform(Y))

    saved_to = os.path.join(tmp_path, "model")
    cnn.save(saved_to)
    return saved_to
def test_create_scibert_svm():
    """'scibert-svm' yields a scibert BertVectorizer + OneVsRest over an SVC."""
    pipeline = create_model('scibert-svm')
    vectorizer = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]
    base_estimator = one_vs_rest.get_params()['estimator']

    assert isinstance(pipeline, Pipeline)
    assert isinstance(vectorizer, BertVectorizer)
    assert isinstance(one_vs_rest, OneVsRestClassifier)
    assert isinstance(base_estimator, SVC)
    # The vectorizer must be configured with the scibert weights.
    assert vectorizer.pretrained == 'scibert'
def test_create_sent2vec_tfidf_sgd():
    """'sent2vec-tfidf-sgd' combines Sent2Vec and TF-IDF features ahead of an SGDClassifier."""
    pipeline = create_model('sent2vec-tfidf-sgd')
    features = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]

    # Each entry of transformer_list holds a sub-pipeline; the vectorizer is
    # its first step.
    sent2vec_branch = features.transformer_list[0][1]
    sent2vec_vectorizer = sent2vec_branch.steps[0][1]
    tfidf_branch = features.transformer_list[1][1]
    tfidf_vectorizer = tfidf_branch.steps[0][1]
    base_estimator = one_vs_rest.get_params()['estimator']

    assert isinstance(pipeline, Pipeline)
    assert isinstance(sent2vec_vectorizer, Sent2VecVectorizer)
    assert isinstance(tfidf_vectorizer, TfidfVectorizer)
    assert isinstance(base_estimator, SGDClassifier)
def test_create_tfidf_onehot_scheme_svm():
    """'tfidf+onehot_team-svm' combines TF-IDF and one-hot features ahead of an SVC."""
    pipeline = create_model('tfidf+onehot_team-svm')
    features = pipeline.steps[0][1]
    one_vs_rest = pipeline.steps[1][1]

    # Each entry of transformer_list holds a sub-pipeline; the transformer
    # under test is its second step (index 1).
    tfidf_branch = features.transformer_list[0][1]
    tfidf_vectorizer = tfidf_branch.steps[1][1]
    onehot_branch = features.transformer_list[1][1]
    onehot_encoder = onehot_branch.steps[1][1]
    base_estimator = one_vs_rest.get_params()['estimator']

    assert isinstance(pipeline, Pipeline)
    assert isinstance(onehot_encoder, OneHotEncoder)
    assert isinstance(tfidf_vectorizer, TfidfVectorizer)
    assert isinstance(base_estimator, SVC)
def train(
    train_data_path,
    label_binarizer_path,
    approach,
    parameters=None,
    model_path=None,
    threshold=None,
    sparse_labels=False,
    cache_path=None,
    data_format="list",
    verbose=True,
):
    """Fit the model for ``approach`` on the training data and optionally save it.

    Args:
        train_data_path: path to JSONL data that contains "text" and "tags"
            fields.
        label_binarizer_path: path of the pickled label binarizer. If the file
            exists it is loaded; otherwise the path is handed to
            ``create_label_binarizer`` together with the training data.
        approach: str naming the modelling approach, e.g. tfidf-svm or bert.
        parameters: a stringified dict of params, forwarded to
            ``create_model``.
        model_path: where to save the fitted model. Paths ending in "pkl" or
            "pickle" are written as a pickle file; any other path is treated
            as a directory (created if missing) and passed to ``model.save``.
            Nothing is saved when this is falsy.
        threshold: accepted for interface compatibility; not read in this
            function body.
        sparse_labels: bool, forwarded to ``create_label_binarizer``; whether
            tags (labels) would be sparse for memory efficiency.
        cache_path: accepted for interface compatibility; not read in this
            function body.
        data_format: str, one of list, generator; forwarded to
            ``load_train_test_data``.
        verbose: accepted for interface compatibility; not read in this
            function body.
    """
    if not os.path.exists(label_binarizer_path):
        label_binarizer = create_label_binarizer(
            train_data_path, label_binarizer_path, sparse_labels
        )
    else:
        print(f"{label_binarizer_path} exists. Loading existing")
        # NOTE: unpickling executes arbitrary code; only point this at
        # binarizer files from a trusted source.
        with open(label_binarizer_path, "rb") as binarizer_file:
            label_binarizer = pickle.load(binarizer_file)

    model = create_model(approach, parameters)

    # X can be (numpy arrays, lists) or generators
    X_train, _, Y_train, _ = load_train_test_data(
        train_data_path, label_binarizer, data_format=data_format
    )
    model.fit(X_train, Y_train)

    if not model_path:
        return

    if str(model_path).endswith(("pkl", "pickle")):
        # Serialize fully before writing so the bytes hit the file in one go.
        serialized = pickle.dumps(model)
        with open(model_path, "wb") as model_file:
            model_file.write(serialized)
        return

    # Directory-style save: make sure the target exists before delegating.
    if not os.path.exists(model_path):
        Path(model_path).mkdir(parents=True, exist_ok=True)
    model.save(model_path)
def test_create_mesh_xlinear():
    """'mesh-xlinear' resolves to a MeshXLinear instance."""
    created = create_model('mesh-xlinear')

    assert isinstance(created, MeshXLinear)
def test_create_scibert():
    """'scibert' resolves to a BertClassifier configured with scibert weights."""
    created = create_model('scibert')

    assert isinstance(created, BertClassifier)
    assert created.pretrained == 'scibert'
def test_create_tfidf_transformers_svm():
    """'tfidf-transformers-svm' resolves to a TfidfTransformersSVM instance."""
    created = create_model('tfidf-transformers-svm')

    assert isinstance(created, TfidfTransformersSVM)
def test_create_spacy_textclassifier():
    """'spacy-textclassifier' resolves to a SpacyClassifier instance."""
    created = create_model("spacy-textclassifier")

    assert isinstance(created, SpacyClassifier)
def test_create_mesh_cnn():
    """'mesh-cnn' resolves to a MeshCNN instance."""
    created = create_model('mesh-cnn')

    assert isinstance(created, MeshCNN)
def test_create_cnn():
    """'cnn' yields a two-step model: KerasVectorizer followed by CNNClassifier."""
    pipeline = create_model('cnn')
    vectorizer = pipeline.steps[0][1]
    classifier = pipeline.steps[1][1]

    assert isinstance(vectorizer, KerasVectorizer)
    assert isinstance(classifier, CNNClassifier)
def test_create_bilstm():
    """'bilstm' yields a two-step model: KerasVectorizer followed by BiLSTMClassifier."""
    pipeline = create_model('bilstm')
    vectorizer = pipeline.steps[0][1]
    classifier = pipeline.steps[1][1]

    assert isinstance(vectorizer, KerasVectorizer)
    assert isinstance(classifier, BiLSTMClassifier)
def test_create_mesh_tfidf_svm():
    """'mesh-tfidf-svm' resolves to a MeshTfidfSVM instance."""
    created = create_model('mesh-tfidf-svm')

    assert isinstance(created, MeshTfidfSVM)
def test_create_bert():
    """'bert' resolves to a BertClassifier configured with bert-base-uncased weights."""
    created = create_model('bert')

    assert isinstance(created, BertClassifier)
    assert created.pretrained == 'bert-base-uncased'