def optimise_model(x_train, x_val, y_train, y_val, penalty, grid):
    ''' Optimise the model with a grid search, using a hold-out set for
    validation and evaluating with AUROC. '''

    if penalty == 'l2':
        model = OneVsRestClassifier(LogisticRegression(max_iter=2000))

    elif penalty == 'l1':
        model = OneVsRestClassifier(
            LogisticRegression(max_iter=2000,
                               penalty='l1',
                               solver='saga',
                               tol=0.01))  # looser tolerance keeps training time down

    else:
        raise ValueError("penalty must be 'l1' or 'l2', got %r" % penalty)

    # perform grid search
    best_C = grid_search(x_train, x_val, y_train, y_val, model, grid)

    # concatenate training and validation sets
    x = np.concatenate((x_train, x_val))
    y = np.concatenate((y_train, y_val))

    # train final model on all non-test data with optimal C
    model.set_params(estimator__C=best_C)
    start = time.time()
    model.fit(x, y)
    end = time.time()
    train_time = round(end - start, 2)

    return model, train_time
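# The grid_search helper called above is not shown in this example. A minimal
# sketch of what it might look like, reconstructed from the call site (assumes
# the labels form a binary indicator matrix so roc_auc_score can average over
# the one-vs-rest columns; an illustration, not the original implementation):
import numpy as np
from sklearn.metrics import roc_auc_score

def grid_search(x_train, x_val, y_train, y_val, model, grid):
    best_C, best_auroc = None, -np.inf
    for C in grid:
        model.set_params(estimator__C=C)
        model.fit(x_train, y_train)
        auroc = roc_auc_score(y_val, model.predict_proba(x_val))
        if auroc > best_auroc:
            best_C, best_auroc = C, auroc
    return best_C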
Example 2
def refit_model(model_instance, best_params: dict, x_train: list,
                y_train: list, binary: bool):
    """Refit a model on the training data with the best parameters found.

    When `binary` is set, the estimator is wrapped in OneVsRestClassifier
    before the parameters are applied.
    """
    if binary:
        model_instance = OneVsRestClassifier(model_instance)
    model_instance.set_params(**best_params)
    model_instance.fit(x_train, y_train)
    return model_instance
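# Hedged usage sketch: the estimator, parameter values, and toy data below are
# invented for illustration. With binary=True the estimator is wrapped in
# OneVsRestClassifier, so tuned parameters need the estimator__ prefix:
from sklearn.svm import SVC

model = refit_model(SVC(),
                    best_params={"estimator__C": 1.0},
                    x_train=[[0.0, 1.0], [1.0, 0.0], [0.5, 0.4], [0.2, 0.9]],
                    y_train=[0, 1, 1, 0],
                    binary=True)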
Example 3
def optimize(log, filename, progress=False):
    log.info("getting data")
    data, labels = extract_mails.get_training_data(progress)
    log.info("splitting data")
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.4,
                                                        random_state=0)

    log.info("preprocessing data")
    vectorizer = CountVectorizer()
    vectorizer.fit(data)
    X = vectorizer.transform(x_train)
    binarizer = MultiLabelBinarizer()
    binarizer.fit(labels)
    Y = binarizer.transform(y_train)

    # do a gridsearch for the best parameters
    log.info("doing gridsearch... this may take some time")
    pipe = Pipeline([
        ('feature_selection',
         SelectFromModel(RandomForestClassifier(n_estimators=10))),
        ('classification', SVC())
    ])
    clf = OneVsRestClassifier(pipe)
    parameters = {
        "estimator__feature_selection__threshold": ('mean', '0.5*mean', 0),
        "estimator__classification__kernel": ('linear', 'rbf'),
        "estimator__classification__C": (0.01, 0.1, 1, 10, 100)
    }

    grid_search = GridSearchCV(clf,
                               parameters,
                               n_jobs=-1,
                               verbose=1,
                               scoring='f1_samples',
                               error_score=0)
    grid_search.fit(X, Y)

    print(grid_search.best_score_)
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t{0}: {1}".format(param_name, best_parameters[param_name]))

    log.info("evaluating classifier")
    Xt = vectorizer.transform(x_test)
    preds = grid_search.best_estimator_.predict(Xt)
    real = binarizer.transform(y_test)

    print(classification_report(real, preds, target_names=binarizer.classes_))

    # store the best estimator's parameters on the pipeline so that the best
    # configuration can be reused the next time the model is trained
    clf.set_params(**best_parameters)
    atomic_pickle(clf, filename)

    return data, labels
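# atomic_pickle is not defined in this example. A plausible sketch, assuming
# the name means "pickle to a temporary file, then atomically rename it into
# place" (an illustration, not the original implementation):
import os
import pickle
import tempfile

def atomic_pickle(obj, filename):
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(filename) or ".")
    with os.fdopen(fd, "wb") as f:
        pickle.dump(obj, f)
    os.replace(tmp_path, filename)  # atomic rename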
Example 4
def trainPredictorWithWeights(X, Y, ws):
    # sample_weight is a fit-time argument, not an estimator parameter, so it
    # cannot be applied through set_params. It is routed to each per-class
    # SVC instead (scikit-learn >= 1.4 with metadata routing enabled; see the
    # sketch below).
    classif = OneVsRestClassifier(
        SVC(kernel='linear').set_fit_request(sample_weight=True))
    classif.fit(X, Y, sample_weight=ws)
    return classif
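# Hedged usage sketch with toy data (invented for illustration); metadata
# routing must be enabled before the function above is called, and
# scikit-learn >= 1.4 is assumed:
import numpy as np
import sklearn

sklearn.set_config(enable_metadata_routing=True)

X = np.random.rand(30, 4)
Y = np.random.randint(0, 3, size=30)
weights = np.linspace(0.5, 1.5, num=30)
model = trainPredictorWithWeights(X, Y, weights)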
Example 5
def main():
    (trainX, trainY) = readTrainData()
    (first10columnX, scalars) = normalizeData(trainX[:, 0:10])
    clf = OneVsRestClassifier(svm.SVC(max_iter=16000))
    clfbest = clf.set_params(estimator__kernel='rbf',
                             estimator__gamma=1.0,
                             estimator__C=2.0)
    trainX = np.concatenate((first10columnX[:, :], trainX[:, 10:50]), axis=1)
    clfbest.fit(trainX, trainY)

    # Cross-validation could be done manually here, but it is extremely time
    # consuming (at least 90 minutes on a 2013 Mac), so the parameters above
    # were chosen by cross-validation on a much smaller subset of the data.

    (validateX, validateY) = readTrainData("../forest_validation.csv")
    first10validateX = normalizeData(validateX[:, 0:10], scalars)[0]
    validateX = np.concatenate((first10validateX[:, :], validateX[:, 10:50]), axis=1)
    predictY = clfbest.predict(validateX)
    resultfile = open('confusionmatrix_test.txt', 'w')
    resultfile.write(str(confusion_matrix(validateY, predictY)))
    resultfile.write('\n\n\n')
    resultfile.write(metrics.classification_report(validateY, predictY))
    resultfile.close()

    print('accuracy: %.4f' % (sum(predictY == validateY) / float(len(predictY))))

    testX = readTestData()
    first10testX = normalizeData(testX[:, 0:10], scalars)[0]
    testX = np.concatenate((first10testX[:, :], testX[:, 10:50]), axis=1)
    testY = clfbest.predict(testX)
    write2csv(testY, filename="output_SVM.csv")
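# normalizeData is not shown in this example. A sketch consistent with how it
# is called above - it returns the scaled columns plus the scaling statistics,
# and reuses previously computed statistics when they are passed in (an
# illustration, not the original implementation):
import numpy as np

def normalizeData(X, scalars=None):
    if scalars is None:
        scalars = (X.mean(axis=0), X.std(axis=0))
    mean, std = scalars
    return (X - mean) / std, scalars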
Example 6
class ModelsClassical:
    """
    Class for classical ML models
    """
    def __init__(self, type_class='binary', y=None, type_model='lin', **kwargs):
        """
        Выбор модели
            type_class: {'binary', 'multy'}, default='binary'
                Тип классификации - бинарная или многоклассовая
            y: numpy.ndarray, default=None
                Массив целевой переменной. Нобходим для нахождения уникальных классов в многоклассовой классификации.
            type_model: {'lin', 'lgbm', 'svm'}, default='lin'
                Тип модели
            **kwargs
                Гиперпараметры для моделей
        """
        if type_model == 'lin':
            self.model = LogisticRegression(**kwargs)
        elif type_model == 'lgbm':
            self.model = LGBMClassifier(boosting_type='gbdt', n_jobs=20, **kwargs)
        elif type_model == 'svm':
            # Placeholder for now - this method is not implemented
            pass

        if type_class != 'binary' and type_model != 'lgbm':
            self.model = OneVsRestClassifier(self.model)
            self.mlb = MultiLabelBinarizer(classes=sorted(np.unique(y)))
        elif type_class != 'binary' and type_model == 'lgbm':
            self.model.set_params(objective='multiclass', metric='multi_logloss')
        
        self.type_class = type_class
        self.type_model = type_model
        self.kwargs = kwargs
        
    def fit(self, X_train, y_train, X_test=None, y_test=None):
        """
        Обучение модели
            X_train: pandas.DataFrame/numpy.ndarray/scipy.sparse.csr.csr_matrix
                Тренировочный датасет
            X_test: pandas.DataFrame/numpy.ndarray/scipy.sparse.csr.csr_matrix, default=None
                Тестовый датасет
            y_train: pandas.Series/numpy.ndarray
                ЦП соответствующая тренироврочному датасету
            y_test: pandas.Series/numpy.ndarray, default=None
                ЦП соответствующая тестовому датасету
        """
        if self.type_class != 'binary' and self.type_model != 'lgbm':
            self.y_train = self.mlb.fit_transform(y_train.apply(lambda x: [x]))
        
        if self.type_model != 'lgbm':
            self.model.fit(X_train, y_train)
        else:
            self.model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=100, verbose=False)
           
    def predict(self, X, threshold=False):
        """
        Прогнозирование класса
            X: pandas.DataFrame/numpy.ndarray/scipy.sparse.csr.csr_matrix
                Датасет
            threshold:
                Порог для класса
                
            return: numpy.ndarray
                Массив с предсказанными классами
        """
        if threshold:
            def __proba_to_tag(line, thresh):
                """
                Внутреняя функция для превращения вероятности в соответствие классу согласно порогу.
                Пример: есть три класса со следующими вероятностями [0.58, 0.32, 0.1]. Первый класс всегда others, в который падают обычно все неинтересующие текста. Класс самый большой. Алогритм результирующего класса следующий. Если вероятность первого класса (others) больше остальных, то берется максимальная вероятность из всех классов, кроме первого (others). Если этот максимум больше порога, то объект относится к этому классу, иначе относится к первому классу.
                    line: numpy.ndarray
                        Массив вероятностей каждого класса
                    thresh: float
                        Порог
                        
                    return: numpy.ndarray
                """
                max_line = max(line)
                
                if line[0] == max_line:
                    max_prob = max(line[1:])
                    if max_prob > thresh:
                        index_max = np.where(line==max_prob)[0][0]
                    else:
                        index_max = 0
                else:
                    index_max = np.where(line==max_line)[0][0]
                
                for tag in range(len(line)):
                    if tag == index_max:
                        line[tag] = 1
                    else:
                        line[tag] = 0   
                        
                return line.astype(int)
            
            y_pred_proba = self.model.predict_proba(X)
            if self.type_class == 'binary':
                y_final = (y_pred_proba[:,-1] > threshold)*1
            else:
                y_pred = np.apply_along_axis(__proba_to_tag, axis=1, arr=y_pred_proba, thresh=threshold)
                y_final = self.__transform_multyclass_to_one_list(y_pred)
            return y_final
        else:
            return self.model.predict(X)
    
    def predict_proba(self, X):
        """
        Прогнозирование вероятности класса
            X: pandas.DataFrame/numpy.ndarray/scipy.sparse.csr.csr_matrix
                Датасет
                
            return: numpy.ndarray
                Массив с вероятностями для каждого класса
        """
        return self.model.predict_proba(X)
    
    def transform_multy_class(self, y):
        """
        Преобразование массива к необходимому для подбора гиперпараметров виду. 
        Кажый объект массива оборачивается в список.
            y: numpy.ndarray
                Целевая переменная
                
            return: numpy.ndarray
                Массив с классами, обернутыми в список для каждого объекта
        """
        if self.type_model != 'lgbm':
            return self.mlb.fit_transform(y.apply(lambda x: [x]))
    
    def __transform_multyclass_to_one_list(self, y):
        """
        Трансформация массива, состоящего из массивов (единица стоит на индексе предсказанного класса), к массиву, состоящему просто из номеров классов.
            y: numpy.ndarray
                Целевая переменная
                
            return: numpy.ndarray
                Массив из номеров классов
        """
        def __transform_one_line(line):
            """
            Внутренняя функция для трансформации одного объекта массива
                line: numpy.ndarray
                    Отдельный масив
                    
                return: int
            """
            if np.max(line) == 0:
                return 0
            else:
                return np.where(line == 1)[0][0]
        
        return np.apply_along_axis(__transform_one_line, 1, y)
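# Hedged usage sketch (the toy data below is invented for illustration;
# assumes pandas, numpy, and the scikit-learn estimators used above are
# imported):
import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.rand(100, 5))
y = pd.Series(np.random.randint(0, 3, size=100))

clf = ModelsClassical(type_class='multy', y=y.values, type_model='lin')
clf.fit(X, y)
labels = clf.predict(X, threshold=0.4)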
Example 7
class MultiClassPredictions(InteractionPredictions):
    """
    Class for making and storing OneVsRest predictions
    of synergies and antagonisms
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.class_names = ['none', 'antag', 'syn']
        self.one_vs_rest()

    def one_vs_rest(self):
        self.clf = OneVsRestClassifier(self.clf)

    def set_params(self, **kwargs):
        # forwards parameters to the wrapped classifier (e.g. a random forest
        # base estimator)
        self.clf.set_params(**kwargs)
        return self

    def _crossval_iter(self, train, test, cl):
        X_test = self.X[test]
        y_test = self.y[test]
        combs_test = self.combs[test]

        probas_ = self.clf.fit(self.X[train],
                               self.y[train]).predict_proba(X_test)
        # predictions
        pred_dict = {'comb': combs_test}
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        precision = dict()
        recall = dict()
        average_precision = dict()

        n_classes = len(self.class_names)
        for i in range(n_classes):
            pred_dict['score_' + str(self.class_names[i])] = probas_[:, i]
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], probas_[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
            precision[i], recall[i], _ = precision_recall_curve(
                y_test[:, i], probas_[:, i])
            average_precision[i] = average_precision_score(
                y_test[:, i], probas_[:, i])

        self.fpr[cl] = fpr
        self.tpr[cl] = tpr
        self.auc[cl] = roc_auc
        self.precision[cl] = precision
        self.recall[cl] = recall
        self.avprec[cl] = average_precision
        self.predicted[cl] = pd.DataFrame(pred_dict)

        importances = [
            self.clf.estimators_[i].feature_importances_
            for i in range(n_classes)
        ]

        imp_list = [pd.DataFrame({'feat': np.argsort(imp)[::-1][:self.top],
                                  'importance': np.sort(imp)[::-1][:self.top],
                                  'type': i})\
                    for imp, i in zip(importances, self.class_names)]
        imp_df = pd.concat(imp_list, ignore_index=True)
        self.topfeat[cl] = imp_df

    def aggregate_precision(self):
        index = ['AP_' + lab for lab in self.class_names]
        ap_df = (pd.concat({k: pd.DataFrame(v.values(),
                                            index=index).T \
                   for k,v in self.avprec.items()}).
         reset_index().rename(columns={"level_0": "cvfold"}).
         drop(columns=["level_1"]))
        return ap_df

    def aggregate_auc(self):
        index = ['AUCROC_' + lab for lab in self.class_names]
        auc_df = (pd.concat({k: pd.DataFrame(v.values(),
                                             index=index).T \
                   for k,v in self.auc.items()}).
         reset_index().rename(columns={"level_0": "cvfold"}).
         drop(columns=["level_1"]))

        return auc_df

    def plot_ROC(self,
                 figdir=None,
                 fname=None,
                 title='One-vs-Rest ROC curves',
                 sz=10):
        class_names = ['none', 'antagonism', 'synergy']
        colors = cycle(['#808080', '#FFCC33', '#009999'])
        n_classes = 3
        if figdir is not None and fname is not None:
            with PdfPages(figdir + fname + '.pdf') as pdf:
                for cl in list(self.auc.keys()):
                    plt.figure(figsize=(sz, sz))
                    for i, color in zip(range(n_classes), colors):
                        plt.plot(
                            self.fpr[cl][i],
                            self.tpr[cl][i],
                            color=color,
                            lw=2,
                            label='ROC curve of class {0} (area = {1:0.2f})'
                            ''.format(class_names[i], self.auc[cl][i]))

                    plt.plot([0, 1], [0, 1], 'k--', lw=2)
                    plt.xlim([0.0, 1.0])
                    plt.ylim([0.0, 1.05])
                    plt.xlabel('False Positive Rate')
                    plt.ylabel('True Positive Rate')
                    plt.title(title + cl)
                    plt.legend(loc="lower right")
                    pdf.savefig()
                    plt.close()

    def plot_precision(self,
                       figdir=None,
                       fname=None,
                       title='One-vs-Rest Precision-Recall',
                       sz=10):
        class_names = ['none', 'antagonism', 'synergy']
        colors = cycle(['#808080', '#FFCC33', '#009999'])
        n_classes = 3
        if figdir is not None and fname is not None:
            with PdfPages(figdir + fname + '.pdf') as pdf:
                for cl in list(self.avprec.keys()):
                    plt.figure(figsize=(sz, sz))
                    f_scores = np.linspace(0.2, 0.8, num=4)

                    for f_score in f_scores:
                        x = np.linspace(0.01, 1)
                        y_ = f_score * x / (2 * x - f_score)
                        plt.plot(x[y_ >= 0],
                                 y_[y_ >= 0],
                                 color='gray',
                                 alpha=0.2,
                                 label='iso-F1 curves')
                        plt.annotate('f1={0:0.1f}'.format(f_score),
                                     xy=(0.9, y_[45] + 0.02))
                    for i, color in zip(range(n_classes), colors):
                        plt.plot(
                            self.recall[cl][i],
                            self.precision[cl][i],
                            color=color,
                            lw=2,
                            label=
                            'Precision-recall of class {0} (area = {1:0.2f})'
                            ''.format(class_names[i], self.avprec[cl][i]))

                    plt.xlim([0.0, 1.0])
                    plt.ylim([0.0, 1.05])
                    plt.xlabel('Recall')
                    plt.ylabel('Precision')
                    plt.title(title + cl)
                    plt.legend(loc="lower right")
                    pdf.savefig()
                    plt.close()

    def save_metrics(self, outdir=None, fname=None):
        auc_df = self.aggregate_auc()
        ap_df = self.aggregate_precision()
        metrics = pd.merge(auc_df, ap_df, on='cvfold', how='inner')

        if outdir is not None and fname is not None:
            metrics.to_csv(outdir + fname + '.tsv', sep="\t", index=False)
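# The per-class metric loop in _crossval_iter above follows the standard
# one-vs-rest pattern from scikit-learn. A minimal standalone sketch with
# invented data (binary indicator labels, one column per class):
import numpy as np
from sklearn.metrics import roc_curve, auc

y_test = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]])
probas_ = np.array([[0.7, 0.2, 0.1], [0.3, 0.5, 0.2],
                    [0.2, 0.2, 0.6], [0.6, 0.3, 0.1]])

for i in range(y_test.shape[1]):
    fpr, tpr, _ = roc_curve(y_test[:, i], probas_[:, i])
    print("class %d: AUC = %.2f" % (i, auc(fpr, tpr)))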
Example 8
def run_classifier(clf, features, cases, bottom_inds, optimize_hyperparams=False):
    clf_name = clf.__class__.__name__
    cases = np.array(cases)
    # Set up the cross_validation study
    if clf_name == 'CollaborativeFilter':
        cases = np.array(preprocess_recommendations(cases))
        cy = [c[2] for c in cases]
        cases = np.array(cases)
        m_ind=cases[:,1]
    else:
        cy = cases[:,0]        
        
    # Pre-Run Hyperparameter Optimization
    if optimize_hyperparams:
        param_dist = hyper_params[clf_name]
    else:
        opt_param_dist = optimal_params[clf_name]
    num_iterations = 1 if optimize_hyperparams else 100
    # note: cross_validation.StratifiedShuffleSplit (with the y/n_iter
    # arguments) is the pre-0.18 scikit-learn API; newer releases provide
    # sklearn.model_selection.StratifiedShuffleSplit instead
    shuffle = cross_validation.StratifiedShuffleSplit(y=cy,
                                                      n_iter=num_iterations,
                                                      test_size=0.1,
                                                      random_state=None)
    scores = []; Y_pred = []; Y_true = []; m_test_inds = []
    # Run study
    for i, (train_index, test_index) in enumerate(shuffle):
        # Separate training/test set
        if (i % 10) == 0:
            print('  CV#%d of %d...' % (i, num_iterations))
        Y_train, Y_test = (cases[train_index],cases[test_index])
        
        # Fit and predict using the models
        if clf_name == 'CollaborativeFilter':
            # Split the training data into X and y vectors
            #Y_train = rebalance_cases(Y_train)
            X_train = Y_train[:,:-1]
            Y_train = Y_train[:,-1]
            X_test = Y_test[:,:-1]
            Y_test = Y_test[:,-1]
            m_test_ind = m_ind[test_index]
            m_test_inds.append(m_test_ind)
            
            if optimize_hyperparams:
                # Run Parameter Search
                n_iter_search = 2
                random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                                   n_iter=n_iter_search,
                                                   scoring='average_precision',
                                                   n_jobs=3,
                                                   refit=True,
                                                   cv=4,
                                                   verbose=1
                                                   )
                start = time()
                random_search.fit(X_train,Y_train)
                print("RandomizedSearchCV took %.2f minutes for %d candidates"
                      " parameter settings." % ((time() - start)/60.0, n_iter_search))
                opt_report(random_search.grid_scores_,n_top=10)
                Y_hat=random_search.best_estimator_.predict_proba(X_test)
            else:
                clf.set_params(**opt_param_dist)
                clf.fit(X_train,Y_train)
                Y_hat=clf.predict_proba(X_test)
        else:
            X_train, X_test = (features[train_index],features[test_index])
            ovr = OneVsRestClassifier(clf)
            if optimize_hyperparams:
                n_iter_search = 400
                random_search = RandomizedSearchCV(ovr, param_distributions=param_dist,
                                                   n_iter=n_iter_search,
                                                   # Average precisions scoring
                                                   # doesn't seem to work in
                                                   # multi-label case
                                                   #scoring='average_precision',
                                                   scoring='log_loss',
                                                   n_jobs=3,
                                                   refit=True,
                                                   cv=4,
                                                   verbose=1
                                                   )
                start = time()
                random_search.fit(X_train,Y_train)
                print("RandomizedSearchCV took %.2f minutes for %d candidates"
                      " parameter settings." % ((time() - start)/60.0, n_iter_search))
                opt_report(random_search.grid_scores_,n_top=10)
                #clf = random_search.best_estimator_
                Y_hat=random_search.best_estimator_.predict_proba(X_test)
            else:
                ovr.set_params(**opt_param_dist)
                ovr.fit(X_train,Y_train)
                Y_hat = ovr.predict_proba(X_test)
            #Y_hat = clf.predict_proba(X_test)
        # Collect the results
        Y_pred.append(Y_hat)
        Y_true.append(Y_test)
    Y_true=np.vstack(Y_true)
    Y_pred=np.vstack(Y_pred)
    
    
    # Now do the overall AUC scoring
    print('Generating bootstrap samples...')
    A=np.vstack([Y_true.flatten(),Y_pred.flatten()])
    A=A.transpose()
    auc_scores=[]
    for j in range(1000):
        B=resample(A)
        auc_scores.append(average_precision_score(B[:,0], B[:,1]))
    auc_scores=np.array(auc_scores)
    # Now just test PR on the k least popular methods
    if re.search('CollaborativeFilter',clf_name):
        m_test_inds=np.vstack(m_test_inds)
        m_test_ind = m_test_inds.flatten()
        ix=np.in1d(m_test_ind.ravel(), bottom_inds).reshape(m_test_ind.shape)
        A = np.vstack([Y_true.flatten()[ix],Y_pred.flatten()[ix]])
    else:
        A=np.vstack([Y_true[:,bottom_inds].flatten(),Y_pred[:,bottom_inds].flatten()])
    A=A.transpose()
    bottom_k_auc_scores=[]
    for j in range(1000):
        B=resample(A)
        bottom_k_auc_scores.append(average_precision_score(B[:,0], B[:,1]))
    bottom_k_auc_scores=np.array(bottom_k_auc_scores)
    
    return Y_pred, Y_true, auc_scores, bottom_k_auc_scores
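# The 1000 bootstrap average-precision scores returned above can be summarized
# with a percentile confidence interval. A short sketch (the 95% level is an
# illustrative choice):
import numpy as np

def bootstrap_ci(scores, level=95):
    lo = (100 - level) / 2.0
    return np.percentile(scores, [lo, 100 - lo])

# e.g. low, high = bootstrap_ci(auc_scores)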
Example 9
class MeshTfidfSVM:
    def __init__(
        self, y_batch_size=256, nb_labels=None, model_path=None, threshold=0.5
    ):
        """
        y_batch_size: int, default 256. Size of column batches for Y i.e. tags that each classifier will train on
        nb_labels: int, default None. Number of tags that will be trained.
        model_path: path, default None. Model path being used to save intermediate classifiers
        threshold: float, default 0.5. Threshold probability on top of which a tag is assigned

        Note that model_path needs to be provided as it is used to save
        intermediate classifiers trained to reduce memory usage.
        """
        self.y_batch_size = y_batch_size
        self.model_path = model_path
        self.nb_labels = nb_labels
        self.threshold = threshold

    def _init_vectorizer(self):
        self.vectorizer = TfidfVectorizer(
            stop_words="english", max_df=0.95, min_df=5, ngram_range=(1, 1)
        )

    def _init_classifier(self):
        # note: the default hinge loss does not expose predict_proba; set the
        # loss to 'modified_huber' (or the logistic loss) via set_params
        # before calling predict/predict_proba
        self.classifier = OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l2"))

    def set_params(self, **params):
        if not hasattr(self, "vectorizer"):
            self._init_vectorizer()
        if not hasattr(self, "classifier"):
            self._init_classifier()

        tfidf_params = get_params_for_component(params, "tfidf")
        svm_params = get_params_for_component(params, "svm")
        self.vectorizer.set_params(**tfidf_params)
        self.classifier.set_params(**svm_params)
        # TODO: Create function that checks in params for arguments available in init
        if "model_path" in params:
            self.model_path = params["model_path"]
        if "y_batch_size" in params:
            self.y_batch_size = params["y_batch_size"]
        if "nb_labels" in params:
            self.nb_labels = params["nb_labels"]

    def fit(self, X, Y):
        """
        X: list of texts
        Y: sparse csr_matrix of tags assigned
        """
        if not hasattr(self, "vectorizer"):
            self._init_vectorizer()
        if not hasattr(self, "classifier"):
            self._init_classifier()

        # TODO: Currently Y is expected to be sparse, otherwise predict does not
        # work, add a check and warn user.

        print(f"Creating {self.model_path}")
        Path(self.model_path).mkdir(exist_ok=True)
        print("Fitting vectorizer")
        self.vectorizer.fit(X)
        with open(f"{self.model_path}/vectorizer.pkl", "wb") as f:
            f.write(pickle.dumps(self.vectorizer))
        print("Training model")
        self.nb_labels = Y.shape[1]
        for tag_i in range(0, self.nb_labels, self.y_batch_size):
            print(tag_i)
            X_vec = self.vectorizer.transform(X)
            self.classifier.fit(X_vec, Y[:, tag_i : tag_i + self.y_batch_size])

            # TODO: Sparsify weights before saving
            with open(f"{self.model_path}/{tag_i}.pkl", "wb") as f:
                f.write(pickle.dumps(self.classifier))

        return self

    def predict(self, X):
        return self.predict_proba(X) > self.threshold

    def predict_proba(self, X):
        Y_pred_proba = []
        X_vec = self.vectorizer.transform(X)  # transform once, reuse for every batch
        for tag_i in range(0, self.nb_labels, self.y_batch_size):
            with open(f"{self.model_path}/{tag_i}.pkl", "rb") as f:
                classifier = pickle.loads(f.read())

            Y_pred_proba_batch = classifier.predict_proba(X_vec)
            Y_pred_proba.append(Y_pred_proba_batch)

        Y_pred_proba = np.hstack(Y_pred_proba)
        return Y_pred_proba

    def save(self, model_path):
        if model_path != self.model_path:
            print(
                f"{model_path} is different from self.model_path {self.model_path}. "
                "This will result in the model and meta.json being saved in different paths."
            )

        meta = {
            "name": "MeshTfidfSVM",
            "approach": "mesh-tfidf-svm",
            "y_batch_size": self.y_batch_size,
            "nb_labels": self.nb_labels,
        }
        meta_path = os.path.join(model_path, "meta.json")
        with open(meta_path, "w") as f:
            f.write(json.dumps(meta))

    def load(self, model_path):
        vectorizer_path = os.path.join(model_path, "vectorizer.pkl")
        with open(vectorizer_path, "rb") as f:
            self.vectorizer = pickle.loads(f.read())

        meta_path = os.path.join(model_path, "meta.json")
        with open(meta_path, "r") as f:
            meta = json.loads(f.read())
        self.set_params(**meta)

        self.model_path = model_path
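# Hedged usage sketch (texts, tags, and the model directory below are invented
# for illustration). Y must be a sparse indicator matrix, as the fit docstring
# notes, and the loss is switched to one that supports predict_proba; the
# svm__ prefix assumes get_params_for_component strips component prefixes:
import numpy as np
from scipy.sparse import csr_matrix

texts = ["malaria vaccine trial results",
         "deep learning for protein folding"] * 10
tags = csr_matrix(np.array([[1, 0, 1, 0], [0, 1, 0, 1]] * 10))

model = MeshTfidfSVM(y_batch_size=2, model_path="mesh_svm_model")
model.set_params(svm__estimator__loss="modified_huber")
model.fit(texts, tags)
predictions = model.predict(texts)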