Example #1
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.svm import SVC, SVR


def SVM(X, y, X_ind, y_ind, is_reg=False):
    """Cross Validation and independent set test for Support Vector Machine (SVM)

    Arguments:
        X (ndarray): Feature data of the training and validation set for cross-validation.
                     m x n matrix, where m is the No. of samples and n is the No. of features.
        y (ndarray): Label data of the training and validation set for cross-validation.
                     m-dimensional vector, where m is the No. of samples.
        X_ind (ndarray): Feature data of the independent test set.
                         It has the same structure as X.
        y_ind (ndarray): Label data of the independent test set.
                         It has the same structure as y.
        is_reg (bool, optional): Build a regression model (SVR) if True, otherwise a
                                 classification model (SVC). (Default: False)

    Returns:
         cvs (ndarray): Cross-validation predictions, shape (m,), where m is the No. of samples.
         inds (ndarray): Independent-test predictions averaged over the 5 folds. Same structure as cvs.
    """
    if is_reg:
        folds = KFold(5).split(X)
        model = SVR()
    else:
        folds = StratifiedKFold(5).split(X, y)
        model = SVC(probability=True)
    cvs = np.zeros(y.shape)
    inds = np.zeros(y_ind.shape)
    # coarse 2-point grid over the exponent extremes; use np.arange for a denser search
    gs = GridSearchCV(model, {
        'C': 2.0**np.array([-5, 15]),
        'gamma': 2.0**np.array([-15, 5])
    },
                      n_jobs=5)
    gs.fit(X, y)
    params = gs.best_params_
    print(params)
    for train_idx, valid_idx in folds:
        # rebuild the estimator that matches the task, using the tuned hyperparameters
        if is_reg:
            model = SVR(C=params['C'], gamma=params['gamma'])
        else:
            model = SVC(probability=True, C=params['C'], gamma=params['gamma'])
        model.fit(X[train_idx], y[train_idx])
        if is_reg:
            cvs[valid_idx] = model.predict(X[valid_idx])
            inds += model.predict(X_ind)
        else:
            cvs[valid_idx] = model.predict_proba(X[valid_idx])[:, 1]
            inds += model.predict_proba(X_ind)[:, 1]
    return cvs, inds / 5
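
A minimal sketch of how this function might be called, assuming a synthetic dataset built with sklearn's make_classification (the split into training and independent sets is illustrative):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples=200, n_features=20, random_state=0)
X, X_ind, y, y_ind = train_test_split(X_all, y_all, test_size=0.2, random_state=0)
cvs, inds = SVM(X, y, X_ind, y_ind, is_reg=False)
print(cvs.shape, inds.shape)  # per-sample CV probabilities, fold-averaged test probabilities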
Example #2
from joblib import dump, load
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, SVR


class MySVM:
    def __init__(self, name, data, problem_type, load_from_file=False):
        self.name = name
        self._unpack_data(data)
        self.problem_type = problem_type

        if load_from_file:
            self._load_model(self.name)
        else:
            if problem_type == 'classification':
                self.model = SVC(probability=True)
            else:
                self.model = SVR()

    def find_best_model(self, param_grid, save=False):
        search = GridSearchCV(self.model,
                              param_grid=param_grid,
                              cv=10,
                              verbose=1,
                              n_jobs=-1)
        search.fit(self.X_train, self.y_train)

        print(search.best_params_)
        self.model = search.best_estimator_

        if save:
            print("saving model")
            dump(self.model, f'models/{self.name}.joblib')

        return search

    def train(self, X, y):
        self.model.fit(X, y)

    def predict(self, X):
        if self.problem_type == "classification":
            return self.model.predict_proba(X)
        elif self.problem_type == "regression":
            return self.model.predict(X).reshape(-1, 1)

    def _unpack_data(self, data):
        self.X_train = data[0]
        self.y_train = data[1]
        self.X_test = data[2]
        self.y_test = data[3]

    def _load_model(self, name):
        self.model = load(f'models/{name}.joblib')
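
A hedged usage sketch, assuming data arrives as a (X_train, y_train, X_test, y_test) tuple and that a models/ directory exists if save=True is used:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
split = train_test_split(X, y, test_size=0.25, random_state=0)
# train_test_split returns (X_train, X_test, y_train, y_test); reorder to match _unpack_data
data = (split[0], split[2], split[1], split[3])

svm = MySVM('demo', data, 'classification')
svm.find_best_model({'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']})
probs = svm.predict(data[2])  # class probabilities for the test set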
Example #3
# continuation of an earlier notebook cell: `model` and `Xnew` are assumed
# to have been defined above
ynew = model.predict(Xnew)
print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))

# In[ ]:

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in recent sklearn
# generate 2d classification dataset
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
# fit final model
model = LogisticRegression()
model.fit(X, y)
# new instances where we do not know the answer
Xnew, _ = make_blobs(n_samples=3, centers=2, n_features=2, random_state=1)
# make a prediction
ynew = model.predict_proba(Xnew)
# show the inputs and predicted probabilities
for i in range(len(Xnew)):
    print("X=%s, Predicted=%s" % (Xnew[i], ynew[i]))

# In[15]:

#creating model
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import joblib  # sklearn.externals.joblib was removed in recent sklearn

dataframe = pandas.read_csv(
    "/home/seethalprince/cdc/CDC_Intern/Dataset/Data1.csv")
array = dataframe.values
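
The cell stops after loading the CSV into an array; a minimal sketch of how the model creation might continue, assuming the last column of Data1.csv holds the label (the column layout is an assumption):

X = array[:, :-1]  # assumed: all but the last column are features
Y = array[:, -1]   # assumed: the last column is the label
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.3, random_state=7)
model = LogisticRegression()
model.fit(X_train, Y_train)
joblib.dump(model, 'finalized_model.sav')  # persist the fitted model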
Example #4
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.svm import SVC, SVR

# the featurizers and data helpers are project-specific; the import paths below
# are assumptions based on the cddd and molbert packages:
# from cddd.inference import InferenceModel
# from molbert.utils.featurizer.molbert_featurizer import MolBertFeaturizer
# from molbert.utils.featurizer.molfeaturizer import MorganFPFeaturizer, PhysChemFeaturizer
# get_data and batchify are assumed to be defined elsewhere in the project


def cv(dataset, summary_df, cddd_model_dir, molbert_model_dir):
    df, indices = get_data(dataset)
    cddd = InferenceModel(cddd_model_dir)  # type: ignore
    molbert = MolBertFeaturizer(molbert_model_dir,
                                embedding_type='average-1-cat-pooled',
                                max_seq_len=200,
                                device='cpu')  # type: ignore
    ecfp = MorganFPFeaturizer(fp_size=2048,
                              radius=2,
                              use_counts=True,
                              use_features=False)
    rdkit_norm = PhysChemFeaturizer(normalise=True)

    cddd_fn = lambda smiles: cddd.seq_to_emb(smiles)
    molbert_fn = lambda smiles: molbert.transform(smiles)[0]
    ecfp_fn = lambda smiles: ecfp.transform(smiles)[0]
    rdkit_norm_fn = lambda smiles: rdkit_norm.transform(smiles)[0]

    for i, (train_idx, valid_idx, test_idx) in enumerate(indices):
        train_df = df.iloc[train_idx]
        valid_df = df.iloc[valid_idx]

        # combine train and valid set as SVMs don't use a validation set, but NNs do.
        # this way they use the same amount of data.
        train_df = pd.concat([train_df, valid_df])
        test_df = df.iloc[test_idx]

        fn_combos = [('cddd', cddd_fn), ('molbert', molbert_fn),
                     ('ECFP4', ecfp_fn), ('rdkit_norm', rdkit_norm_fn)]

        for feat_name, feat_fn in fn_combos:
            train_features = np.vstack([
                feat_fn(batch) for batch in batchify(train_df['SMILES'], 256)
            ])
            train_labels = train_df[df.columns[-1]]

            test_features = np.vstack(
                [feat_fn(batch) for batch in batchify(test_df['SMILES'], 256)])
            test_labels = test_df[df.columns[-1]]

            mode = summary_df.loc[summary_df['task_name'] == dataset,
                                  'task_type'].iloc[0].strip()

            np.random.seed(i)
            if mode == 'regression':
                model = SVR(C=5.0)
            elif mode == 'classification':
                model = SVC(C=5.0, probability=True)
            else:
                raise ValueError(
                    f'Mode has to be either classification or regression but was {mode}.')

            model.fit(train_features, train_labels)

            predictions = model.predict(test_features)

            if mode == 'classification':
                # predict probabilities (needed for some metrics) and get probs of positive class ([:, 1])
                prob_predictions = model.predict_proba(test_features)[:, 1]
                metrics_dict = {
                    'AUROC': lambda: metrics.roc_auc_score(test_labels, prob_predictions),
                    'AveragePrecision': lambda: metrics.average_precision_score(test_labels, prob_predictions),
                    'Accuracy': lambda: metrics.accuracy_score(test_labels, predictions),
                }
            else:
                metrics_dict = {
                    'MAE': lambda: metrics.mean_absolute_error(test_labels, predictions),
                    'RMSE': lambda: np.sqrt(metrics.mean_squared_error(test_labels, predictions)),
                    'MSE': lambda: metrics.mean_squared_error(test_labels, predictions),
                    'R2': lambda: metrics.r2_score(test_labels, predictions),
                }

            metric_values = {}
            for name, callable_metric in metrics_dict.items():
                try:
                    metric_values[name] = callable_metric()
                except Exception as e:
                    print(f'unable to calculate {name} metric')
                    print(e)
                    metric_values[name] = np.nan

            # note: datetime.now() is evaluated inside the loop, so each
            # fold/featurizer combination writes to its own timestamped directory
            default_path = os.path.join(
                './logs/',
                datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ'))
            output_dir = os.path.join(default_path, dataset, str(i))
            os.makedirs(output_dir, exist_ok=True)
            with open(os.path.join(output_dir, f'{feat_name}_metrics.json'),
                      'w+') as fp:
                json.dump(metric_values, fp)
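
A sketch of the expected inputs, assuming summary_df carries one row per task with task_name and task_type columns (the names used in the lookup above); the dataset name and model paths are placeholders:

summary_df = pd.DataFrame({'task_name': ['esol'], 'task_type': ['regression']})
cv('esol',
   summary_df,
   cddd_model_dir='/path/to/cddd/default_model',
   molbert_model_dir='/path/to/molbert/checkpoint')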
Example #5
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# `args` is assumed to be an argparse.Namespace with `k` and `myModel`
# attributes, parsed elsewhere in the script


class Log_reg():  # despite the name, this class wraps an SVM classifier
    def __init__(self):
        # SVC (not SVR): predict_proba and the classification metrics below
        # require a probabilistic classifier
        self.log_reg = SVC(kernel='rbf',
                           degree=3,
                           gamma='auto',
                           coef0=0.0,
                           tol=1e-3,
                           C=1.0,
                           probability=True,
                           shrinking=True,
                           cache_size=200,
                           verbose=False,
                           max_iter=-1)

        self.ss = StandardScaler()

    def read_features_csv(self, file, My_Model):
        # note: the first row is treated as the header, so it is not read as data
        df = pd.read_csv(file)
        columns_size = df.columns.size
        if My_Model == 0:
            X = df.iloc[:, 1:columns_size - args.k - 2]
        else:
            X = pd.concat([
                df.iloc[:, 1:columns_size - 2 * args.k - 2],
                df.iloc[:, columns_size - args.k - 2:-2]
            ],
                          axis=1)
        Y = df.iloc[:, -2]
        return X, Y

    def fit_logistic_regression(self, train_file):
        # fit the scaler on the training features, then train the model
        x, y = self.read_features_csv(train_file, args.myModel)
        x = self.ss.fit_transform(x)
        self.log_reg.fit(x, y)

    def predict_test(self, test_file):
        x, y = self.read_features_csv(test_file, args.myModel)
        # use transform (not fit_transform) so the test set reuses the
        # scaling fitted on the training data
        x = self.ss.transform(x)
        y_p_pre = self.log_reg.predict_proba(x)
        y_pre = self.log_reg.predict(x)
        accuracy = accuracy_score(y, y_pre)
        precision = precision_score(y, y_pre)
        recall = recall_score(y, y_pre)
        F1 = f1_score(y, y_pre)
        scores = self.log_reg.score(x, y)
        print("accuracy, precision, recall, F1:", accuracy, precision, recall,
              F1)
        print("scores:", scores)
        return y_p_pre, y
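
A hedged usage sketch, assuming CSV files with the layout read_features_csv expects and an args namespace as described above (file names are placeholders):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--k', type=int, default=3)
parser.add_argument('--myModel', type=int, default=0)
args = parser.parse_args([])  # use defaults here; pass real CLI args in a script

clf = Log_reg()
clf.fit_logistic_regression('train_features.csv')
probabilities, labels = clf.predict_test('test_features.csv')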