Example #1
0
    def predict(self, X):
        """Predict one target value per row of ``X``.

        For rows whose ``target_id`` was seen during ``fit``, the estimate is
        a k-nearest-neighbour aggregate over cosine similarity of ECFP
        fingerprints stored for that target. For unseen targets, the label of
        the single most similar compound across the whole training set is
        used as a fallback.

        :param X: DataFrame with a 'target_id' column plus the columns
            consumed by ECFPEncoder
        :return: numpy array of shape (len(X), 1)
        :raises ValueError: if ``self.weights`` is not a supported mode
        """
        y_preds = []
        # Copy and reindex so positional access into the sparse matrix below
        # lines up with iterrows(), without mutating the caller's frame.
        X_copy = X.copy()
        X_copy.index = range(len(X_copy))
        ecfp_transformer = ECFPEncoder(radius=self.radius,
                                       dim=self.ecfp_dim,
                                       sparse_output=True)
        sparse_ecfp = ecfp_transformer.transform(X_copy)
        for i, row in X_copy.iterrows():
            target_id = row['target_id']
            if target_id in self.store:
                # Known target: rank that target's stored fingerprints by
                # cosine similarity and keep the k most similar.
                known_ecfps = self.store[target_id][0]
                sim = cosine_similarity(known_ecfps, sparse_ecfp[i])[:, 0]
                sorted_indexes = np.argsort(sim, axis=0)
                top_k = sorted_indexes[-self.k:]
                ys = self.store[target_id][1][top_k]
                if self.weights == 'uniform':
                    y_preds.append(np.mean(ys))
                elif self.weights == 'average':
                    y_preds.append(np.average(ys, weights=sim[top_k]))
                else:
                    # BUG FIX: an unknown mode used to skip the row silently,
                    # misaligning y_preds with the input rows. Fail fast.
                    raise ValueError(
                        "Unsupported weights mode: {!r}".format(self.weights))
            else:
                # Unknown target: fall back to the most similar compound over
                # the full training set.
                ecfp = ecfp_transformer.transform(row.to_frame().T)
                sim = cosine_similarity(ecfp, self.full_ecfp)[0]
                y_preds.append(self.full_y[np.argmax(sim)])

        return np.array(y_preds).reshape((len(y_preds), 1))
Example #2
0
    def fit(self, X, y):
        """Fit the estimator by grouping training fingerprints per target.

        Builds ``self.store`` mapping each target_id to a pair
        ``(fingerprint matrix, y values)``, plus full-dataset fingerprints
        used by ``predict`` as a fallback for unseen targets.

        :param X: DataFrame with a 'target_id' column plus the columns
            consumed by ECFPEncoder
        :param y: array-like of target values; flattened to 1-D
        :return: self
        """
        y = y.reshape(-1)
        # BUG FIX: reindexing previously mutated the caller's DataFrame in
        # place; work on a copy, as predict() already does.
        X = X.copy()
        X.index = range(len(X))
        self.store = {}
        ecfp_transformer = ECFPEncoder(radius=self.radius,
                                       dim=self.ecfp_dim,
                                       sparse_output=True)
        # group.index is positional (thanks to the reindex above), so it can
        # index directly into the flattened y.
        for target_id, group in X.groupby('target_id'):
            self.store[target_id] = (ecfp_transformer.fit_transform(group),
                                     y[group.index])

        # Full-dataset fingerprints back the unseen-target fallback in
        # predict().
        self.full_ecfp = ecfp_transformer.transform(X)
        self.full_y = y

        return self
Example #3
0
class TestECFPEncoder(unittest.TestCase):
    """Tests for ECFPEncoder against two fixed InChI inputs."""

    transformer = ECFPEncoder(radius=4)

    def get_X(self):
        """Build the two-row input DataFrame of InChI strings."""
        return pd.DataFrame({
            'standard_inchi': [
                "InChI=1S/CO2/c2-1-3",
                "InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-10(12)13/h2-6,11H,1H3,(H,12,13)/b5-3+",
            ]
        })

    def test_transform(self):
        """Dense transform appends an 'ecfp_encoding' column of bit lists."""
        transformed = self.transformer.fit_transform(self.get_X())
        first_bits = [633848, 899457, 899746, 916106]
        second_bits = [
            1773, 9728, 20034, 57369, 57588, 78979, 88049, 95516, 107971,
            123721, 134214, 167638, 204359, 349540, 356383, 378749, 390288,
            397092, 431546, 435051, 439248, 459409, 495384, 515018, 528633,
            529834, 547430, 614225, 624875, 635687, 647863, 650023, 650051,
            654006, 678945, 726962, 830972, 846213, 874176, 911985, 916106,
            923641, 942272
        ]
        expected = pd.DataFrame({
            'standard_inchi': [
                "InChI=1S/CO2/c2-1-3",
                "InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-10(12)13/h2-6,11H,1H3,(H,12,13)/b5-3+",
            ],
            'ecfp_encoding': [first_bits, second_bits],
        })
        pd.testing.assert_frame_equal(transformed, expected)

    def test_transform_with_sparse_output(self):
        """Sparse transform sets exactly the expected (row, col) nonzeros."""
        sparse_transformer = ECFPEncoder(radius=4, sparse_output=True)
        Xt = sparse_transformer.fit_transform(self.get_X())
        # 4 bits for the first molecule (row 0), 43 for the second (row 1).
        expected_rows = np.array([0] * 4 + [1] * 43, dtype=np.int32)
        expected_cols = np.array([
            633848, 899457, 899746, 916106, 1773, 9728, 20034, 57369, 57588,
            78979, 88049, 95516, 107971, 123721, 134214, 167638, 204359,
            349540, 356383, 378749, 390288, 397092, 431546, 435051, 439248,
            459409, 495384, 515018, 528633, 529834, 547430, 614225, 624875,
            635687, 647863, 650023, 650051, 654006, 678945, 726962, 830972,
            846213, 874176, 911985, 916106, 923641, 942272
        ], dtype=np.int32)
        actual_rows, actual_cols = Xt.nonzero()
        np.testing.assert_array_equal(expected_rows, actual_rows)
        np.testing.assert_array_equal(expected_cols, actual_cols)
Example #4
0
 def get_steps(self,
               kmer_size=3,
               radius=2,
               ecfp_dim=2**10,
               embedding_dim=10,
               lr=0.1,
               max_epochs=5,
               device=None,
               train_split=None,
               optimizer=SGD,
               weight_decay=0,
               dropout=0):
     """
     This pipeline is a neural net baseline using sparsed input fingerprints for both the compound (ecfp) and the
     enzyme (k-mers).

     :param kmer_size: The k-mer size used for the enzyme's descriptor
     :param radius: The radius used in ecfp
     :param ecfp_dim: The dimension of the byte space used by ecfp algorithm
     :param embedding_dim: Both enzyme and compounds are embedded in the neural net in a space of the same size
     :param lr: the neural net base learning rate
     :param max_epochs: Maximum number of epochs to run
     :param device: The device on which computation will take place.
         NOTE(review): the default ``None`` is forwarded to ``torch.device(device)``
         below — confirm a concrete device string is always supplied by callers.
     :param train_split: if None, no internal cross validation is made, else a skorch CVSplit object
     :param optimizer: torch optimizer class handed to the skorch net (default SGD)
     :param weight_decay: weight-decay value forwarded to the optimizer
     :param dropout: dropout rate forwarded to the Baseline module
     :return: list of (name, step) pairs suitable for sklearn.pipeline.Pipeline
     """
     kmers_counter = KmersCounter(kmer_size=kmer_size)
     # Size of the k-mer vocabulary: one dimension per possible k-mer.
     num_kmers = NB_AMINO_ACID**kmer_size
     # Collate turns batches into sparse tensors sized for both inputs,
     # placed on the requested device.
     collate_fn = partial(collate_to_sparse_tensors,
                          protein_input_size=num_kmers,
                          compound_input_size=ecfp_dim,
                          device=torch.device(device))
     net = NeuralNetRegressor(module=Baseline,
                              module__num_kmers=num_kmers,
                              module__num_fingerprints=ecfp_dim,
                              module__embedding_dim=embedding_dim,
                              module__dropout=dropout,
                              max_epochs=max_epochs,
                              lr=lr,
                              optimizer=optimizer,
                              optimizer__weight_decay=weight_decay,
                              device=device,
                              iterator_train__collate_fn=collate_fn,
                              iterator_train__shuffle=True,
                              iterator_valid__collate_fn=collate_fn,
                              train_split=train_split)
     # Encode proteins and compounds, bundle both encodings into the dict
     # input expected by the net, then regress.
     return [('encode_proteins', kmers_counter),
             ('encode_ecfp', ECFPEncoder(radius=radius, dim=ecfp_dim)),
             ('to_dict',
              DfToDict({
                  'protein_input': 'kmers_counts',
                  'compound_input': 'ecfp_encoding'
              })), ('baseline_net', net)]
Example #5
0
 def test_transform_with_sparse_output(self):
     """Sparse transform must set exactly the expected (row, col) nonzeros."""
     sparse_transformer = ECFPEncoder(radius=4, sparse_output=True)
     Xt = sparse_transformer.fit_transform(self.get_X())
     # 4 bits for the first molecule (row 0), 43 for the second (row 1).
     expected_rows = np.array([0] * 4 + [1] * 43, dtype=np.int32)
     expected_cols = np.array([
         633848, 899457, 899746, 916106, 1773, 9728, 20034, 57369, 57588,
         78979, 88049, 95516, 107971, 123721, 134214, 167638, 204359,
         349540, 356383, 378749, 390288, 397092, 431546, 435051, 439248,
         459409, 495384, 515018, 528633, 529834, 547430, 614225, 624875,
         635687, 647863, 650023, 650051, 654006, 678945, 726962, 830972,
         846213, 874176, 911985, 916106, 923641, 942272
     ], dtype=np.int32)
     actual_rows, actual_cols = Xt.nonzero()
     np.testing.assert_array_equal(expected_rows, actual_rows)
     np.testing.assert_array_equal(expected_cols, actual_cols)
Example #6
0
 def get_steps(self,
               kmer_size=3,
               radius=2,
               ecfp_dim=2**10,
               alpha=0,
               device=None):
     """Build pipeline steps: sparse k-mer and ECFP features feeding a
     ridge regression.

     :param kmer_size: k-mer size for the protein descriptor
     :param radius: ECFP radius
     :param ecfp_dim: dimension of the ECFP bit space
     :param alpha: Ridge regularization strength
     :param device: unused here; kept for a uniform get_steps signature
     :return: list of (name, step) pairs for sklearn.pipeline.Pipeline
     """
     protein_encoder = KmersCounter(kmer_size=kmer_size, sparse_output=True)
     compound_encoder = ECFPEncoder(radius=radius,
                                    dim=ecfp_dim,
                                    sparse_output=True)
     # Concatenate both sparse encodings side by side, in parallel.
     features = FeatureUnion(n_jobs=-1,
                             transformer_list=[
                                 ('encode_proteins', protein_encoder),
                                 ('encode_ecfp', compound_encoder)
                             ])
     return [('sparse_encoding', features),
             ('linear_regression', Ridge(alpha=alpha))]
Example #7
0
    def get_steps(self,
                  kmer_size=3,
                  radius=2,
                  ecfp_dim=2**20,
                  hidden_size=10,
                  mlp_sizes=(10, ),
                  embedding_dim=10,
                  max_epochs=10,
                  lr=1,
                  optimizer=SGD,
                  device=None,
                  train_split=None,
                  weight_decay=0,
                  lstm_dropout=0):
        """
        Build pipeline steps for a siamese BiLSTM over k-mer encodings
        combined with compound fingerprints.

        :param kmer_size: k-mer size used by KmerEncoder for the protein
        :param radius: ECFP radius
        :param ecfp_dim: dimension of the ECFP bit space
        :param hidden_size: LSTM hidden size forwarded to the module
        :param mlp_sizes: sizes of the MLP layers forwarded to the module
        :param embedding_dim: embedding dimension forwarded to the module
        :param max_epochs: maximum number of training epochs
        :param lr: base learning rate
        :param optimizer: torch optimizer class handed to the skorch net
        :param device: device on which computation will take place.
            NOTE(review): the default ``None`` is forwarded to
            ``torch.device(device)`` below — confirm callers always pass a
            concrete device string.
        :param train_split: if None, no internal validation split is made,
            else a skorch CVSplit-style object
        :param weight_decay: weight-decay value forwarded to the optimizer
        :param lstm_dropout: dropout rate forwarded to the module's LSTM
        :return: list of (name, step) pairs for sklearn.pipeline.Pipeline
        """

        # Collate builds batches holding padded k-mer sequences plus
        # fingerprints, placed on the requested device.
        collate_fn = partial(collate_bilstm_fingerprint,
                             device=torch.device(device),
                             ecfp_dim=ecfp_dim)
        kmers_encoder = KmerEncoder(kmer_size=kmer_size, pad=True)
        # +1 on num_kmers: reserves an extra index (presumably the padding
        # token added by pad=True — verify against KmerEncoder).
        net = NeuralNetRegressor(module=SiameseBiLSTMFingerprints,
                                 module__num_kmers=kmers_encoder.dim + 1,
                                 module__num_fingerprints=ecfp_dim,
                                 module__embedding_dim=embedding_dim,
                                 module__hidden_size=hidden_size,
                                 module__mlp_sizes=mlp_sizes,
                                 module__lstm_dropout=lstm_dropout,
                                 max_epochs=max_epochs,
                                 lr=lr,
                                 optimizer=optimizer,
                                 optimizer__weight_decay=weight_decay,
                                 device=device,
                                 iterator_train__shuffle=True,
                                 iterator_train__collate_fn=collate_fn,
                                 iterator_valid__collate_fn=collate_fn,
                                 train_split=train_split)
        # Dense ECFP output here (sparse_output=False), unlike the sparse
        # baselines: the collate_fn re-encodes fingerprints itself.
        return [('encode_proteins', kmers_encoder),
                ('encode_ecfp',
                 ECFPEncoder(radius=radius, dim=ecfp_dim,
                             sparse_output=False)),
                ('to_dict',
                 DfToDict({
                     'protein_input': 'kmers_encoding',
                     'compound_input': 'ecfp_encoding',
                     'protein_lengths': 'encoding_len'
                 })), ('bilstm_fingerprint', net)]