def test_simple_numeric_predictor():
    predictor = Predictor(
        inputs=[NumericInput(dim=30)],
        outputs=[Output(dim=1, activation="sigmoid")],
        dense_layer_sizes=[30],
        dense_activation="relu")
    y = predictor.predict(randn(10, 30))
    eq_(len(y), 10)
def test_fixed_length_onehot():
    model = Predictor(
        inputs=SequenceInput(length=9, variable_length=False, encoding="onehot"),
        outputs=Output(1, activation="sigmoid"))
    seqs = ["A" * 9, "L" * 9]
    y = model.predict(seqs)
    eq_(len(y), 2)
def test_simple_numeric_predictor_named():
    predictor = Predictor(
        inputs=[NumericInput(name="x", dim=30)],
        outputs=[Output(dim=1, name="y", activation="sigmoid")],
        hidden_layer_sizes=[30],
        hidden_activation="relu")
    y = predictor.predict({"x": randn(10, 30)})["y"]
    eq_(len(y), 10)
def test_simple_sequence_predictor_named():
    predictor = Predictor(
        inputs=[SequenceInput(length=4, name="x", variable_length=True)],
        outputs=[Output(dim=1, activation="sigmoid", name="y")],
        hidden_layer_sizes=[30],
        hidden_activation="relu")
    y = predictor.predict({"x": ["SFY-"] * 10})["y"]
    eq_(len(y), 10)
def test_simple_sequence_predictor():
    predictor = Predictor(
        inputs=[SequenceInput(length=4, variable_length=True)],
        outputs=[Output(dim=1, activation="sigmoid")],
        dense_layer_sizes=[30],
        dense_activation="relu")
    y = predictor.predict(["SFY-"] * 10)
    eq_(len(y), 10)
def test_embedding_conv_1_layer():
    model = Predictor(
        inputs=SequenceInput(
            length=3,
            variable_length=False,
            conv_filter_sizes={2: 3}),
        outputs=Output(1, activation="sigmoid"))
    X = ["SAY", "FFQ"]
    Y = np.array([True, False])
    model.fit(X, Y)
def test_two_input_predictor():
    predictor = Predictor(
        inputs=[
            SequenceInput(length=4, name="x1", variable_length=True),
            NumericInput(dim=30, name="x2")
        ],
        outputs=[Output(name="y", dim=1, activation="sigmoid")],
        dense_layer_sizes=[30],
        dense_activation="relu")
    y = predictor.predict({"x1": ["SFY-"] * 10, "x2": randn(10, 30)})["y"]
    eq_(len(y), 10)
def test_basic_rnn():
    pred = Predictor(
        inputs=SequenceInput(
            name="x",
            length=4,
            variable_length=True,
            encoding="embedding",
            rnn_layer_sizes=[20],
            rnn_type="lstm",
            rnn_bidirectional=True),
        outputs=Output(dim=1, activation="sigmoid", name="y"))
    x = ["SF", "Y", "AALL"]
    y = pred.predict({"x": x})["y"]
    eq_(len(x), len(y))
    found_rnn_layer = any(
        "bidirectional" in layer.name for layer in pred.model.layers)
    assert found_rnn_layer
def test_predictor_output_transform():
    predictor = Predictor(
        inputs=[NumericInput(dim=30, name="x")],
        outputs=[
            Output(
                name="y",
                dim=1,
                activation="sigmoid",
                transform=log,
                inverse_transform=exp)
        ],
        dense_layer_sizes=[30],
        dense_activation="relu")
    y = predictor.predict({"x": randn(10, 30)})["y"]
    eq_(len(y), 10)
    # make sure transformed outputs are within given bounds
    assert exp(0.0) <= y.min() <= exp(1.0)
    assert exp(0.0) <= y.max() <= exp(1.0)
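# Note on the bounds above: the sigmoid activation keeps the raw network
# output in [0, 1], and inverse_transform=exp is applied to predictions, so
# the returned values should land in [exp(0), exp(1)] ~= [1.0, 2.72], which
# is exactly what the two assertions check.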
def make_predictors(
        widths=[9],
        layer_sizes=[8],
        n_conv_layers=[2],
        conv_dropouts=[0.25],
        conv_activation="relu",
        global_pooling_batch_normalization=True):
    return {
        (width, layer_size, n_layers, dropout): Predictor(
            inputs=SequenceInput(
                name="peptide",
                length=45,
                add_start_tokens=True,
                add_stop_tokens=True,
                variable_length=True,
                conv_filter_sizes=[1, 3, 5, 7, width, 11],
                n_conv_layers=n_layers,
                conv_output_dim=layer_size,
                conv_activation=conv_activation,
                conv_dropout=dropout,
                global_pooling=True,
                global_pooling_batch_normalization=global_pooling_batch_normalization),
            outputs=Output(1, activation="sigmoid"))
        for width in widths
        for layer_size in layer_sizes
        for n_layers in n_conv_layers
        for dropout in conv_dropouts
    }
def _make_nn_model(self, n_features, dense_layer_sizes=[20], batch_normalization=True):
    return Predictor(
        inputs=NumericInput(
            n_features,
            dense_layer_sizes=dense_layer_sizes,
            dense_batch_normalization=batch_normalization),
        outputs=Output(1, activation="sigmoid"))
def from_dict(cls, d):
    # importing locally to avoid adding a significant delay to
    # the import of every module
    from pepnet import Predictor
    model_dicts = d["models"]
    model_weights = d.get("model_weights")
    models = [Predictor.from_dict(model_dict) for model_dict in model_dicts]
    return cls(models=models, model_weights=model_weights)
def test_predictor_on_more_data():
    predictor = Predictor(
        inputs=[SequenceInput(length=20, name="x", variable_length=True)],
        outputs=[Output(dim=1, activation="sigmoid", name="y")],
        dense_layer_sizes=[30],
        dense_activation="relu")
    train_df = synthetic_peptides_by_subsequence(1000)
    test_df = synthetic_peptides_by_subsequence(1000)
    predictor.fit({"x": train_df.index.values}, train_df.binder.values, epochs=20)
    y_pred = predictor.predict({"x": test_df.index.values})["y"]
    y_pred = pandas.Series(y_pred, index=test_df.index)
    binder_mean_pred = y_pred[test_df.binder].mean()
    nonbinder_mean_pred = y_pred[~test_df.binder].mean()
    print(binder_mean_pred, nonbinder_mean_pred)
    assert binder_mean_pred > nonbinder_mean_pred * 2, (
        binder_mean_pred, nonbinder_mean_pred)
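# synthetic_peptides_by_subsequence is not defined in this excerpt; from its
# usage above it appears to return a pandas DataFrame indexed by peptide
# string with a boolean "binder" column, so the index supplies the sequence
# input and the column supplies the labels.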
def test_model_with_fixed_length_context():
    model = Predictor(
        inputs={
            "upstream": SequenceInput(length=1, variable_length=False),
            "downstream": SequenceInput(length=1, variable_length=False),
            "peptide": SequenceInput(length=3, variable_length=True)
        },
        outputs=Output(1, activation="sigmoid"))
    Y = np.array([True, False, True, False])
    input_dict = {
        "upstream": ["Q", "A", "L", "I"],
        "downstream": ["S"] * 4,
        "peptide": ["SYF", "QQ", "C", "GLL"]
    }
    model.fit(input_dict, Y, epochs=20)
    Y_pred = model.predict(input_dict)
    assert (Y == (Y_pred > 0.5)).all(), (Y, Y_pred)
def make_model(sufficiently_large_output_names):
    mhc = SequenceInput(
        length=34,
        name="mhc",
        encoding="index",
        variable_length=True,
        embedding_dim=20,
        embedding_mask_zero=False,
        dense_layer_sizes=[64],
        dense_batch_normalization=True)
    peptide = SequenceInput(
        length=50,
        name="peptide",
        encoding="index",
        embedding_dim=20,
        embedding_mask_zero=True,
        variable_length=True,
        conv_filter_sizes=[1, 9, 10],
        conv_activation="relu",
        conv_output_dim=32,
        n_conv_layers=2,
        # conv_weight_source=mhc,
        global_pooling=True,
        global_pooling_batch_normalization=True)
    outputs = []
    for output_name in sufficiently_large_output_names:
        if "IC50" in output_name or "EC50" in output_name:
            transform = from_ic50
            inverse = to_ic50
            activation = "sigmoid"
        elif "half life" in output_name:
            transform = (lambda x: np.log10(x + 1))
            inverse = (lambda x: (10.0 ** x) - 1)
            activation = "relu"
        else:
            transform = None
            inverse = None
            activation = "sigmoid"
        output = Output(
            name=output_name,
            transform=transform,
            inverse_transform=inverse,
            activation=activation)
        print(output)
        outputs.append(output)
    return Predictor(
        inputs=[mhc, peptide],
        outputs=outputs,
        merge_mode="concat",
        dense_layer_sizes=[32],
        dense_activation="tanh",
        dense_batch_normalization=True)
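# from_ic50/to_ic50 are referenced above but not defined in this excerpt.
# A minimal sketch of one common convention (an assumption here, not taken
# from this source): binding affinities in nM are mapped onto [0, 1] so a
# sigmoid output can regress them, with 50000 nM as an assumed upper bound.
import numpy as np

def from_ic50(ic50, max_ic50=50000.0):
    # hypothetical helper: map IC50 (nM) to a score in [0, 1]
    x = 1.0 - np.log(ic50) / np.log(max_ic50)
    return np.minimum(1.0, np.maximum(0.0, x))

def to_ic50(x, max_ic50=50000.0):
    # hypothetical inverse: recover IC50 (nM) from a [0, 1] score
    return max_ic50 ** (1.0 - x)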
def test_predictor_weights_all_ones():
    predictor = Predictor(
        inputs=[
            SequenceInput(length=2, variable_length=False, encoding="onehot")
        ],
        outputs=[Output(dim=1, activation="sigmoid")])
    weights = predictor.get_weights()
    for w in weights:
        w.fill(1)
    predictor.set_weights(weights)
    predictor2 = Predictor.from_json(predictor.to_json())
    for w in predictor2.get_weights():
        assert (w == np.ones_like(w)).all(), "Expected %s to be all 1s" % (w,)
def make_predictors(
        widths=[9],
        layer_sizes=[16],
        n_conv_layers=[2],
        conv_dropouts=[0]):
    return {
        (width, layer_size, n_layers, dropout): Predictor(
            inputs=SequenceInput(
                name="peptide",
                length=22,
                variable_length=True,
                conv_filter_sizes=[1, 3, width],
                n_conv_layers=n_layers,
                conv_output_dim=layer_size,
                conv_dropout=dropout,
                global_pooling=True),
            outputs=Output(1, activation="sigmoid"))
        for width in widths
        for layer_size in layer_sizes
        for n_layers in n_conv_layers
        for dropout in conv_dropouts
    }
def make_predictors(
        widths=[8, 9, 10],
        layer_sizes=[4, 16, 32],
        n_conv_layers=[1, 2],
        conv_dropouts=[0, 0.25]):
    return {
        "width=%d, layer_size=%d, n_layers=%d, conv=%0.2f" % (
            width, layer_size, n_layers, dropout): Predictor(
                inputs=SequenceInput(
                    name="peptide",
                    length=22,
                    variable_length=True,
                    conv_filter_sizes=[width],
                    n_conv_layers=n_layers,
                    conv_output_dim=layer_size,
                    conv_dropout=dropout,
                    global_pooling=True),
                outputs=Output(1, activation="sigmoid"))
        for width in widths
        for layer_size in layer_sizes
        for n_layers in n_conv_layers
        for dropout in conv_dropouts
    }
def make_predictor(
        conv_filter_sizes,
        conv_dropout=0.25,
        conv_activation="relu",
        global_pooling_batch_normalization=True,
        dense_layer_sizes=[],
        dense_dropout=0.25,
        dense_activation="relu"):
    return Predictor(
        inputs=SequenceInput(
            name="peptide",
            length=MAX_LENGTH,
            add_start_tokens=True,
            add_stop_tokens=True,
            variable_length=True,
            embedding_dim=24,
            conv_filter_sizes=conv_filter_sizes,
            conv_activation=conv_activation,
            conv_dropout=conv_dropout,
            global_pooling=True,
            global_pooling_batch_normalization=global_pooling_batch_normalization),
        outputs=Output(
            1,
            activation="sigmoid",
            dense_layer_sizes=dense_layer_sizes,
            dense_activation=dense_activation,
            dense_dropout=dense_dropout))
def make_model(output_names):
    mhc = SequenceInput(
        length=34,
        name="mhc",
        encoding="index",
        variable_length=True,
        embedding_dim=32,
        embedding_mask_zero=False,
        dense_layer_sizes=[32],
        dense_activation="tanh",
        dense_batch_normalization=MERGE_BATCH_NORMALIZATION,
        dense_dropout=MERGE_DROPOUT)
    peptide = SequenceInput(
        length=45,
        name="peptide",
        encoding="index",
        add_start_tokens=True,
        add_stop_tokens=True,
        embedding_dim=32,
        embedding_mask_zero=True,
        variable_length=True,
        conv_filter_sizes=[9],
        conv_activation="relu",
        conv_output_dim=32,
        conv_dropout=CONV_DROPOUT,
        conv_batch_normalization=CONV_BATCH_NORMALIZATION,
        n_conv_layers=2,
        # conv_weight_source=mhc,
        global_pooling=True,
        global_pooling_batch_normalization=True,
        global_pooling_dropout=0.25,
        dense_layer_sizes=[32],
        dense_activation="sigmoid",
        dense_batch_normalization=MERGE_BATCH_NORMALIZATION,
        dense_dropout=MERGE_DROPOUT)
    outputs = []
    for output_name in output_names:
        if "IC50" in output_name or "EC50" in output_name:
            transform = from_ic50
            inverse = to_ic50
            activation = "sigmoid"
        elif "half life" in output_name:
            transform = (lambda x: np.log10(x + 1))
            inverse = (lambda x: (10.0 ** x) - 1)
            activation = "relu"
        else:
            transform = None
            inverse = None
            activation = "sigmoid"
        output = Output(
            name=output_name,
            transform=transform,
            inverse_transform=inverse,
            activation=activation,
            loss=LOSS)
        print(output)
        outputs.append(output)
    return Predictor(
        inputs=[mhc, peptide],
        outputs=outputs,
        merge_mode=MERGE,
        training_metrics=["accuracy"])
def test_discrete_input_with_str_tokens():
    pred = Predictor(
        inputs=DiscreteInput(choices=["x", "y", "z"], embedding_dim=2),
        outputs=Output(1, "sigmoid"))
    pred.fit(["x", "x", "y", "z"], [0, 0, 0.5, 1.0], epochs=20)
    assert pred.predict(["x"]) < pred.predict(["z"])
def make_predictors():
    return {
        "pool": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                global_pooling=True),
            outputs=Output(1, activation="sigmoid")),
        "rnn": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                rnn_layer_sizes=[32]),
            outputs=Output(1, activation="sigmoid")),
        "rnn2": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                rnn_layer_sizes=[32, 32]),
            outputs=Output(1, activation="sigmoid")),
        "conv-pool": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                conv_filter_sizes=[9], conv_output_dim=16, conv_dropout=0.1,
                global_pooling=True),
            outputs=Output(1, activation="sigmoid")),
        "conv2-pool": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                conv_filter_sizes=[9], n_conv_layers=2, conv_output_dim=16,
                conv_dropout=0.1, global_pooling=True),
            outputs=Output(1, activation="sigmoid")),
        "conv-rnn": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                conv_filter_sizes=[9], conv_output_dim=16, conv_dropout=0.1,
                rnn_layer_sizes=[32]),
            outputs=Output(1, activation="sigmoid")),
        "multiconv-pool": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                conv_filter_sizes=[3, 9], conv_output_dim=16, conv_dropout=0.1,
                global_pooling=True),
            outputs=Output(1, activation="sigmoid")),
        "multiconv2-pool": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                conv_filter_sizes=[3, 9], n_conv_layers=2, conv_output_dim=16,
                conv_dropout=0.1, global_pooling=True),
            outputs=Output(1, activation="sigmoid")),
        "multiconv-rnn": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                conv_filter_sizes=[3, 9], conv_output_dim=16, conv_dropout=0.1,
                rnn_layer_sizes=[32]),
            outputs=Output(1, activation="sigmoid")),
        "multiconv2-rnn": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                conv_filter_sizes=[3, 9], n_conv_layers=2, conv_output_dim=16,
                conv_dropout=0.1, rnn_layer_sizes=[32]),
            outputs=Output(1, activation="sigmoid")),
        "multiconv2-rnn2": Predictor(
            inputs=SequenceInput(
                name="peptide", length=22, variable_length=True,
                conv_filter_sizes=[3, 9], n_conv_layers=2, conv_output_dim=16,
                conv_dropout=0.1, rnn_layer_sizes=[32, 32]),
            outputs=Output(1, activation="sigmoid"))
    }
def test_predictor_json_identity():
    predictor = Predictor(
        inputs=[
            SequenceInput(length=2, variable_length=False, encoding="onehot")
        ],
        outputs=[Output(dim=1, activation="sigmoid")])
    eq_(predictor, Predictor.from_json(predictor.to_json()))