def test_vep_plugin():
    """Compare VEP-plugin scores against scores from the Python package.

    Predictions are computed in Python for the test VCF, reduced with
    ``max_varEff`` and indexed by ``ID``; the same reduction is applied to the
    table read from ``vep_output``. For the IDs present in both tables, the
    ``delta_logit_psi`` columns must have a Pearson correlation of at
    least 0.95.
    """
    gtf = 'tests/data/test.gtf'
    vcf = 'tests/data/test.vcf.gz'
    fasta = 'tests/data/hg19.nochr.chr17.fa'

    # Python-side predictions.
    dl = SplicingVCFDataloader(gtf, fasta, vcf)
    model = MMSplice()
    df_python = predict_all_table(
        model, dl, pathogenicity=True, splicing_efficiency=True)
    python_max = max_varEff(df_python).set_index('ID')

    # Plugin-side predictions parsed from the VEP output.
    df_plugin = read_vep(vep_output)
    plugin_max = max_varEff(df_plugin).set_index('ID')

    # Only variants scored by both tools can be compared.
    shared_ids = list(set(plugin_max.index) & set(python_max.index))

    plugin_scores = plugin_max.loc[shared_ids, 'delta_logit_psi']
    python_scores = python_max.loc[shared_ids, 'delta_logit_psi']

    correlation = pearsonr(plugin_scores, python_scores)[0]
    assert correlation >= 0.95
def test_predict_all_table(vcf_path):
    """Check the number of rows produced by ``predict_all_table``.

    The ``delta_logit_psi`` column is expected to hold one entry fewer than
    the number of items in ``variants``.
    """
    model = MMSplice()
    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)
    df = predict_all_table(
        model, dl, pathogenicity=True, splicing_efficiency=True)

    expected_rows = len(variants) - 1
    assert len(df['delta_logit_psi']) == expected_rows
def test_predict_all_table_exon_dataloader(vcf_path):
    """Check ``predict_all_table`` with an ``ExonDataset`` as the dataloader.

    The prediction table must contain one ``delta_logit_psi`` entry per row
    of the CSV at ``exon_file``. ``vcf_path`` is accepted but not used by
    the test body.
    """
    model = MMSplice()
    df_exons = pd.read_csv(exon_file)
    dl = ExonDataset(exon_file, fasta_file)
    df = predict_all_table(
        model, dl, pathogenicity=True, splicing_efficiency=True)

    n_exons = df_exons.shape[0]
    assert len(df['delta_logit_psi']) == n_exons
def test_exon_model_masking():
    """Check that the first prediction is unaffected by longer batch-mates.

    ``'AAA'`` is encoded alone and together with longer sequences; the first
    prediction of each batch must agree with the stand-alone prediction to
    within 1e-6.
    """
    model = MMSplice()
    batches = (
        ['AAA'],
        ['AAA', 'CATACA'],
        ['AAA', 'CATACAGGAA'],
    )
    preds = [
        model.exonM.predict(encodeDNA(seqs))[0][0]
        for seqs in batches
    ]

    baseline = preds[0]
    for score in preds:
        assert abs(baseline - score) < 1e-6
def run():
    """Serve MMSplice predictions over a line-based stdin/stdout protocol.

    Protocol, as implemented below:

    1. The first line on stdin is a JSON object of model options. Entries
       with falsy values are dropped; the rest are passed to ``MMSplice`` as
       keyword arguments.
    2. After one warm-up prediction, ``MMSPLICE-RESPONSE:1`` is written to
       stdout to signal readiness.
    3. Every following line is a JSON object with the keys ``ref_seq``,
       ``alt_seq``, ``intronl_len`` and ``intronr_len``. For each one a line
       ``MMSPLICE-RESPONSE:<comma-separated scores>`` is written, containing
       the reference scores, the alternative scores, the delta logit PSI and
       the pathogenicity, in that order.

    The loop terminates cleanly when stdin reaches end-of-file instead of
    failing with a JSON decoding error on the empty string.
    """
    options = json.loads(sys.stdin.readline().strip())
    K.clear_session()

    model_kwargs = {k: v for k, v in options.items() if v}
    psi_model = MMSplice(**model_kwargs)
    psi_model.spliter = SeqSpliter(pattern_warning=False)

    # warms up the model
    psi_model.predict("A" * 100, (4, 4))
    sys.stdout.write('MMSPLICE-RESPONSE:' + '1\n')
    sys.stdout.flush()

    while True:
        line = sys.stdin.readline()
        if not line:
            # readline() returns '' only at EOF: the peer closed the pipe.
            break

        variant = json.loads(line.strip())
        overhang = (variant['intronl_len'], variant['intronr_len'])

        ref_scores = np.matrix(psi_model.predict(variant['ref_seq'], overhang))
        alt_scores = np.matrix(psi_model.predict(variant['alt_seq'], overhang))

        scores = np.hstack([ref_scores, alt_scores]).tolist()[0]
        scores.extend([
            predict_deltaLogitPsi(ref_scores, alt_scores)[0],
            predict_pathogenicity(ref_scores, alt_scores)[0]
        ])

        sys.stdout.write(
            'MMSPLICE-RESPONSE:' + ','.join(map(str, scores)) + '\n')
        sys.stdout.flush()
def test_vep_plugin():
    """Compare VEP-plugin scores against scores from the Python package.

    The dataloader is built from the pickled exon interval tree (also passed
    as ``out_file``) with sequence splitting disabled and a (100, 100)
    overhang. Both prediction tables are reduced with ``max_varEff`` and
    indexed by ``ID``; for IDs present in both, the ``mmsplice_dlogitPsi``
    columns must have a Pearson correlation of at least 0.99.
    """
    gtf = 'tests/data/test.gtf'
    vcf = 'tests/data/test.vcf.gz'
    fasta = 'tests/data/hg19.nochr.chr17.fa'
    gtfIntervalTree = 'tests/data/test.pkl'  # pickle exon interval Tree

    dl = SplicingVCFDataloader(
        gtfIntervalTree, fasta, vcf,
        out_file=gtfIntervalTree,
        split_seq=False,
        overhang=(100, 100))

    model = MMSplice(
        exon_cut_l=0,
        exon_cut_r=0,
        acceptor_intron_cut=6,
        donor_intron_cut=6,
        acceptor_intron_len=50,
        acceptor_exon_len=3,
        donor_exon_len=5,
        donor_intron_len=13)

    # Python-side predictions.
    df_python = predict_all_table(
        model, dl,
        batch_size=1024,
        split_seq=False,
        assembly=True,
        pathogenicity=True,
        splicing_efficiency=True)
    python_max = max_varEff(df_python).set_index('ID')

    # Plugin-side predictions parsed from the VEP output.
    df_plugin = read_vep(vep_output)
    plugin_max = max_varEff(df_plugin).set_index('ID')

    # Only variants scored by both tools can be compared.
    shared_ids = list(set(plugin_max.index) & set(python_max.index))

    plugin_scores = plugin_max.loc[shared_ids, 'mmsplice_dlogitPsi']
    python_scores = python_max.loc[shared_ids, 'mmsplice_dlogitPsi']

    correlation = pearsonr(plugin_scores, python_scores)[0]
    assert correlation >= 0.99
def test_predict_all_table(vcf_path):
    """Check the number of rows produced by ``predict_all_table``.

    The model is built with explicit cut/length settings, and the
    ``mmsplice_dlogitPsi`` column is expected to hold one entry fewer than
    the number of items in ``variants``.
    """
    model = MMSplice(
        exon_cut_l=0,
        exon_cut_r=0,
        acceptor_intron_cut=6,
        donor_intron_cut=6,
        acceptor_intron_len=50,
        acceptor_exon_len=3,
        donor_exon_len=5,
        donor_intron_len=13)
    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)

    df = predict_all_table(
        model, dl,
        batch_size=1024,
        split_seq=False,
        assembly=True,
        pathogenicity=True,
        splicing_efficiency=True)

    expected_rows = len(variants) - 1
    assert len(df['mmsplice_dlogitPsi']) == expected_rows
def writeMMSpliceToVcf(vcf_in, vcf_lowAF, vcf_out, gtf, fasta):
    """Predict MMSplice scores and hand per-variant annotations to the writer.

    Steps, as performed below:

    1. ``lowFrequencyVariants(vcf_in, vcf_lowAF)`` is called first
       (presumably it writes the filtered variants to ``vcf_lowAF`` --
       confirm against that helper).
    2. Predictions are computed for ``vcf_lowAF`` with a default ``MMSplice``
       model.
    3. For every prediction row an annotation string of the form
       ``<sep0>exon:<exons>,delta_logit_psi:<x>,pathogenicity:<y><sep1>`` is
       built (scores rounded to 4 decimals, separators taken from
       ``exon_sep``); strings for the same variant ID are concatenated.
    4. The ID -> annotation mapping is passed to
       ``writeTempVCF(vcf_in, vcf_out, ...)``.

    Args:
        vcf_in: Input VCF path; passed to the filter step and the writer.
        vcf_lowAF: VCF path used as input for the dataloader.
        vcf_out: Output path passed to ``writeTempVCF``.
        gtf: Annotation file passed to ``SplicingVCFDataloader``.
        fasta: Reference FASTA passed to ``SplicingVCFDataloader``.
    """
    lowFrequencyVariants(vcf_in, vcf_lowAF)

    # dataloader to load variants from vcf
    dl = SplicingVCFDataloader(gtf, fasta, vcf_lowAF, tissue_specific=False)

    # Specify model
    model = MMSplice()

    # Predict and return as df
    predictions = predict_all_table(
        model, dl,
        pathogenicity=True,
        splicing_efficiency=True,
        progress=True)

    # Map each variant ID to the concatenation of its per-exon annotations.
    # Local names avoid shadowing the builtins ``dict`` and ``id``.
    annotations = {}
    for row in predictions.itertuples():
        entry = (
            exon_sep[0]
            + "exon:" + row.exons + ","
            + "delta_logit_psi:" + str(round(row.delta_logit_psi, 4)) + ","
            + "pathogenicity:" + str(round(row.pathogenicity, 4))
            + exon_sep[1]
        )
        annotations[row.ID] = annotations.get(row.ID, "") + entry

    writeTempVCF(vcf_in, vcf_out, annotations)
action="store", dest="vcf", help="input gzipped vcf") parser.add_argument("--fasta", action="store", dest="fasta", help="reference genome fasta file") parser.add_argument("--gtf", action="store", dest="gtf", help="gtf file") parser.add_argument("--output", action="store", dest="output", help="output file for MMSplice results") args = parser.parse_args() # Specify model model = MMSplice() #dl = SplicingVCFDataloader(gtf, fasta, vcf, encode=False, tissue_specific=False) dl = SplicingVCFDataloader(args.gtf, args.fasta, args.vcf) # Or predict and return as df predictions = predict_all_table(model, dl, pathogenicity=True, splicing_efficiency=True) # Summerize with maximum effect size predictionsMax = max_varEff(predictions) writeVCF(args.vcf, args.output, predictionsMax)
from kipoi.model import BaseModel
from mmsplice import MMSplice
from mmsplice.utils import predict_splicing_efficiency

# Single MMSplice instance created at import time and shared by all calls.
mmsplice = MMSplice()


class MMSpliceModel(BaseModel):
    """Kipoi model wrapper around the module-level ``mmsplice`` instance."""

    def predict_on_batch(self, inputs):
        """Score a batch of reference/mutated sequence pairs.

        ``inputs['seq']`` and ``inputs['mut_seq']`` are each run through
        ``mmsplice.predict_on_batch``; the two results are then combined by
        ``predict_splicing_efficiency``, whose return value is returned.
        """
        ref_pred = mmsplice.predict_on_batch(inputs['seq'])
        alt_pred = mmsplice.predict_on_batch(inputs['mut_seq'])
        return predict_splicing_efficiency(ref_pred, alt_pred)
import numpy as np
from mmsplice import MMSplice
from mmsplice.vcf_dataloader import SplicingVCFDataloader as BaseSplicingVCFDataloader

# Module-level model used by the dataloader to turn sequences into scores.
model = MMSplice(
    exon_cut_l=0,
    exon_cut_r=0,
    acceptor_intron_cut=6,
    donor_intron_cut=6,
    acceptor_intron_len=50,
    acceptor_exon_len=3,
    donor_exon_len=5,
    donor_intron_len=13)


class SplicingVCFDataloader(BaseSplicingVCFDataloader):
    """Dataloader whose items carry model predictions instead of sequences."""

    def __next__(self):
        """Return the next item with ``inputs`` replaced by predictions.

        The parent's ``inputs_mut`` and ``inputs`` entries are each passed to
        ``model.predict``; the ``.values`` of both results are concatenated
        (mutated first, then reference). ``metadata`` is passed through
        unchanged.
        """
        batch = super().__next__()

        # Same evaluation order as before: mutated first, then reference.
        mut_scores = model.predict(batch['inputs_mut']).values
        ref_scores = model.predict(batch['inputs']).values

        return {
            'inputs': np.concatenate([mut_scores, ref_scores]),
            'metadata': batch['metadata'],
        }
def test_mmsplice():
    """Check that predicting on one sequence yields five values.

    The sequence is passed together with a (4, 4) overhang tuple.
    """
    model = MMSplice()
    scores = model.predict('ATGCGACGTACCCAGTAAAT', (4, 4))
    assert len(scores) == 5
def test_mmsplice():
    """Check that predicting on a dict-style input yields five values.

    The input dict carries the sequence plus the left and right intron
    lengths.
    """
    sample = {
        'seq': "ATGCGACGTACCCAGTAAAT",
        'intronl_len': 4,
        'intronr_len': 4,
    }
    model = MMSplice()
    scores = model.predict(sample)
    assert len(scores) == 5