Ejemplo n.º 1
0
def test_vep_plugin():
    """Compare VEP-plugin output with the Python package's predictions.

    Both result tables are reduced with ``max_varEff`` and indexed by ``ID``;
    the ``delta_logit_psi`` values of the IDs present in both tables must have
    a Pearson correlation of at least 0.95.
    """
    gtf = 'tests/data/test.gtf'
    vcf = 'tests/data/test.vcf.gz'
    fasta = 'tests/data/hg19.nochr.chr17.fa'

    dl = SplicingVCFDataloader(gtf, fasta, vcf)
    model = MMSplice()

    # Predictions computed directly by the Python package.
    python_max = max_varEff(
        predict_all_table(model,
                          dl,
                          pathogenicity=True,
                          splicing_efficiency=True)
    ).set_index('ID')

    # Predictions read back from the VEP plugin's output.
    plugin_max = max_varEff(read_vep(vep_output)).set_index('ID')

    # Only IDs reported by both sources can be compared.
    plugin_ids = set(plugin_max.index)
    python_ids = set(python_max.index)
    shared_ids = list(plugin_ids & python_ids)

    plugin_scores = plugin_max.loc[shared_ids, 'delta_logit_psi']
    python_scores = python_max.loc[shared_ids, 'delta_logit_psi']

    correlation = pearsonr(plugin_scores, python_scores)[0]
    assert correlation >= 0.95
Ejemplo n.º 2
0
def test_predict_all_table(vcf_path):
    """Check the number of rows produced by ``predict_all_table``.

    The ``delta_logit_psi`` column is expected to hold one entry fewer than
    there are items in the module-level ``variants`` collection.
    """
    model = MMSplice()
    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)

    df = predict_all_table(
        model,
        dl,
        pathogenicity=True,
        splicing_efficiency=True,
    )

    n_predictions = len(df['delta_logit_psi'])
    assert n_predictions == len(variants) - 1
Ejemplo n.º 3
0
def test_predict_all_table_exon_dataloader(vcf_path):
    """Check ``predict_all_table`` row count when fed by an ``ExonDataset``.

    The prediction table must contain exactly one ``delta_logit_psi`` entry
    per row of the exon CSV file. ``vcf_path`` is accepted but not used by
    the test body.
    """
    model = MMSplice()
    # Row count of the exon file is the expected number of predictions.
    expected_rows = pd.read_csv(exon_file).shape[0]

    dl = ExonDataset(exon_file, fasta_file)
    df = predict_all_table(
        model,
        dl,
        pathogenicity=True,
        splicing_efficiency=True,
    )

    assert len(df['delta_logit_psi']) == expected_rows
Ejemplo n.º 4
0
def test_exon_model_masking():
    """Check that the exon model's score for 'AAA' is batch-independent.

    'AAA' is scored alone and then as the first element of batches that also
    contain a longer sequence; the first prediction of every batch must agree
    with the stand-alone score to within 1e-6.
    """
    model = MMSplice()

    batches = (
        ['AAA'],
        ['AAA', 'CATACA'],
        ['AAA', 'CATACAGGAA'],
    )
    # Keep only the first output of the first sequence in each batch.
    preds = [model.exonM.predict(encodeDNA(batch))[0][0] for batch in batches]

    baseline = preds[0]
    for score in preds:
        assert abs(baseline - score) < 1e-6
Ejemplo n.º 5
0
def run():
    """Serve MMSplice predictions over a line-based JSON stdin/stdout protocol.

    Protocol, as implemented below:

    1. The first stdin line is a JSON object of model options. Only entries
       with a truthy value are forwarded to ``MMSplice`` as keyword arguments.
    2. After a warm-up prediction the line ``MMSPLICE-RESPONSE:1`` is written
       to signal readiness.
    3. Every following stdin line is a JSON object with the keys ``ref_seq``,
       ``alt_seq``, ``intronl_len`` and ``intronr_len``. For each one a line
       ``MMSPLICE-RESPONSE:<comma-separated scores>`` is written, containing
       the reference scores, the alternative scores, the delta logit PSI and
       the pathogenicity, in that order.

    The loop ends cleanly when stdin reaches end-of-file. Previously an EOF
    produced an empty string that was handed to ``json.loads``, which raised
    ``JSONDecodeError`` when the peer closed the pipe.

    Raises:
        json.JSONDecodeError: if a received line (including a blank line) is
            not valid JSON.
    """
    options = json.loads(sys.stdin.readline().strip())

    K.clear_session()
    # Drop falsy option values so they are not passed to the constructor.
    psi_model = MMSplice(
        **{k: v for k, v in options.items() if v})
    psi_model.spliter = SeqSpliter(pattern_warning=False)

    # Warm up the model so the first real request does not pay that cost.
    psi_model.predict("A" * 100, (4, 4))

    sys.stdout.write('MMSPLICE-RESPONSE:' + '1\n')
    sys.stdout.flush()

    while True:
        line = sys.stdin.readline()
        if not line:
            # readline() returns '' only at EOF: the peer closed the stream.
            break
        variant = json.loads(line.strip())

        overhang = (variant['intronl_len'], variant['intronr_len'])
        ref_scores = np.matrix(psi_model.predict(variant['ref_seq'], overhang))
        alt_scores = np.matrix(psi_model.predict(variant['alt_seq'], overhang))
        # Flatten to a plain list: reference scores followed by alt scores.
        scores = np.hstack([ref_scores, alt_scores]).tolist()[0]

        scores.extend([
            predict_deltaLogitPsi(ref_scores, alt_scores)[0],
            predict_pathogenicity(ref_scores, alt_scores)[0]
        ])

        sys.stdout.write('MMSPLICE-RESPONSE:' +
                         ','.join(map(str, scores)) + '\n')
        # Flush after every response so the reader is never left waiting.
        sys.stdout.flush()
Ejemplo n.º 6
0
def test_vep_plugin():
    """Compare VEP-plugin output with the Python package's predictions.

    The dataloader is given the pickle path both as its first argument and as
    ``out_file``. Both result tables are reduced with ``max_varEff`` and
    indexed by ``ID``; the ``mmsplice_dlogitPsi`` values of the IDs present in
    both tables must have a Pearson correlation of at least 0.99.
    """
    # The previously assigned 'tests/data/test.gtf' path was never used by
    # this test (the pickle below is passed instead), so it has been removed.
    vcf = 'tests/data/test.vcf.gz'
    fasta = 'tests/data/hg19.nochr.chr17.fa'
    gtfIntervalTree = 'tests/data/test.pkl'  # pickle exon interval Tree

    dl = SplicingVCFDataloader(gtfIntervalTree,
                               fasta,
                               vcf,
                               out_file=gtfIntervalTree,
                               split_seq=False,
                               overhang=(100, 100))

    model = MMSplice(exon_cut_l=0,
                     exon_cut_r=0,
                     acceptor_intron_cut=6,
                     donor_intron_cut=6,
                     acceptor_intron_len=50,
                     acceptor_exon_len=3,
                     donor_exon_len=5,
                     donor_intron_len=13)

    # Predictions computed directly by the Python package.
    df_python = predict_all_table(model,
                                  dl,
                                  batch_size=1024,
                                  split_seq=False,
                                  assembly=True,
                                  pathogenicity=True,
                                  splicing_efficiency=True)
    df_python_predictionsMax = max_varEff(df_python).set_index('ID')

    # Predictions read back from the VEP plugin's output.
    df_plugin = read_vep(vep_output)
    df_plugin_predictionsMax = max_varEff(df_plugin).set_index('ID')

    # Only IDs reported by both sources can be compared.
    indexes = list(
        set(df_plugin_predictionsMax.index)
        & set(df_python_predictionsMax.index))

    vep_plugin_dlogitPsi = df_plugin_predictionsMax.loc[indexes,
                                                        'mmsplice_dlogitPsi']
    python_package = df_python_predictionsMax.loc[indexes,
                                                  'mmsplice_dlogitPsi']

    assert pearsonr(vep_plugin_dlogitPsi, python_package)[0] >= 0.99
Ejemplo n.º 7
0
def test_predict_all_table(vcf_path):
    """Check the number of rows produced by ``predict_all_table``.

    The model is built with explicit cut/length settings, and the
    ``mmsplice_dlogitPsi`` column is expected to hold one entry fewer than
    there are items in the module-level ``variants`` collection.
    """
    model_config = {
        'exon_cut_l': 0,
        'exon_cut_r': 0,
        'acceptor_intron_cut': 6,
        'donor_intron_cut': 6,
        'acceptor_intron_len': 50,
        'acceptor_exon_len': 3,
        'donor_exon_len': 5,
        'donor_intron_len': 13,
    }
    model = MMSplice(**model_config)

    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)
    df = predict_all_table(
        model,
        dl,
        batch_size=1024,
        split_seq=False,
        assembly=True,
        pathogenicity=True,
        splicing_efficiency=True,
    )

    n_predictions = len(df['mmsplice_dlogitPsi'])
    assert n_predictions == len(variants) - 1
Ejemplo n.º 8
0
def writeMMSpliceToVcf(vcf_in, vcf_lowAF, vcf_out, gtf, fasta):
    """Annotate a VCF with per-exon MMSplice predictions.

    Steps, in order:

    1. ``lowFrequencyVariants(vcf_in, vcf_lowAF)`` is called to produce
       ``vcf_lowAF`` from ``vcf_in``.
    2. MMSplice predictions are computed for the variants in ``vcf_lowAF``.
    3. For every variant ID, one annotation string per prediction row is
       built and the strings are concatenated in row order.
    4. The resulting ``{ID: annotation}`` mapping is handed to
       ``writeTempVCF(vcf_in, vcf_out, mapping)``.

    Args:
        vcf_in: input VCF path, passed to ``lowFrequencyVariants`` and
            ``writeTempVCF``.
        vcf_lowAF: path passed to ``lowFrequencyVariants`` as its second
            argument and then to the dataloader.
        vcf_out: output path passed to ``writeTempVCF``.
        gtf: GTF annotation passed to the dataloader.
        fasta: reference FASTA passed to the dataloader.
    """
    lowFrequencyVariants(vcf_in, vcf_lowAF)

    # Dataloader that loads the variants from the filtered VCF.
    dl = SplicingVCFDataloader(gtf, fasta, vcf_lowAF, tissue_specific=False)

    model = MMSplice()

    predictions = predict_all_table(model,
                                    dl,
                                    pathogenicity=True,
                                    splicing_efficiency=True,
                                    progress=True)

    # Collect the annotation fragments per variant ID. Fragments are joined
    # once at the end instead of growing a string on every row, and no
    # builtin names (dict, id) are shadowed.
    fragments_by_id = {}
    for row in predictions.itertuples():
        fragment = (
            f"{exon_sep[0]}exon:{row.exons},"
            f"delta_logit_psi:{round(row.delta_logit_psi, 4)!s},"
            f"pathogenicity:{round(row.pathogenicity, 4)!s}"
            f"{exon_sep[1]}"
        )
        fragments_by_id.setdefault(row.ID, []).append(fragment)

    annotations = {
        variant_id: "".join(fragments)
        for variant_id, fragments in fragments_by_id.items()
    }

    writeTempVCF(vcf_in, vcf_out, annotations)
Ejemplo n.º 9
0
                    action="store",
                    dest="vcf",
                    help="input gzipped vcf")
parser.add_argument("--fasta",
                    action="store",
                    dest="fasta",
                    help="reference genome fasta file")
parser.add_argument("--gtf", action="store", dest="gtf", help="gtf file")
parser.add_argument("--output",
                    action="store",
                    dest="output",
                    help="output file for MMSplice results")
args = parser.parse_args()

# Instantiate the model with its default settings.
model = MMSplice()

# Build the dataloader from the GTF, FASTA and VCF paths given on the
# command line.
dl = SplicingVCFDataloader(args.gtf, args.fasta, args.vcf)

# Run the model over the dataloader with the pathogenicity and
# splicing_efficiency flags enabled.
predictions = predict_all_table(model,
                                dl,
                                pathogenicity=True,
                                splicing_efficiency=True)

# Summarize the predictions with max_varEff.
predictionsMax = max_varEff(predictions)

# Write the summarized predictions out, using the input VCF and the
# requested output path.
writeVCF(args.vcf, args.output, predictionsMax)
Ejemplo n.º 10
0
from kipoi.model import BaseModel
from mmsplice import MMSplice
from mmsplice.utils import predict_splicing_efficiency

# Single module-level MMSplice instance used by every MMSpliceModel call.
mmsplice = MMSplice()


class MMSpliceModel(BaseModel):
    """Kipoi model wrapper around the module-level ``mmsplice`` instance."""

    def predict_on_batch(self, inputs):
        """Score a batch of reference/mutated sequence pairs.

        ``inputs['seq']`` and ``inputs['mut_seq']`` are each passed through
        ``mmsplice.predict_on_batch``; the two results are then combined by
        ``predict_splicing_efficiency``, whose return value is returned.
        """
        ref_features = mmsplice.predict_on_batch(inputs['seq'])
        alt_features = mmsplice.predict_on_batch(inputs['mut_seq'])
        return predict_splicing_efficiency(ref_features, alt_features)
Ejemplo n.º 11
0
import numpy as np
from mmsplice import MMSplice
from mmsplice.vcf_dataloader import SplicingVCFDataloader as BaseSplicingVCFDataloader

# Module-level MMSplice instance used by the dataloader subclass below.
model = MMSplice(exon_cut_l=0,
                 exon_cut_r=0,
                 acceptor_intron_cut=6,
                 donor_intron_cut=6,
                 acceptor_intron_len=50,
                 acceptor_exon_len=3,
                 donor_exon_len=5,
                 donor_intron_len=13)


class SplicingVCFDataloader(BaseSplicingVCFDataloader):
    """Dataloader whose items carry MMSplice predictions instead of raw inputs."""

    def __next__(self):
        """Return the next item with its inputs replaced by model predictions.

        The parent's item is fetched, ``model.predict`` is applied to its
        ``'inputs_mut'`` and ``'inputs'`` entries, and the ``.values`` of the
        two results are concatenated with the mutated predictions first.
        The parent's ``'metadata'`` entry is passed through unchanged.
        """
        item = super().__next__()

        # Mutated predictions are computed first, then the reference ones.
        mut_values = model.predict(item['inputs_mut']).values
        ref_values = model.predict(item['inputs']).values

        return {
            'inputs': np.concatenate([mut_values, ref_values]),
            'metadata': item['metadata'],
        }
Ejemplo n.º 12
0
def test_mmsplice():
    """Check that ``predict`` on a sequence plus overhang yields 5 values."""
    model = MMSplice()
    pred = model.predict('ATGCGACGTACCCAGTAAAT', (4, 4))
    n_outputs = len(pred)
    assert n_outputs == 5
Ejemplo n.º 13
0
def test_mmsplice():
    """Check that ``predict`` on a dict-style input yields 5 values."""
    example = {
        'seq': "ATGCGACGTACCCAGTAAAT",
        'intronl_len': 4,
        'intronr_len': 4,
    }
    model = MMSplice()
    pred = model.predict(example)
    n_outputs = len(pred)
    assert n_outputs == 5