Example #1
0
def test_vep_plugin():
    """Compare VEP-plugin scores against scores from the Python package.

    Both prediction tables are reduced with ``max_varEff`` and indexed by
    ``ID``. On the IDs present in both tables, the Pearson correlation of
    the ``delta_logit_psi`` column must be at least 0.95.
    """
    annotation_path = 'tests/data/test.gtf'
    variants_path = 'tests/data/test.vcf.gz'
    genome_path = 'tests/data/hg19.nochr.chr17.fa'
    score_column = 'delta_logit_psi'

    loader = SplicingVCFDataloader(annotation_path, genome_path,
                                   variants_path)
    splice_model = MMSplice()

    # Predictions computed directly by the Python package.
    package_table = predict_all_table(splice_model, loader,
                                      pathogenicity=True,
                                      splicing_efficiency=True)
    package_max = max_varEff(package_table).set_index('ID')

    # Predictions parsed from the VEP plugin output.
    plugin_table = read_vep(vep_output)
    plugin_max = max_varEff(plugin_table).set_index('ID')

    # Only variants scored by both pipelines can be compared.
    plugin_ids = set(plugin_max.index)
    package_ids = set(package_max.index)
    shared_ids = list(plugin_ids & package_ids)

    plugin_scores = plugin_max.loc[shared_ids, score_column]
    package_scores = package_max.loc[shared_ids, score_column]

    correlation = pearsonr(plugin_scores, package_scores)[0]
    assert correlation >= 0.95
Example #2
0
def test_predict_all_table(vcf_path):
    """Check the number of rows ``predict_all_table`` yields for a VCF.

    The ``delta_logit_psi`` column is expected to hold exactly
    ``len(variants) - 1`` entries.
    """
    splice_model = MMSplice()
    loader = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)

    table = predict_all_table(splice_model, loader,
                              pathogenicity=True,
                              splicing_efficiency=True)

    observed_rows = len(table['delta_logit_psi'])
    expected_rows = len(variants) - 1
    assert observed_rows == expected_rows
Example #3
0
def test_predict_all_table_exon_dataloader(vcf_path):
    """Check that an ``ExonDataset`` yields one prediction per exon row.

    The number of ``delta_logit_psi`` entries must equal the number of
    rows in the CSV at ``exon_file``. ``vcf_path`` is accepted but not
    used by this test.
    """
    splice_model = MMSplice()
    exon_table = pd.read_csv(exon_file)
    loader = ExonDataset(exon_file, fasta_file)

    table = predict_all_table(splice_model, loader,
                              pathogenicity=True,
                              splicing_efficiency=True)

    observed_rows = len(table['delta_logit_psi'])
    expected_rows = exon_table.shape[0]
    assert observed_rows == expected_rows
def test_vep_plugin():
    """Compare VEP-plugin scores against scores from the Python package.

    The dataloader is given the pickled exon interval tree, which is also
    passed as ``out_file``. Both prediction tables are reduced with
    ``max_varEff`` and indexed by ``ID``. On the shared IDs, the Pearson
    correlation of ``mmsplice_dlogitPsi`` must be at least 0.99.
    """
    # Not referenced below: the pickled interval tree is passed instead.
    gtf = 'tests/data/test.gtf'
    variants_path = 'tests/data/test.vcf.gz'
    genome_path = 'tests/data/hg19.nochr.chr17.fa'
    interval_tree_path = 'tests/data/test.pkl'  # pickle exon interval Tree
    score_column = 'mmsplice_dlogitPsi'

    loader = SplicingVCFDataloader(
        interval_tree_path,
        genome_path,
        variants_path,
        out_file=interval_tree_path,
        split_seq=False,
        overhang=(100, 100),
    )

    model_settings = {
        'exon_cut_l': 0,
        'exon_cut_r': 0,
        'acceptor_intron_cut': 6,
        'donor_intron_cut': 6,
        'acceptor_intron_len': 50,
        'acceptor_exon_len': 3,
        'donor_exon_len': 5,
        'donor_intron_len': 13,
    }
    splice_model = MMSplice(**model_settings)

    # Predictions computed directly by the Python package.
    package_table = predict_all_table(
        splice_model,
        loader,
        batch_size=1024,
        split_seq=False,
        assembly=True,
        pathogenicity=True,
        splicing_efficiency=True,
    )
    package_max = max_varEff(package_table).set_index('ID')

    # Predictions parsed from the VEP plugin output.
    plugin_table = read_vep(vep_output)
    plugin_max = max_varEff(plugin_table).set_index('ID')

    # Only variants scored by both pipelines can be compared.
    plugin_ids = set(plugin_max.index)
    package_ids = set(package_max.index)
    shared_ids = list(plugin_ids & package_ids)

    plugin_scores = plugin_max.loc[shared_ids, score_column]
    package_scores = package_max.loc[shared_ids, score_column]

    correlation = pearsonr(plugin_scores, package_scores)[0]
    assert correlation >= 0.99
Example #5
0
def test_predict_all_table(vcf_path):
    """Check the number of rows ``predict_all_table`` yields for a VCF.

    The model is built with explicit cut/length settings, and the
    ``mmsplice_dlogitPsi`` column is expected to hold exactly
    ``len(variants) - 1`` entries.
    """
    model_settings = {
        'exon_cut_l': 0,
        'exon_cut_r': 0,
        'acceptor_intron_cut': 6,
        'donor_intron_cut': 6,
        'acceptor_intron_len': 50,
        'acceptor_exon_len': 3,
        'donor_exon_len': 5,
        'donor_intron_len': 13,
    }
    splice_model = MMSplice(**model_settings)

    loader = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)
    table = predict_all_table(
        splice_model,
        loader,
        batch_size=1024,
        split_seq=False,
        assembly=True,
        pathogenicity=True,
        splicing_efficiency=True,
    )

    observed_rows = len(table['mmsplice_dlogitPsi'])
    expected_rows = len(variants) - 1
    assert observed_rows == expected_rows
Example #6
0
def writeMMSpliceToVcf(vcf_in, vcf_lowAF, vcf_out, gtf, fasta):
    """Run MMSplice on a VCF and hand per-variant annotations to the writer.

    Steps, in order:

    1. ``lowFrequencyVariants(vcf_in, vcf_lowAF)`` is called (presumably it
       writes a low-allele-frequency subset to ``vcf_lowAF`` -- confirm
       against that helper).
    2. MMSplice predictions are computed for ``vcf_lowAF``.
    3. For each variant ID, one annotation fragment per prediction row is
       built and the fragments are concatenated in row order.
    4. ``writeTempVCF(vcf_in, vcf_out, annotations)`` is called with the
       resulting ``{ID: annotation string}`` mapping.

    Args:
        vcf_in: Input VCF, passed to ``lowFrequencyVariants`` and
            ``writeTempVCF``.
        vcf_lowAF: VCF path passed to ``lowFrequencyVariants`` and then
            read by the dataloader.
        vcf_out: Output path passed to ``writeTempVCF``.
        gtf: Annotation file passed to the dataloader.
        fasta: Reference genome file passed to the dataloader.
    """
    lowFrequencyVariants(vcf_in, vcf_lowAF)

    # Dataloader that reads the variants from the filtered VCF.
    dl = SplicingVCFDataloader(gtf, fasta, vcf_lowAF, tissue_specific=False)

    model = MMSplice()

    # Predict for all variants and get the results back as a table.
    predictions = predict_all_table(model,
                                    dl,
                                    pathogenicity=True,
                                    splicing_efficiency=True,
                                    progress=True)

    # Collect the fragments per variant ID and join once at the end, rather
    # than growing each string by repeated concatenation.
    fragments_by_id = {}
    for row in predictions.itertuples():
        fragment = "".join((
            exon_sep[0],
            "exon:", row.exons, ",",
            "delta_logit_psi:", str(round(row.delta_logit_psi, 4)), ",",
            "pathogenicity:", str(round(row.pathogenicity, 4)),
            exon_sep[1],
        ))
        fragments_by_id.setdefault(row.ID, []).append(fragment)

    annotations = {
        variant_id: "".join(fragments)
        for variant_id, fragments in fragments_by_id.items()
    }

    writeTempVCF(vcf_in, vcf_out, annotations)
Example #7
0
                    action="store",
                    dest="vcf",
                    help="input gzipped vcf")
# Remaining command-line options; `parser` is created earlier in the script
# (presumably an argparse.ArgumentParser -- confirm above).
parser.add_argument("--fasta",
                    action="store",
                    dest="fasta",
                    help="reference genome fasta file")
parser.add_argument("--gtf", action="store", dest="gtf", help="gtf file")
parser.add_argument("--output",
                    action="store",
                    dest="output",
                    help="output file for MMSplice results")
args = parser.parse_args()

# Instantiate the MMSplice model with its default arguments.
model = MMSplice()

# Dataloader over the variants in the input VCF; only the three paths are
# passed, so every other dataloader option keeps its default.
dl = SplicingVCFDataloader(args.gtf, args.fasta, args.vcf)

# Run the predictions and get the results back as a table.
predictions = predict_all_table(model,
                                dl,
                                pathogenicity=True,
                                splicing_efficiency=True)

# Summarize with maximum effect size
predictionsMax = max_varEff(predictions)

# Write the summarized predictions to the output file, using the input VCF.
writeVCF(args.vcf, args.output, predictionsMax)