def test_splicing_vcf_loads_snps(vcf_path):
    """Check the sequences the dataloader yields for the SNP variants.

    The first ``len(snps)`` items pulled from the dataloader are compared
    against the expected reference (``inputs``) and mutated
    (``inputs_mut``) sequences listed below.
    """
    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)

    reference = (
        'CAAATCTTAAATTTACTTTATTTTAAAATGATAAAATGAAGTTGTCATTT'
        'TATAAACCTTTTAAAAAGATATATATATATGTTTTTCTAATGTGTTAAAG'
        'TTCATTGGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAG'
        'TACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGG'
        'TAAGTCAGCACAAGAGTGTATTAATTTGGGATTCCTATGATTATCTCCTA'
        'TGCAAATGAACAGAATTGACCTTACATACTAGGGAAGAAAAGACATGTC')
    # Identical to the reference except for the final base of the fourth
    # chunk (G -> C).
    mutated = (
        'CAAATCTTAAATTTACTTTATTTTAAAATGATAAAATGAAGTTGTCATTT'
        'TATAAACCTTTTAAAAAGATATATATATATGTTTTTCTAATGTGTTAAAG'
        'TTCATTGGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAG'
        'TACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGC'
        'TAAGTCAGCACAAGAGTGTATTAATTTGGGATTCCTATGATTATCTCCTA'
        'TGCAAATGAACAGAATTGACCTTACATACTAGGGAAGAAAAGACATGTC')
    expected_snps_seq = [{'seq': reference, 'alt_seq': mutated}]

    for case_idx in range(len(snps)):
        d = next(dl)
        # Debug output: the full item, then the exon interval bounds.
        print(d)
        exon_interval = d['metadata']['ExonInterval']
        print(exon_interval['start'])
        print(exon_interval['end'])
        expected = expected_snps_seq[case_idx]
        assert d['inputs']['seq'] == expected['seq']
        assert d['inputs_mut']['seq'] == expected['alt_seq']
Beispiel #2
0
def test_vep_plugin():
    """Compare Python-package predictions with the VEP plugin output.

    Both prediction tables are reduced with ``max_varEff`` and indexed by
    ``ID``. On the IDs present in both tables, the ``delta_logit_psi``
    columns must have a Pearson correlation of at least 0.95.
    """
    dl = SplicingVCFDataloader('tests/data/test.gtf',
                               'tests/data/hg19.nochr.chr17.fa',
                               'tests/data/test.vcf.gz')

    df_python = predict_all_table(MMSplice(),
                                  dl,
                                  pathogenicity=True,
                                  splicing_efficiency=True)
    python_max = max_varEff(df_python).set_index('ID')

    plugin_max = max_varEff(read_vep(vep_output)).set_index('ID')

    # Only variants scored by both implementations can be compared.
    plugin_ids = set(plugin_max.index)
    python_ids = set(python_max.index)
    shared_ids = list(plugin_ids & python_ids)

    plugin_scores = plugin_max.loc[shared_ids, 'delta_logit_psi']
    python_scores = python_max.loc[shared_ids, 'delta_logit_psi']

    assert pearsonr(plugin_scores, python_scores)[0] >= 0.95
Beispiel #3
0
def test_predict_all_table(vcf_path):
    """``predict_all_table`` must return ``len(variants) - 1`` rows."""
    model = MMSplice()
    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)

    df = predict_all_table(
        model, dl, pathogenicity=True, splicing_efficiency=True)

    n_predictions = len(df['delta_logit_psi'])
    assert n_predictions == len(variants) - 1
def test_vep_plugin():
    """Compare Python-package predictions with the VEP plugin output.

    The dataloader is given the pickled exon interval tree path both as its
    first argument and as ``out_file``. Predictions from both sources are
    reduced with ``max_varEff``, aligned on the shared ``ID`` values, and the
    ``mmsplice_dlogitPsi`` columns must correlate (Pearson) at 0.99 or above.
    """
    # NOTE(review): `gtf` is assigned but never used below — confirm whether
    # the dataloader was meant to receive it.
    gtf = 'tests/data/test.gtf'
    vcf = 'tests/data/test.vcf.gz'
    fasta = 'tests/data/hg19.nochr.chr17.fa'
    gtfIntervalTree = 'tests/data/test.pkl'  # pickle exon interval Tree

    dl = SplicingVCFDataloader(
        gtfIntervalTree,
        fasta,
        vcf,
        out_file=gtfIntervalTree,
        split_seq=False,
        overhang=(100, 100),
    )

    model_kwargs = {
        'exon_cut_l': 0,
        'exon_cut_r': 0,
        'acceptor_intron_cut': 6,
        'donor_intron_cut': 6,
        'acceptor_intron_len': 50,
        'acceptor_exon_len': 3,
        'donor_exon_len': 5,
        'donor_intron_len': 13,
    }
    model = MMSplice(**model_kwargs)

    df_python = predict_all_table(
        model,
        dl,
        batch_size=1024,
        split_seq=False,
        assembly=True,
        pathogenicity=True,
        splicing_efficiency=True,
    )
    python_max = max_varEff(df_python).set_index('ID')

    plugin_max = max_varEff(read_vep(vep_output)).set_index('ID')

    # Only variants scored by both implementations can be compared.
    plugin_ids = set(plugin_max.index)
    python_ids = set(python_max.index)
    shared_ids = list(plugin_ids & python_ids)

    plugin_scores = plugin_max.loc[shared_ids, 'mmsplice_dlogitPsi']
    python_scores = python_max.loc[shared_ids, 'mmsplice_dlogitPsi']

    assert pearsonr(plugin_scores, python_scores)[0] >= 0.99
Beispiel #5
0
def test_predict_all_table(vcf_path):
    """``predict_all_table`` must return ``len(variants) - 1`` rows.

    The model is built with explicit cut/length settings and the table is
    requested with ``split_seq=False`` and ``assembly=True``.
    """
    model_kwargs = {
        'exon_cut_l': 0,
        'exon_cut_r': 0,
        'acceptor_intron_cut': 6,
        'donor_intron_cut': 6,
        'acceptor_intron_len': 50,
        'acceptor_exon_len': 3,
        'donor_exon_len': 5,
        'donor_intron_len': 13,
    }
    model = MMSplice(**model_kwargs)

    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)
    df = predict_all_table(
        model,
        dl,
        batch_size=1024,
        split_seq=False,
        assembly=True,
        pathogenicity=True,
        splicing_efficiency=True,
    )

    n_predictions = len(df['mmsplice_dlogitPsi'])
    assert n_predictions == len(variants) - 1
Beispiel #6
0
def writeMMSpliceToVcf(vcf_in, vcf_lowAF, vcf_out, gtf, fasta):
    """Run MMSplice on ``vcf_lowAF`` and hand per-variant annotations to ``writeTempVCF``.

    Steps:
      1. ``lowFrequencyVariants(vcf_in, vcf_lowAF)`` is called first
         (presumably it writes the low-allele-frequency subset of ``vcf_in``
         to ``vcf_lowAF`` — confirm against its definition).
      2. Variants are loaded from ``vcf_lowAF`` and scored with a default
         ``MMSplice`` model.
      3. For every prediction row an annotation fragment of the form
         ``<exon_sep[0]>exon:<exons>,delta_logit_psi:<x>,pathogenicity:<y><exon_sep[1]>``
         is built (scores rounded to 4 decimals); fragments belonging to the
         same variant ``ID`` are concatenated in row order.
      4. The ``{ID: annotation}`` mapping is passed to
         ``writeTempVCF(vcf_in, vcf_out, mapping)``.

    Args:
        vcf_in: Path of the input VCF.
        vcf_lowAF: Path of the intermediate VCF read by the dataloader.
        vcf_out: Output path forwarded to ``writeTempVCF``.
        gtf: Annotation passed to ``SplicingVCFDataloader``.
        fasta: Reference genome passed to ``SplicingVCFDataloader``.
    """
    lowFrequencyVariants(vcf_in, vcf_lowAF)

    # Dataloader to load variants from the VCF.
    dl = SplicingVCFDataloader(gtf, fasta, vcf_lowAF, tissue_specific=False)

    # Default model.
    model = MMSplice()

    predictions = predict_all_table(model,
                                    dl,
                                    pathogenicity=True,
                                    splicing_efficiency=True,
                                    progress=True)

    # Collect fragments per variant ID and join once at the end, instead of
    # growing strings with repeated `+`. Local names no longer shadow the
    # builtins `dict` and `id`.
    fragments_by_id = {}
    for row in predictions.itertuples():
        fragment = (
            f"{exon_sep[0]}exon:{row.exons},"
            f"delta_logit_psi:{round(row.delta_logit_psi, 4)},"
            f"pathogenicity:{round(row.pathogenicity, 4)}{exon_sep[1]}"
        )
        fragments_by_id.setdefault(row.ID, []).append(fragment)

    annotations = {
        variant_id: "".join(fragments)
        for variant_id, fragments in fragments_by_id.items()
    }

    writeTempVCF(vcf_in, vcf_out, annotations)
Beispiel #7
0
from mmsplice.vcf_dataloader import SplicingVCFDataloader
from mmsplice import MMSplice, predict_save

# Annotation and reference genome inputs.
gtf, fasta = 'data/chr1.gtf.gz', 'data/chr1.fa'

# Variant sets to score and the CSV files their predictions are written to.
vcf_benign, vcf_pathog = (
    'data/clinvar_chr1_benign.vcf.gz',
    'data/clinvar_chr1_pathogenic.vcf.gz',
)
pred_benign, pred_pathogen = 'pred_benign.csv', 'pred_pathogen.csv'

# One dataloader per variant set, both with tissue_specific disabled.
dl_benign = SplicingVCFDataloader(
    gtf, fasta, vcf_benign, tissue_specific=False)
dl_pathogen = SplicingVCFDataloader(
    gtf, fasta, vcf_pathog, tissue_specific=False)

model = MMSplice()

# Score the benign set and save the table as CSV.
predict_save(
    model,
    dl_benign,
    output_csv=pred_benign,
    pathogenicity=True,
    splicing_efficiency=True,
)

predict_save(model,
from mmsplice import MMSplice, predict_save
from mmsplice.vcf_dataloader import SplicingVCFDataloader

# Annotation, reference genome and the two variant sets to score.
gtf = './data/chr1.gtf'
fasta = './data/chr1.fa'
vcf_ben = './data/clinvar_chr1_benign.vcf.gz'
vcf_pat = './data/clinvar_chr1_pathogenic.vcf.gz'

# The dataloaders must use the `vcf_ben` / `vcf_pat` paths defined above;
# `vcf_benign` / `vcf_pathogenic` are not defined in this script and would
# raise NameError.
ben_dl = SplicingVCFDataloader(gtf, fasta, vcf_ben, tissue_specific=False)
pat_dl = SplicingVCFDataloader(gtf, fasta, vcf_pat, tissue_specific=False)

model = MMSplice()

# Score each variant set and write its predictions to a CSV file.
predict_save(model,
             ben_dl,
             output_csv="ben_preds.csv",
             pathogenicity=True,
             splicing_efficiency=True)
predict_save(model,
             pat_dl,
             output_csv="pat_preds.csv",
             pathogenicity=True,
             splicing_efficiency=True)
def test_SplicingVCFDataloader__encode_batch_seq(vcf_path):
    """``_encode_batch_seq`` must encode 'ATT' as the 3x4 matrix below.

    Expected rows: ``[1, 0, 0, 0]`` for the first base and ``[0, 0, 0, 1]``
    for each of the two following bases.
    """
    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)

    batch = {'acceptor': np.array(['ATT'])}
    expected = np.array([
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
    ])

    encoded = dl._encode_batch_seq(batch)
    first_acceptor = encoded['acceptor'][0]
    np.testing.assert_array_equal(first_acceptor, expected)
Beispiel #10
0
# Data files (VCF paths are given without extension; suffixes are added below)
gtf = "data/chr1.gtf"
vcf_ben = "data/clinvar_chr1_benign"
vcf_pat = "data/clinvar_chr1_pathogenic"
fasta = "data/chr1.fa"

# Repeat the process for benign and pathogenic variants
for vcf, typ in zip([vcf_ben, vcf_pat], ["benign", "pathogenic"]):
    print(f"\n\n\nRunning MMSPlice for {typ}.\n\n\n")

    # Filtering VCF file
    print(f"Quality filtering has not been applied for {typ}.")

    # Commands are passed as argument lists (no shell) so paths are never
    # re-parsed by a shell, and check=True stops the script when bcftools
    # fails instead of continuing with a missing or stale VCF.
    # Step 1: `bcftools norm -m-both` (split multiallelic records).
    subprocess.run(
        ["bcftools", "norm", "-m-both",
         "-o", vcf + "_temp.vcf", vcf + ".vcf.gz"],
        check=True)
    # Step 2: `bcftools norm -f <fasta>` (normalize against the reference).
    subprocess.run(
        ["bcftools", "norm", "-f", fasta,
         "-o", vcf + ".vcf", vcf + "_temp.vcf"],
        check=True)
    vcf = vcf + '.vcf'

    # Run MMSplice on the normalized VCF and save predictions as CSV
    dl = SplicingVCFDataloader(gtf, fasta, vcf, tissue_specific=False)
    model = MMSplice()
    predict_save(model,
                 dl,
                 f"mmsplice_output/predictions_{typ}.csv",
                 pathogenicity=True,
                 splicing_efficiency=True)
def test_SplicingVCFDataloader__chech_chrom_annotation():
    """Check chromosome naming of the exons loaded for 'grch37'.

    Every chromosome name in ``1``..``22``, ``X``, ``Y`` and ``M`` must occur
    in ``dl.pr_exons.Chromosome``, and iterating the dataloader must yield at
    least one item.
    """
    dl = SplicingVCFDataloader('grch37', fasta_file, vcf_file)

    # range(1, 23): the upper bound is exclusive, so 23 is required for
    # chromosome 22 to be part of the check (range(1, 22) stopped at 21).
    chroms = {str(i) for i in range(1, 23)} | {'X', 'Y', 'M'}

    missing = chroms - set(dl.pr_exons.Chromosome)
    assert not missing
    assert sum(1 for i in dl) > 0
# Load the GTF as a headerless, tab-separated table.
df = pd.read_csv(gtf, sep='\t', header=None)

# Keep the rows whose column 8 contains the gene of interest.
g = df[8].str.contains(gene)
result = df[g]

# Persist the filtered annotation so it can be reused.
new_gtf = ''.join(('Homo_sapiens_', gene, '_all.GRCh37.75.gtf'))
result.to_csv(new_gtf, sep='\t', index=False, header=None)

# Dataloader that reads the variants from the VCF.
# Use tissue_specific=False for MMSplice and tissue_specific=True for
# MTSplice.
dl = SplicingVCFDataloader(
    new_gtf, fasta, vcf, tissue_specific=tissue_specificity)

# Model used for scoring.
model = MMSplice()

# Score every variant/exon pair.
predictions = predict_all_table(
    model, dl, pathogenicity=True, splicing_efficiency=True)

# Collapse to the maximum effect across all exons and save the table.
predictions = max_varEff(predictions)
predictions.to_csv(''.join(('mmsplice_', gene, '_', variants, '.csv')))
def test_SplicingVCFDataloader__read_exons(vcf_path):
    """``_read_exons`` must record the requested overhang on each exon row.

    Exons are read with ``overhang=(10, 20)``; the first row for exon
    ``ENSE00003559512`` must carry ``left_overhang == 10`` and
    ``right_overhang == 20``.
    """
    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)

    exons = dl._read_exons(gtf_file, overhang=(10, 20)).df
    is_target = exons['exon_id'] == 'ENSE00003559512'
    row = exons[is_target].iloc[0]

    assert row['left_overhang'] == 10
    assert row['right_overhang'] == 20
Beispiel #14
0
                    action="store",
                    dest="vcf",
                    help="input gzipped vcf")
parser.add_argument(
    "--fasta", action="store", dest="fasta",
    help="reference genome fasta file")
parser.add_argument(
    "--gtf", action="store", dest="gtf",
    help="gtf file")
parser.add_argument(
    "--output", action="store", dest="output",
    help="output file for MMSplice results")
args = parser.parse_args()

# Model used for scoring.
model = MMSplice()

# Dataloader built from the command-line paths with default options.
# Alternative configuration kept for reference:
#   SplicingVCFDataloader(gtf, fasta, vcf, encode=False, tissue_specific=False)
dl = SplicingVCFDataloader(args.gtf, args.fasta, args.vcf)

# Predict and keep the result as a DataFrame.
predictions = predict_all_table(
    model, dl, pathogenicity=True, splicing_efficiency=True)

# Summarize each variant by its maximum effect size.
predictionsMax = max_varEff(predictions)

# Write the summarized predictions, using the input VCF and output path.
writeVCF(args.vcf, args.output, predictionsMax)
Beispiel #15
0
from mmsplice import predict_save, MMSplice
from mmsplice.vcf_dataloader import SplicingVCFDataloader

# Dataloader for the rule's fasta and vcf inputs; 'grch38' is passed in the
# annotation position (presumably selecting a prebuilt annotation — confirm).
dl = SplicingVCFDataloader(
    'grch38',
    snakemake.input['fasta'],
    snakemake.input['vcf'],
)

# Score with the default model and save to the rule's 'result' output.
model = MMSplice()
predict_save(model, dl, output_csv=snakemake.output['result'])

def test_splicing_vcf_dataloader_prebuild_grch38(vcf_path):
    """Smoke test: building the dataloader with 'grch38' must not raise."""
    SplicingVCFDataloader('grch38', fasta_file, vcf_path)
def test_SplicingVCFDataloader__next__(vcf_path):
    """Check ``next(dl)`` for hand-crafted exon/variant rows.

    The dataloader's ``_generator`` is replaced with a single-row iterator,
    once with a right overhang of 100 and once with a right overhang of 0
    (and a correspondingly smaller ``End_exon``). In both cases the yielded
    ``inputs['seq']`` and ``inputs['mut_seq']`` must equal the expected
    reference and mutated sequences.
    """
    dl = SplicingVCFDataloader(gtf_file,
                               fasta_file,
                               vcf_path,
                               split_seq=False,
                               encode=False)

    def make_row(end_exon, right_overhang):
        # One exon/variant record in the shape the dataloader consumes; only
        # the exon end and the right overhang vary between the two cases.
        return {
            'left_overhang': 100,
            'right_overhang': right_overhang,
            'Chromosome': '17',
            'Start_exon': 41275934,
            'End_exon': end_exon,
            'Strand': '-',
            'exon_id': 'exon_id',
            'gene_id': 'gene_id',
            'gene_name': 'gene_name',
            'transcript_id': 'transcript_id',
            'variant': Variant('17', 41276033, 'C', ['G'])
        }

    # Leading part that is only expected with the 100-base right overhang.
    leading = (
        'CAAATCTTAAATTTACTTTATTTTAAAATGATAAAATGAAGTTGTCATTT'
        'TATAAACCTTTTAAAAAGATATATATATATGTTTTTCTAATGTGTTAAAG')
    # Part shared by both cases, in reference and mutated form (the mutated
    # form differs in the last base of its second chunk: G -> C).
    shared_ref = (
        'TTCATTGGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAG'
        'TACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGG'
        'TAAGTCAGCACAAGAGTGTATTAATTTGGGATTCCTATGATTATCTCCTA'
        'TGCAAATGAACAGAATTGACCTTACATACTAGGGAAGAAAAGACATGTC')
    shared_alt = (
        'TTCATTGGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAG'
        'TACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGC'
        'TAAGTCAGCACAAGAGTGTATTAATTTGGGATTCCTATGATTATCTCCTA'
        'TGCAAATGAACAGAATTGACCTTACATACTAGGGAAGAAAAGACATGTC')

    # (End_exon, right_overhang, expected sequences) for each case.
    cases = [
        (41276232, 100, {'seq': leading + shared_ref,
                         'alt_seq': leading + shared_alt}),
        (41276132, 0, {'seq': shared_ref,
                       'alt_seq': shared_alt}),
    ]

    for end_exon, right_overhang, expected_snps_seq in cases:
        dl._generator = iter([make_row(end_exon, right_overhang)])

        d = next(dl)

        assert d['inputs']['seq'] == expected_snps_seq['seq']
        assert d['inputs']['mut_seq'] == expected_snps_seq['alt_seq']
def test_splicing_vcf_loads_all(vcf_path):
    """Iterating the dataloader must yield ``len(variants) - 1`` items."""
    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)
    n_yielded = sum(1 for _ in dl)
    assert n_yielded == len(variants) - 1
Beispiel #19
0
####################################################################################################################
####################################################################################################################
# Get arguments passed from main code.
# Positional command-line arguments, in order: tumor, data_dir, lock_dir.
# sys.argv[0] is the script name, so it is sliced off directly (this replaces
# the np.delete([sys.argv], [0]).tolist() round trip and yields the same list).
args_keys = ['tumor', 'data_dir', 'lock_dir']
args_values = sys.argv[1:]
args = dict(zip(args_keys, args_values))

# Get vcf file name with full path.
# Plain string concatenation: data_dir must already end with a path separator.
vcf_file = args['data_dir'] + args['tumor'] + '.vcf'

# Define dataloader.
dl = SplicingVCFDataloader(GTF,
                           FastaFile,
                           vcf_file,
                           encode=False,
                           split_seq=True)
# dl = SplicingVCFDataloader(GTF, FastaFile, vcf_file)

# Run the MMSplice models on the tumor and keep the maximum effect per
# variant.
model = MMSplice()
pred = predict_all_table(model,
                         dl,
                         pathogenicity=True,
                         splicing_efficiency=True)
pred_max = max_varEff(pred)

# Save the results.
pred_max.to_csv(args['lock_dir'] + args['tumor'] + '.txt',
                index=False,
def test_splicing_vcf_loads_deletions(vcf_path):
    """Check the sequences the dataloader yields for the deletion variants.

    The first ``len(snps)`` items are consumed and ignored. The following
    ``len(deletions) - 1`` items are compared, in order, against the expected
    reference (``inputs``) and mutated (``inputs_mut``) sequences below.
    """
    dl = SplicingVCFDataloader(gtf_file, fasta_file, vcf_path)

    # Every expected case uses the same reference sequence.
    reference = (
        'TAACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAATA'
        'AATTATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTAGTC'
        'TGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCA'
        'AGTAAGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCC'
        'TTCATAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAAT')

    # Expected mutated sequence for each deletion, in dataloader order.
    mutated_seqs = [
        ('TAACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAATA'
         'AATTATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTAGTC'
         'TGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCG'
         'TAAGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCCTT'
         'CATAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAAT'),
        ('TAACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAATA'
         'AATTATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTAGTC'
         'TGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCG'
         'TAAGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCCTT'
         'CATAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAATTA'),
        # Disabled case (was commented out in the original list as well):
        # ('TAACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAATA'
        #  'AATTATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTAGTC'
        #  'TGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCT'
        #  'AGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCCTTCA'
        #  'TAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAATTA'),
        ('ATAACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAAT'
         'AAATTATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTATC'
         'TGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCA'
         'AGTAAGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCC'
         'TTCATAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAAT'),
        ('CATAACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAA'
         'TAAATTATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTTC'
         'TGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCA'
         'AGTAAGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCC'
         'TTCATAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAAT'),
        ('TAACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAATA'
         'AATTATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTAGCT'
         'GGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAA'
         'GTAAGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCCT'
         'TCATAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAAT'),
        ('TAACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAATA'
         'AATTATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTAGGT'
         'AAGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCCTTC'
         'ATAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAAT'),
    ]
    expected_snps_seq = [
        {'seq': reference, 'alt_seq': mutated} for mutated in mutated_seqs
    ]

    # Advance past the SNP items that precede the deletions.
    for _ in range(len(snps)):
        d = next(dl)

    for case_idx in range(len(deletions) - 1):
        d = next(dl)
        # Debug output: variant position, exon interval, then the full item.
        exon_interval = d['metadata']['ExonInterval']
        print('Variant position:', d['metadata']['variant']['POS'])
        print('Interval:', exon_interval['start'], '-', exon_interval['end'])
        print(d)
        expected = expected_snps_seq[case_idx]
        assert d['inputs']['seq'] == expected['seq']
        assert d['inputs_mut']['seq'] == expected['alt_seq']
def test_splicing_vcf_loads_insertions(vcf_path):
    """Check the sequences the dataloader yields for the insertion variants.

    The first ``len(snps) + len(deletions) - 1`` items are consumed and
    ignored. The following ``len(insertions)`` items are compared, in order,
    against the expected ``inputs['seq']`` / ``inputs['mut_seq']`` values.
    """
    dl = SplicingVCFDataloader(gtf_file,
                               fasta_file,
                               vcf_path,
                               split_seq=False,
                               encode=False)

    # Advance past the SNP and deletion items that precede the insertions.
    n_skipped = len(snps) + len(deletions) - 1
    for _ in range(n_skipped):
        d = next(dl)

    # Reference sequence shared by the first two expected cases.
    reference_a = (
        'CAAATCTTAAATTTACTTTATTTTAAAATGATAAAATGAAGTTGTCATTTTA'
        'TAAACCTTTTAAAAAGATATATATATATGTTTTTCTAATGTGTTAAAGTTCA'
        'TTGGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAA'
        'ATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGGTAAGTCAG'
        'CACAAGAGTGTATTAATTTGGGATTCCTATGATTATCTCCTATGCAAATGAA'
        'CAGAATTGACCTTACATACTAGGGAAGAAAAGACATGTC')
    # Reference sequence shared by the third and fourth expected cases.
    reference_b = (
        'TAACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAATAA'
        'ATTATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTAGTCTG'
        'GAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAGT'
        'AAGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCCTTCA'
        'TAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAAT')

    expected_snps_seq = [
        {
            'seq': reference_a,
            'alt_seq': (
                'CAAATCTTAAATTTACTTTATTTTAAAATGATAAAATGAAGTTGTCATTTTA'
                'TAAACCTTTTAAAAAGATATATATATATGTTTTTCTAATGTGTTAAAGTTCA'
                'TTGGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAA'
                'ATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGCATCTGGTA'
                'AGTCAGCACAAGAGTGTATTAATTTGGGATTCCTATGATTATCTCCTATGCA'
                'AATGAACAGAATTGACCTTACATACTAGGGAAGAAAAGA'),
        },
        {
            'seq': reference_a,
            'alt_seq': (
                'CAAATCTTAAATTTACTTTATTTTAAAATGATAAAATGAAGTTGTCATTTTA'
                'TAAACCTTTTAAAAAGATATATATATATGTTTTTCTAATGTGTTAAAGTTCA'
                'TTGGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAA'
                'ATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGTGGTAAGTC'
                'AGCACAAGAGTGTATTAATTTGGGATTCCTATGATTATCTCCTATGCAAATG'
                'AACAGAATTGACCTTACATACTAGGGAAGAAAAGACATG'),
        },
        {
            'seq': reference_b,
            'alt_seq': (
                'TAACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAATAAAT'
                'TATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTAGTTTCTG'
                'GAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAGT'
                'AAGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCCTTCA'
                'TAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAAT'),
        },
        {
            'seq': reference_b,
            'alt_seq': (
                'ACAGCTCAAAGTTGAACTTATTCACTAAGAATAGCTTTATTTTTAAATAAAT'
                'TATTGAGCCTCATTTATTTTCTTTTTCTCCCCCCCTACCCTGCTTTAGTCTG'
                'GAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAGT'
                'AAGTTTGAATGTGTTATGTGGCTCCATTATTAGCTTTTGTTTTTGTCCTTCA'
                'TAACCCAGGAAACACCTAACTTTATAGAAGCTTTACTTTCTTCAAT'),
        },
        {
            'seq': (
                'CAAATCTTAAATTTACTTTATTTTAAAATGATAAAATGAAGTTGT'
                'CATTTTATAAACCTTTTAAAAAGATATATATATATGTTTTTCTAATGTGTTA'
                'AAGTTCATTGGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAA'
                'GTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGGT'
                'AAGTCAGCACAAGAGTGTATTAATTTGGGATTCCTATGATTATCTCCTATGC'
                'AAATGAACAGAATTGACCTTACATACTAGGGAAGAAAAGACATGTC'),
            'alt_seq': (
                'CAAATCTTAAATTTACTTTATTTTAAAATGATAAAATGAAG'
                'TTGTCATTTTATAAACCTTTTAAAAAGATATATATATATGTTTTTCTAATGT'
                'GTTAAAGAGTTCATTGGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTT'
                'GAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCA'
                'TCTGGTAAGTCAGCACAAGAGTGTATTAATTTGGGATTCCTATGATTATCTC'
                'CTATGCAAATGAACAGAATTGACCTTACATACTAGGGAAGAAAAGACATGTC'),
        },
    ]

    for case_idx in range(len(insertions)):
        d = next(dl)

        # Debug output: the full item, then the exon bounds.
        print(d)
        exon = d['metadata']['exon']
        print(exon['start'])
        print(exon['end'])

        expected = expected_snps_seq[case_idx]
        assert d['inputs']['seq'] == expected['seq']
        assert d['inputs']['mut_seq'] == expected['alt_seq']