def test_add_info_table():
    """Add an INFO table called 'test' and compare with the expected fixture."""
    source_path = os.path.join(HERE, 'data/test.manta.vcf')
    expected_path = os.path.join(HERE, 'data/test.info.added.manta.vcf')
    vcf = viola.read_vcf(source_path)
    vcf_expected = viola.read_vcf(expected_path)

    # The svlen table, with its columns relabelled, is used as the new table.
    new_table = vcf.svlen
    new_table.columns = ['id', 'value_idx', 'test']
    vcf.add_info_table('test', new_table, 1, 'Integer', 'test info')

    assert_vcf_equal(vcf, vcf_expected)
def test_drop_by_id():
    """drop_by_id is exercised with a single ID and with a list of IDs."""
    vcf = viola.read_vcf(os.path.join(HERE, 'data/test.manta.vcf'))
    expected_single = viola.read_vcf(
        os.path.join(HERE, 'data/test.dropped.manta.vcf'))
    expected_multiple = viola.read_vcf(
        os.path.join(HERE, 'data/test.dropped2.manta.vcf'))

    # Both drops are performed on the same source object before comparing.
    dropped_single = vcf.drop_by_id('test1')
    dropped_multiple = vcf.drop_by_id(['test2', 'test3'])

    assert_vcf_equal(dropped_single, expected_single)
    assert_vcf_equal(dropped_multiple, expected_multiple)
def generate_feature_matrix(input_dir, input_files, input_files_id, format_,
                            caller, svtype_col_name, as_breakpoint,
                            definitions, output):
    """
    Generate feature matrix from VCF or BEDPE files.
    """
    # Exactly one input source is supported: either a directory or a
    # comma-separated list of files. When both or neither are supplied the
    # function returns without doing anything (unchanged behaviour).
    if (input_dir is None) == (input_files is None):
        return

    if format_ == 'bedpe':
        if input_files is None:
            data = viola.read_bedpe_multi(input_dir,
                                          svtype_col_name=svtype_col_name)
        else:
            ls_bedpe = [
                viola.read_bedpe(path, svtype_col_name=svtype_col_name)
                for path in input_files.split(',')
            ]
            # Without explicit IDs the files are named by their position.
            if input_files_id is None:
                ls_names = range(len(ls_bedpe))
            else:
                ls_names = input_files_id.split(',')
            data = viola.MultiBedpe(ls_bedpe, ls_names)
    else:
        # Any format other than 'bedpe' is read as VCF.
        if input_files is None:
            data = viola.read_vcf_multi(input_dir,
                                        variant_caller=caller,
                                        as_breakpoint=as_breakpoint)
        else:
            ls_vcf = [
                viola.read_vcf(path, variant_caller=caller)
                for path in input_files.split(',')
            ]
            if as_breakpoint:
                ls_vcf = [vcf.breakend2breakpoint() for vcf in ls_vcf]
            if input_files_id is None:
                ls_names = range(len(ls_vcf))
            else:
                ls_names = input_files_id.split(',')
            # Vcf objects belong in a MultiVcf container; the previous code
            # wrapped them in MultiBedpe.
            data = viola.MultiVcf(ls_vcf, ls_names)

    result = data.classify_manual_svtype(definitions=definitions)
    result.to_csv(output, sep='\t')
def test_merge3():
    """Merge delly and manta calls with integration; check count and IDs."""
    manta = viola.read_vcf(StringIO(DATA1), variant_caller='manta')
    delly = viola.read_vcf(StringIO(DATA2), variant_caller='delly')

    # delly is listed first in the merge input.
    result = viola.merge([delly, manta],
                         mode='confidence_intervals',
                         integration=True)

    expected_ids = {'manta_M1', 'delly_MD1', 'delly_MDL1', 'delly_D1'}
    assert result.sv_count == 4
    assert result.get_ids() == expected_ids
def test_merge():
    """Run Vcf.merge over four caller outputs; passes if nothing raises."""
    manta = viola.read_vcf(os.path.join(HERE, 'data/test.merge.manta.vcf'),
                           variant_caller='manta')
    delly = viola.read_vcf(os.path.join(HERE, 'data/test.merge.delly.vcf'),
                           variant_caller='delly')
    lumpy = viola.read_vcf(os.path.join(HERE, 'data/test.merge.lumpy.vcf'),
                           variant_caller='lumpy')
    gridss = viola.read_vcf(os.path.join(HERE, 'data/test.merge.gridss.vcf'),
                            variant_caller='gridss')

    caller_names = ['manta', 'delly', 'lumpy', 'gridss']
    inputs = [manta, delly, lumpy, gridss]
    # The merged object is not inspected; the call itself is the test.
    _ = manta.merge(threshold=100,
                    ls_caller_names=caller_names,
                    ls_vcf=inputs)
def test_merge2():
    """Merge manta then gridss breakpoints; keep calls seen by >1 caller."""
    gridss_raw = viola.read_vcf(StringIO(DATA1), variant_caller='gridss')
    gridss = gridss_raw.breakend2breakpoint()
    manta_raw = viola.read_vcf(StringIO(DATA2), variant_caller='manta')
    manta = manta_raw.breakend2breakpoint()

    merged = viola.merge([manta, gridss],
                         mode='confidence_intervals',
                         integration=True)
    result = merged.filter('supportingcallercount > 1')

    expected_ids = {
        'manta_MantaDUP:TANDEM:88695:0:1:0:0:0',
        'manta_viola_breakpoint:0',
        'manta_MantaDUP:TANDEM:93040:0:1:0:0:0',
        'manta_viola_breakpoint:1',
        'manta_viola_breakpoint:2',
    }
    assert result.sv_count == 5
    assert result.get_ids() == expected_ids
def test_merge1():
    """Merge gridss then manta breakpoints; keep calls seen by >1 caller."""
    gridss_raw = viola.read_vcf(StringIO(DATA1), variant_caller='gridss')
    gridss = gridss_raw.breakend2breakpoint()
    manta_raw = viola.read_vcf(StringIO(DATA2), variant_caller='manta')
    manta = manta_raw.breakend2breakpoint()

    merged = viola.merge([gridss, manta],
                         mode='confidence_intervals',
                         integration=True)
    result = merged.filter('supportingcallercount > 1')

    expected_ids = {
        'gridss_viola_breakpoint:0',
        'gridss_viola_breakpoint:1',
        'gridss_viola_breakpoint:6',
        'gridss_viola_breakpoint:7',
        'gridss_viola_breakpoint:8',
    }
    assert result.sv_count == 5
    assert result.get_ids() == expected_ids
def test_copy():
    """A copy of a Vcf object must compare equal to its source."""
    # (relative fixture path, extra keyword arguments for read_vcf)
    cases = (
        ('../io/data/test.manta.vcf', {}),
        ('../io/data/test.delly.vcf', {'variant_caller': 'delly'}),
        ('../io/data/test.lumpy.vcf', {'variant_caller': 'lumpy'}),
        ('../io/data/test.gridss.vcf', {'variant_caller': 'gridss'}),
    )

    # Every fixture is loaded before any comparison takes place.
    loaded = [
        viola.read_vcf(os.path.join(HERE, relative_path), **read_kwargs)
        for relative_path, read_kwargs in cases
    ]
    for vcf in loaded:
        assert_vcf_equal(vcf, vcf.copy())
class TestToVcf:
    """Check `to_vcf_like` output for records read with variant_caller='lumpy'.

    `body` holds the input records and `expected_out` the table that
    `to_vcf_like` is expected to return; both are combined with the
    module-level `HEADER`.

    NOTE(review): the fields inside the two literals are separated by single
    spaces in this view while `test_to_vcf_like` parses with sep='\t' --
    confirm the literals against the original fixture before relying on them.
    """
    #manta_path = os.path.join(HERE, 'data/manta1.inv.vcf')
    #result = viola.read_vcf(manta_path)
    # Input records appended to HEADER before parsing.
    body = """chr10 39984191 43 N <INV> . . SVTYPE=INV;STRANDS=++:4,--:2;SVLEN=176;END=39984367;CIPOS=-8,11;CIEND=-17,31;CIPOS95=-3,6;CIEND95=-1,15;IMPRECISE;SU=6;PE=6;SR=0 GT:SU:PE:SR ./.:3:3:0 ./.:3:3:0 chr10 121422696 788 N <DEL> . . SVTYPE=DEL;STRANDS=+-:8;SVLEN=-21;END=121422717;CIPOS=-9,8;CIEND=-9,8;CIPOS95=0,0;CIEND95=0,0;SU=8;PE=0;SR=8 GT:SU:PE:SR ./.:8:0:8 ./.:0:0:0 chr14 45812810 7294_1 N [chr14:65332314[N . . SVTYPE=BND;STRANDS=--:10;EVENT=7294;MATEID=7294_2;CIPOS=-7,6;CIEND=-7,6;CIPOS95=0,0;CIEND95=0,0;SU=10;PE=0;SR=10 GT:SU:PE:SR ./.:10:0:10 ./.:0:0:0 chr14 65332314 7294_2 N [chr14:45812810[N . . SVTYPE=BND;STRANDS=--:10;SECONDARY;EVENT=7294;MATEID=7294_1;CIPOS=-7,6;CIEND=-7,6;CIPOS95=0,0;CIEND95=0,0;SU=10;PE=0;SR=10 GT:SU:PE:SR ./.:10:0:10 ./.:0:0:0 chr15 60882465 8801 N <DUP> . . SVTYPE=DUP;STRANDS=-+:41;SVLEN=3099777;END=63982242;CIPOS=-3,1;CIEND=-3,2;CIPOS95=0,0;CIEND95=0,0;SU=41;PE=19;SR=22 GT:SU:PE:SR ./.:41:19:22 ./.:0:0:0 """
    # Expected records: the INV record 43 appears as 43_1 / 43_2 with an
    # added SUORG value, and FILTER is PASS on every row.
    expected_out = """chr10 39984191 43_1 N <INV> . PASS SVTYPE=INV;STRANDS=++:4;EVENT=43;SVLEN=176;END=39984367;CIPOS=-8,11;CIEND=-17,31;CIPOS95=-3,6;CIEND95=-1,15;IMPRECISE;SU=4;PE=6;SR=0;SUORG=6 GT:SU:PE:SR ./.:3:3:0 ./.:3:3:0 chr10 39984191 43_2 N <INV> . PASS SVTYPE=INV;STRANDS=--:2;EVENT=43;SVLEN=176;END=39984367;CIPOS=-8,11;CIEND=-17,31;CIPOS95=-3,6;CIEND95=-1,15;IMPRECISE;SU=2;PE=6;SR=0;SUORG=6 GT:SU:PE:SR ./.:3:3:0 ./.:3:3:0 chr10 121422696 788 N <DEL> . PASS SVTYPE=DEL;STRANDS=+-:8;SVLEN=-21;END=121422717;CIPOS=-9,8;CIEND=-9,8;CIPOS95=0,0;CIEND95=0,0;SU=8;PE=0;SR=8 GT:SU:PE:SR ./.:8:0:8 ./.:0:0:0 chr14 45812810 7294_1 N [chr14:65332314[N . PASS SVTYPE=BND;STRANDS=--:10;EVENT=7294;MATEID=7294_2;CIPOS=-7,6;CIEND=-7,6;CIPOS95=0,0;CIEND95=0,0;SU=10;PE=0;SR=10 GT:SU:PE:SR ./.:10:0:10 ./.:0:0:0 chr14 65332314 7294_2 N [chr14:45812810[N . 
PASS SVTYPE=BND;STRANDS=--:10;SECONDARY;EVENT=7294;MATEID=7294_1;CIPOS=-7,6;CIEND=-7,6;CIPOS95=0,0;CIEND95=0,0;SU=10;PE=0;SR=10 GT:SU:PE:SR ./.:10:0:10 ./.:0:0:0 chr15 60882465 8801 N <DUP> . PASS SVTYPE=DUP;STRANDS=-+:41;SVLEN=3099777;END=63982242;CIPOS=-3,1;CIEND=-3,2;CIPOS95=0,0;CIEND95=0,0;SU=41;PE=19;SR=22 GT:SU:PE:SR ./.:41:19:22 ./.:0:0:0 """
    # Parsed once at class-definition time and shared by the test method.
    b = StringIO(HEADER + body)
    result = viola.read_vcf(b, variant_caller='lumpy')

    def test_to_vcf_like(self):
        """Compare `to_vcf_like()` with `expected_out` parsed as a DataFrame."""
        df_vcf = self.result.to_vcf_like()
        # Column names are taken from the result so only the values are compared.
        df_expected = pd.read_csv(StringIO(self.expected_out), index_col=False, names=df_vcf.columns, sep='\t')
        pd.testing.assert_frame_equal(df_vcf, df_expected)
def test_classify_manual_svtype():
    """Classify two identical breakpoint VCFs and check table and counts."""
    first = viola.read_vcf(os.path.join(HERE, 'data/manta1.vcf'))
    second = first.copy()
    first = first.breakend2breakpoint()
    second = second.breakend2breakpoint()
    multi_vcf = viola.MultiVcf([first, second], ['vcf1', 'vcf2'])

    ls_conditions = [
        small_del, large_del, small_dup, large_dup, small_inv, tra
    ]
    ls_names = [
        'small_del', 'large_del', 'small_dup', 'large_dup', 'small_inv', 'tra'
    ]
    result = multi_vcf.classify_manual_svtype(ls_conditions=ls_conditions,
                                              ls_names=ls_names)

    # Per-record classification stored on the MultiVcf object.
    observed_table = multi_vcf.manual_sv_type
    observed_table.set_index('id', inplace=True)
    expected_table = pd.read_csv(StringIO(data_expected),
                                 sep='\t',
                                 names=('id', 'value_idx', 'manual_sv_type'))
    expected_table.set_index('id', inplace=True)
    pd.testing.assert_frame_equal(observed_table,
                                  expected_table,
                                  check_like=True)

    # Returned count matrix: one row per input, one column per class
    # plus the trailing 'others' column.
    counts = [2, 3, 1, 0, 2, 2, 1]
    result_expected = pd.DataFrame(
        [list(counts), list(counts)],
        index=pd.Index(['vcf1', 'vcf2'], name='patients'),
        columns=pd.Index(ls_names + ['others'], name='manual_sv_type'),
    )
    pd.testing.assert_frame_equal(result, result_expected)
class TestReadVcfLumpy:
    """Check the tables produced when `lumpy_body_inv_buf` is read as lumpy.

    NOTE(review): the rows of the CSV literals below are separated by single
    spaces in this view; `pd.read_csv` needs one row per line -- confirm
    against the original file.
    """
    # Parsed once at class-definition time from a module-level buffer.
    vcf_lumpy_inv = viola.read_vcf(lumpy_body_inv_buf, variant_caller='lumpy')

    def test_read_vcf_lumpy_inv(self):
        """Compare the positions, strands, su and suorg tables with expected frames."""
        vcf = self.vcf_lumpy_inv
        # assert positions table equal
        df_pos_expected = pd.read_csv(
            StringIO(
                """id,chrom1,pos1,chrom2,pos2,strand1,strand2,ref,alt,qual,svtype 43_1,chr10,39984191,chr10,39984367,+,+,N,<INV>,None,INV 43_2,chr10,39984192,chr10,39984367,-,-,N,<INV>,None,INV"""))
        # The CSV carries the text 'None'; overwrite the column with real None values.
        df_pos_expected['qual'] = None
        pd.testing.assert_frame_equal(vcf.positions, df_pos_expected)
        # assert strands table equal
        df_info_strands_expected = pd.read_csv(
            StringIO("""id,value_idx,strands 43_1,0,++:4 43_2,0,--:2"""))
        pd.testing.assert_frame_equal(vcf.strands, df_info_strands_expected)
        # assert su table equal
        df_info_su_expected = pd.read_csv(
            StringIO("""id,value_idx,su 43_1,0,4 43_2,0,2"""))
        pd.testing.assert_frame_equal(vcf.su, df_info_su_expected)
        # assert suorg table equal
        df_info_suorg_expected = pd.read_csv(
            StringIO("""id,value_idx,suorg 43_1,0,6 43_2,0,6"""))
        pd.testing.assert_frame_equal(vcf.suorg, df_info_suorg_expected)
def test_replace_svid():
    """replace_svid accepts scalar IDs as well as equal-length lists of IDs."""
    vcf = viola.read_vcf(StringIO(HEADER + body))
    vcf_expected1 = viola.read_vcf(StringIO(HEADER + body_expected1))
    vcf_expected2 = viola.read_vcf(StringIO(HEADER + body_expected2))

    # (IDs to replace, replacement IDs, expected result)
    cases = (
        ('test1', 'a', vcf_expected1),
        (['test1'], ['a'], vcf_expected1),
        (['test2', 'test3'], ['a', 'b'], vcf_expected2),
    )
    for old_ids, new_ids, expected in cases:
        # Each case starts from a fresh copy because replace_svid is called
        # for its in-place effect.
        renamed = vcf.copy()
        renamed.replace_svid(old_ids, new_ids)
        assert_vcf_equal(renamed, expected)
def test_breakend2breakpoint():
    """breakend2breakpoint on lumpy input must match a patched expectation."""
    vcf = viola.read_vcf(StringIO(HEADER + body), variant_caller='lumpy')
    vcf_expected = viola.read_vcf(StringIO(HEADER_expected + body_expected),
                                  variant_caller='lumpy')

    # Patch the expected object: row 3 is marked as INV in both the
    # positions and svtype tables.
    expected_positions = vcf_expected.get_table('positions')
    expected_positions.loc[3, 'svtype'] = 'INV'
    expected_svtype = vcf_expected.get_table('svtype')
    expected_svtype.iloc[3, 2] = 'INV'
    # The last two infos_meta rows carry swapped index labels (18, 17).
    expected_infos_meta = vcf_expected.get_table('infos_meta')
    expected_infos_meta.index = list(range(17)) + [18, 17]

    patched_tables = {
        'positions': expected_positions,
        'svtype': expected_svtype,
        'infos_meta': expected_infos_meta,
    }
    for table_name, table in patched_tables.items():
        vcf_expected._odict_alltables[table_name] = table

    vcf_result = vcf.breakend2breakpoint()
    assert_vcf_equal(vcf_result, vcf_expected)
def test_filter_by_id(self):
    """filter_by_id(['test1', 'test3']) must equal a VCF with only those records."""
    # Expected records, appended to the module-level HEADER.
    body_expected = """chr1 82550461 test1 G <DEL> . MinSomaticScore END=82554225;SVTYPE=DEL;SVLEN=-3764;IMPRECISE;CIPOS=-51,52;CIEND=-51,52;SOMATIC;SOMATICSCORE=10 PR:SR 21,0:10,0 43,4:15,3 chr1 60567906 test3 T <DEL> . MinSomaticScore END=60675940;SVTYPE=DEL;SVLEN=-108034;CIPOS=-44,44;CIEND=-38,39;SOMATIC;SOMATICSCORE=18 PR 23,0 44,6 """
    expected = viola.read_vcf(StringIO(HEADER + body_expected))
    wanted_ids = ['test1', 'test3']
    filtered = self.obj.filter_by_id(wanted_ids)
    viola.testing.assert_vcf_equal(expected, filtered)
def test_get_table():
    """get_table works for 'positions' and raises for an unknown name."""
    vcf_path = os.path.join(HERE, 'data/test.manta.vcf')
    data = viola.read_vcf(vcf_path)

    # Fetching an existing table; the returned frame is not inspected.
    _ = data.get_table('positions')

    # An unknown table name raises TableNotFoundError.
    with pytest.raises(TableNotFoundError):
        data.get_table("notexists")
def vcf2bedpe(caller, info, filter_, format_, vcf):
    """
    Convert a VCF file into a BEDPE file.
    A VCF argument is the path to the input VCF file.
    """
    # A text stream is read fully into memory first; anything else is
    # handed to read_vcf unchanged.
    source = StringIO(vcf.read()) if isinstance(vcf, TextIOWrapper) else vcf
    vcf_obj = viola.read_vcf(source, variant_caller=caller)

    # `info` is a comma-separated list of INFO names; they are lower-cased
    # before being passed on.
    if info is None:
        ls_info_lower = []
    else:
        ls_info_lower = [name.lower() for name in info.split(',')]

    bedpe_like = vcf_obj.to_bedpe_like(custom_infonames=ls_info_lower,
                                       add_filters=filter_,
                                       add_formats=format_)
    click.echo(bedpe_like.to_string(index=None))
def test_merge():
    """Merge four caller outputs by confidence intervals and check the IDs."""
    manta = viola.read_vcf(os.path.join(HERE, 'data/test.merge.manta.vcf'),
                           variant_caller='manta')
    delly = viola.read_vcf(os.path.join(HERE, 'data/test.merge.delly.vcf'),
                           variant_caller='delly')
    lumpy = viola.read_vcf(os.path.join(HERE, 'data/test.merge.lumpy.vcf'),
                           variant_caller='lumpy')
    gridss = viola.read_vcf(os.path.join(HERE, 'data/test.merge.gridss.vcf'),
                            variant_caller='gridss')

    # The distance matrix is built only to exercise the helper; its value
    # is not checked.
    tmp_vcf = viola.TmpVcfForMerge([manta, delly, gridss, lumpy],
                                   ['manta', 'delly', 'gridss', 'lumpy'])
    _ = manta._generate_distance_matrix_by_confidence_intervals(tmp_vcf)

    result = viola.merge([manta, delly, lumpy, gridss],
                         mode='confidence_intervals')

    expected_ids = {
        'manta_M1', 'manta_MD1', 'manta_ML1', 'manta_MG1', 'manta_MDL1',
        'manta_MDG1', 'manta_MLG1', 'manta_MDLG1o', 'delly_D1', 'delly_DL1',
        'delly_DG1', 'delly_DLG1', 'lumpy_L1', 'lumpy_LG1', 'gridss_G1o'
    }
    assert result.get_ids() == expected_ids
class TestWriteVcf:
    """Write each caller's fixture with `to_vcf` and compare with a validation file.

    Inputs are located relative to `HERE`; output and validation paths are
    plain relative paths and therefore depend on the working directory.
    """
    # Input fixtures, one per caller.
    manta_path = os.path.join(HERE, 'data/test.manta.vcf')
    lumpy_path = os.path.join(HERE, 'data/test.lumpy.vcf')
    delly_path = os.path.join(HERE, 'data/test.delly.vcf')
    gridss_path = os.path.join(HERE, 'data/test.gridss.vcf')
    # Parsed once at class-definition time and shared by the tests.
    vcf_manta = viola.read_vcf(manta_path)
    vcf_lumpy = viola.read_vcf(lumpy_path, variant_caller='lumpy')
    vcf_delly = viola.read_vcf(delly_path, variant_caller='delly')
    vcf_gridss = viola.read_vcf(gridss_path, variant_caller='gridss')

    def test_write_vcf_manta(self):
        """Manta output must be identical to its validation file."""
        written = 'tests/io/output/write_vcf_manta.vcf'
        reference = 'tests/io/data/test.manta.validation.vcf'
        self.vcf_manta.to_vcf(written)
        assert filecmp.cmp(written, reference)

    def test_write_vcf_lumpy(self):
        """Lumpy output must be identical to its validation file."""
        written = 'tests/io/output/write_vcf_lumpy.vcf'
        reference = 'tests/io/data/test.lumpy.validation.vcf'
        self.vcf_lumpy.to_vcf(written)
        assert filecmp.cmp(written, reference)

    def test_write_vcf_delly(self):
        """Delly output must be identical to its validation file."""
        written = 'tests/io/output/write_vcf_delly.vcf'
        reference = 'tests/io/data/test.delly.validation.vcf'
        self.vcf_delly.to_vcf(written)
        assert filecmp.cmp(written, reference)

    def test_write_vcf_gridss(self):
        """GRIDSS output must be identical to its validation file."""
        written = 'tests/io/output/write_vcf_gridss.vcf'
        reference = 'tests/io/data/test.gridss.validation.vcf'
        self.vcf_gridss.to_vcf(written)
        assert filecmp.cmp(written, reference)

    def test_write_info(self):
        """Writing with onlyinfo=True must match the info-only reference file."""
        written = 'tests/io/output/write_info.vcf'
        reference = 'tests/io/data/test_info.vcf'
        self.vcf_manta.to_vcf(written, onlyinfo=True)
        assert filecmp.cmp(written, reference)
class TestAppendInfos:
    """Check that `append_infos` adds an svlen column to the positions table."""
    manta_path = os.path.join(HERE, 'data/test.manta.vcf')
    # Parsed once at class-definition time and shared by the test.
    reader = viola.read_vcf(manta_path)
    positions = reader.get_table('positions')

    def test_append_svlen(self):
        """The appended column is named 'svlen_0' and is not entirely null."""
        # Nothing is checked when the fixture has no svlen table.
        if 'svlen' not in self.reader.table_list:
            return
        appended = viola.Vcf.append_infos(self.reader,
                                          self.positions,
                                          ls_tablenames=['svlen'])
        last_column = appended.columns[-1]
        assert last_column == 'svlen_0'
        assert appended['svlen_0'].notnull().any()
class TestFilters:
    """Exercise the filtering helpers on a four-record fixture (test1..test4).

    NOTE(review): the fields of the literals below are separated by single
    spaces in this view -- confirm them against the original tab-separated
    fixture.
    """
    # Records appended to the module-level HEADER.
    body = """chr1 82550461 test1 G <DEL> . MinSomaticScore END=82554225;SVTYPE=DEL;SVLEN=-3764;IMPRECISE;CIPOS=-51,52;CIEND=-51,52;SOMATIC;SOMATICSCORE=10 PR:SR 21,0:10,0 43,4:15,3 chr1 22814216 test2 T <INV> . MinSomaticScore END=92581131;SVTYPE=INV;SVLEN=69766915;IMPRECISE;CIPOS=-51,51;CIEND=-89,90;SOMATIC;SOMATICSCORE=11;INV5 PR 24,0 35,5 chr1 60567906 test3 T <DEL> . MinSomaticScore END=60675940;SVTYPE=DEL;SVLEN=-108034;CIPOS=-44,44;CIEND=-38,39;SOMATIC;SOMATICSCORE=18 PR 23,0 44,6 chr1 69583190 test4 T <DEL> . PASS END=69590947;SVTYPE=DEL;SVLEN=-7757;IMPRECISE;CIPOS=-123,123;CIEND=-135,136;SOMATIC;SOMATICSCORE=47 PR 21,0 20,12 """
    data = HEADER + body
    b = StringIO(data)
    # Parsed once at class-definition time and shared by every test.
    obj = viola.read_vcf(b)

    def test_filter_infos(self):
        """`_filter_infos` returns the set of IDs satisfying an INFO comparison."""
        result_svlen = self.obj._filter_infos('svlen', 0, operator="<", threshold=10000)
        result_svtype = self.obj._filter_infos('svtype', 0, operator='==', threshold='INV')
        assert result_svlen == {'test1', 'test3', 'test4'}
        assert result_svtype == {'test2'}

    def test_filter_infos_flag(self):
        """`_filter_infos_flag` selects IDs with the flag, or without it when exclude=True."""
        result_imprecise = self.obj._filter_infos_flag('imprecise', exclude=False)
        result_imprecise_ex = self.obj._filter_infos_flag('imprecise', exclude=True)
        assert result_imprecise == {'test1', 'test2', 'test4'}
        assert result_imprecise_ex == {'test3'}

    def test_filter_filters(self):
        """`_filter_filters` selects IDs by FILTER value; exclude=True inverts the selection."""
        result_filter = self.obj._filter_filters('PASS', exclude=False)
        result_filter_ex = self.obj._filter_filters('PASS', exclude=True)
        assert result_filter == {'test4'}
        assert result_filter_ex == {'test1', 'test2', 'test3'}

    def test_filter_formats(self):
        """`_filter_formats` is called for sample 'mouse1_T', FORMAT 'PR', value index 1, '> 5'."""
        result_format = self.obj._filter_formats('mouse1_T', 'PR', 1, '>', 5)
        assert result_format == {'test3', 'test4'}

    def test_filter_by_id(self):
        """`filter_by_id` must equal a VCF holding only the test1 and test3 records."""
        body_expected = """chr1 82550461 test1 G <DEL> . MinSomaticScore END=82554225;SVTYPE=DEL;SVLEN=-3764;IMPRECISE;CIPOS=-51,52;CIEND=-51,52;SOMATIC;SOMATICSCORE=10 PR:SR 21,0:10,0 43,4:15,3 chr1 60567906 test3 T <DEL> . 
MinSomaticScore END=60675940;SVTYPE=DEL;SVLEN=-108034;CIPOS=-44,44;CIEND=-38,39;SOMATIC;SOMATICSCORE=18 PR 23,0 44,6 """
        expected_b = StringIO(HEADER + body_expected)
        obj_expected = viola.read_vcf(expected_b)
        obj_result = self.obj.filter_by_id(['test1', 'test3'])
        viola.testing.assert_vcf_equal(obj_expected, obj_result)
class TestReadVcfLumpy:
    """Check the positions table for `delly_body_buf` read with variant_caller='delly'.

    Despite the class name, the data handled here is read as delly output.

    NOTE(review): the rows of the CSV literal below are separated by single
    spaces in this view; `pd.read_csv` needs one row per line -- confirm
    against the original file.
    """
    # Parsed once at class-definition time from a module-level buffer.
    vcf_delly = viola.read_vcf(delly_body_buf, variant_caller='delly')

    def test_read_vcf_delly(self):
        """Compare `vcf.positions` with the expected frame."""
        vcf = self.vcf_delly
        # assert positions table equal
        df_pos_expected = pd.read_csv(
            StringIO(
                """id,chrom1,pos1,chrom2,pos2,strand1,strand2,ref,alt,qual,svtype DEL00001,chr10,3485568,chr10,3485618,+,-,C,<DEL>,180,DEL INV00001,chr10,8825910,chr10,32816609,-,-,A,<INV>,17,INV DUP00001,chr10,11350704,chr10,122603091,-,+,G,<DUP>,29,DUP INV00002,chr10,61016659,chr10,111424580,+,+,G,<INV>,10,INV BND00001,chr11,15249914,chr10,72297333,+,+,A,A]chr10:72297333],193,BND """))
        pd.testing.assert_frame_equal(vcf.positions, df_pos_expected)
class TestToBedpe:
    """Write BEDPE files with `to_bedpe` and compare them with reference files.

    Output files are written next to the reference files under HERE/data.

    NOTE(review): the fields of `body` are separated by single spaces in this
    view -- confirm against the original tab-separated fixture.
    """
    #manta_path = os.path.join(HERE, 'data/manta1.inv.vcf')
    #result = viola.read_vcf(manta_path)
    # Records appended to the module-level HEADER.
    body = """chr1 82550461 test1 G <DEL> . MinSomaticScore END=82554225;SVTYPE=DEL;SVLEN=-3764;IMPRECISE;CIPOS=-51,52;CIEND=-51,52;SOMATIC;SOMATICSCORE=10 PR:SR 21,0:10,0 43,4:15,3 chr1 22814216 test2 T <INV> . MinSomaticScore END=92581131;SVTYPE=INV;SVLEN=69766915;IMPRECISE;CIPOS=-51,51;CIEND=-89,90;SOMATIC;SOMATICSCORE=11;INV5 PR 24,0 35,5 chr1 60567906 test3 T <DEL> . MinSomaticScore END=60675940;SVTYPE=DEL;SVLEN=-108034;CIPOS=-44,44;CIEND=-38,39;SOMATIC;SOMATICSCORE=18 PR 23,0 44,6 chr1 69583190 test4 T <DEL> . PASS END=69590947;SVTYPE=DEL;SVLEN=-7757;IMPRECISE;CIPOS=-123,123;CIEND=-135,136;SOMATIC;SOMATICSCORE=47 PR 21,0 20,12 """
    b = StringIO(HEADER + body)
    # Parsed once at class-definition time and shared by the tests.
    result = viola.read_vcf(b)

    def test_to_bedpe_like(self):
        """Default `to_bedpe` output must be identical to data/expected.bedpe."""
        self.result.to_bedpe(os.path.join(HERE, 'data/out.bedpe'))
        assert filecmp.cmp(os.path.join(HERE, 'data/out.bedpe'), os.path.join(HERE, 'data/expected.bedpe'))

    def test_to_bedpe_like_with_info(self):
        """`to_bedpe` with custom_infonames=['svlen'] must match data/expected.svlen.bedpe."""
        self.result.to_bedpe(os.path.join(HERE, 'data/out.svlen.bedpe'), custom_infonames=['svlen'])
        assert filecmp.cmp(os.path.join(HERE, 'data/out.svlen.bedpe'), os.path.join(HERE, 'data/expected.svlen.bedpe'))
import viola
import numpy as np
import os

HERE = os.path.abspath(os.path.dirname(__file__))

# Merge fixtures, one per caller, loaded at import time.
delly = viola.read_vcf(os.path.join(HERE, 'data', "test.merge.delly.vcf"),
                       variant_caller="delly")
manta = viola.read_vcf(os.path.join(HERE, 'data', "test.merge.manta.vcf"),
                       variant_caller="manta")
gridss = viola.read_vcf(os.path.join(HERE, 'data', "test.merge.gridss.vcf"),
                        variant_caller="gridss")
lumpy = viola.read_vcf(os.path.join(HERE, 'data', "test.merge.lumpy.vcf"),
                       variant_caller="lumpy")


def test_merge_to_vcf_like():
    """Merge all four callers, keep multi-caller calls and write them out."""
    inputs = [manta, gridss, delly, lumpy]
    merged = viola.merge(inputs, integration=True)
    supported = merged.filter('supportingcallercount > 1')
    # Passes as long as writing does not raise; the file is not compared.
    output_path = os.path.join(HERE, 'data/output_merged.vcf')
    supported.to_vcf(output_path)
##FILTER=<ID=MaxMQ0Frac,Description="For a small variant (<1000 bases) in the normal sample, the fraction of reads with MAPQ0 around either breakend exceeds 0.4"> ##ALT=<ID=INV,Description="Inversion"> ##ALT=<ID=DEL,Description="Deletion"> ##ALT=<ID=INS,Description="Insertion"> ##ALT=<ID=DUP:TANDEM,Description="Tandem Duplication"> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT mouse1_N mouse1_T """ #manta_path = os.path.join(HERE, 'data/manta1.inv.vcf') #result = viola.read_vcf(manta_path) body = """chr1 82550461 test1 G <DEL> . MinSomaticScore END=82554225;SVTYPE=DEL;SVLEN=-3764;IMPRECISE;CIPOS=-51,52;CIEND=-51,52;SOMATIC;SOMATICSCORE=10 PR:SR 21,0:10,0 43,4:15,3 chr1 22814216 test2 T <INV> . MinSomaticScore END=92581131;SVTYPE=INV;SVLEN=69766915;IMPRECISE;CIPOS=-51,51;CIEND=-89,90;SOMATIC;SOMATICSCORE=11;INV5 PR 24,0 35,5 chr1 60567906 test3 T <DEL> . MinSomaticScore END=60675940;SVTYPE=DEL;SVLEN=-108034;CIPOS=-44,44;CIEND=-38,39;SOMATIC;SOMATICSCORE=18 PR 23,0 44,6 chr1 69583190 test4 T <DEL> . PASS END=69590947;SVTYPE=DEL;SVLEN=-7757;IMPRECISE;CIPOS=-123,123;CIEND=-135,136;SOMATIC;SOMATICSCORE=47 PR 21,0 20,12 """ b = StringIO(HEADER + body) result = viola.read_vcf(b) def test_to_bedpe_like(): expected_data = """chrom1\tstart1\tend1\tchrom2\tstart2\tend2\tname\tscore\tstrand1\tstrand2 chr1\t82550460\t82550461\tchr1\t82554225\t82554226\ttest1\t\t+\t- chr1\t22814216\t22814217\tchr1\t92581131\t92581132\ttest2\t\t-\t- chr1\t60567905\t60567906\tchr1\t60675940\t60675941\ttest3\t\t+\t- chr1\t69583189\t69583190\tchr1\t69590947\t69590948\ttest4\t\t+\t- """ df_expected = pd.read_csv(StringIO(expected_data), sep='\t') df_expected['score'] = df_expected['score'].astype( object) # because score field is empty in this case bedpe = result.to_bedpe_like() pd.testing.assert_frame_equal(bedpe, df_expected, check_exact=True)
def test_breakend2breakpoint():
    """breakend2breakpoint output must equal the expected fixture."""
    source_buffer = StringIO(HEADER + body)
    expected_buffer = StringIO(HEADER_expected + body_expected)
    vcf = viola.read_vcf(source_buffer)
    vcf_expected = viola.read_vcf(expected_buffer)

    converted = vcf.breakend2breakpoint()
    assert_vcf_equal(converted, vcf_expected)
def test_remove_info_table():
    """Removing the 'test' INFO table must give back the plain manta fixture."""
    with_extra_path = os.path.join(HERE, 'data/test.info.added.manta.vcf')
    plain_path = os.path.join(HERE, 'data/test.manta.vcf')
    vcf = viola.read_vcf(with_extra_path)
    vcf_expected = viola.read_vcf(plain_path)

    # remove_info_table is called for its in-place effect on `vcf`.
    vcf.remove_info_table('test')
    assert_vcf_equal(vcf, vcf_expected)
def test_read_vcf_empty_lumpy():
    """Read the GRIDSS text with variant_caller='gridss'; passes if nothing raises."""
    buffer = StringIO(GRIDSS)
    # The parsed object is not inspected.
    _ = viola.read_vcf(buffer, variant_caller='gridss')
import viola
import numpy as np

# Merge fixtures loaded at import time from absolute shared-data paths.
delly = viola.read_vcf("/shared_data/share/merge/test.merge.delly.vcf",
                       variant_caller="delly")
gridss = viola.read_vcf("/shared_data/share/merge/test.merge.gridss.vcf",
                        variant_caller="gridss")
lumpy = viola.read_vcf("/shared_data/share/merge/test.merge.lumpy.vcf",
                       variant_caller="lumpy")
manta = viola.read_vcf("/shared_data/share/merge/test.merge.manta.vcf",
                       variant_caller="manta")


def test_merge():
    """A merged VCF must compare equal to its own copy."""
    inputs = [delly, lumpy, manta, gridss]
    merged_all = gridss.merge(ls_vcf=inputs, threshold=100)
    duplicate = merged_all.copy()
    viola.testing.assert_vcf_equal(merged_all, duplicate)
# Snakemake script: convert the input VCF to breakpoint representation with
# viola and write it to the rule's output. If the conversion fails for any
# reason, the input is copied to the output unchanged.
import os
import shutil
import sys

import viola

# `snakemake` is injected into the script namespace by Snakemake.
vcf_in = str(snakemake.input)
vcf_org = vcf_in + '.org'
vcf_out = str(snakemake.output)
caller = str(snakemake.wildcards.prefix)

if caller == 'gridss':
    # Keep the untouched input as <input>.org and rewrite the input in place.
    os.rename(vcf_in, vcf_org)
    with open(vcf_org, 'r') as org, open(vcf_in, 'w') as new:
        for line in org:
            # FIX INFO field: change PARID to MATEID
            new.write(line.replace('PARID', 'MATEID'))

try:
    sv = viola.read_vcf(vcf_in, variant_caller=caller).breakend2breakpoint()
    sv.to_vcf(vcf_out)
except Exception as err:
    # Deliberate best-effort fallback: the workflow still gets an output
    # file, but the reason for the failed conversion is reported instead of
    # being silently discarded.
    sys.stderr.write(
        'viola conversion failed for {} ({}: {}); copying input unchanged\n'
        .format(vcf_in, type(err).__name__, err))
    shutil.copyfile(vcf_in, vcf_out)
def test_single():
    """Select the record 'single_test' through `idx` and print its positions."""
    buffer = StringIO(HEADER + body)
    vcf = viola.read_vcf(buffer, variant_caller='gridss')
    # No assertion: the test passes when selection and printing do not raise.
    print(vcf.idx['single_test'].positions)