def __init__(self, filename, force=False, **kwargs):
    """.. rubric:: constructor

    The header of *filename* is first inspected with ``VCFBase``; the
    detected version and source then select the reader stored in
    ``self.vcf``.

    :param filename: VCF file to read.
    :param force: even though the file format is not recognised, you can
        force the instantiation. Then, you can use your own filters.
    :param kwargs: passed on to ``VCFBase`` and to the selected reader.
    :raises ValueError: if the format is not recognised and *force* is
        not ``True``.
    """
    vcf = VCFBase(filename, verbose=False, **kwargs)

    if vcf.version == "4.1":
        logger.info("Reading VCF v 4.1")
        self.vcf = VCF_mpileup_4dot1(filename, **kwargs)
    elif vcf.version == "4.2" and vcf.source.startswith("freeBayes"):
        logger.info("Reading VCF v 4.2 (freebayes)")
        # Imported only when a freebayes file is actually encountered.
        from sequana.freebayes_vcf_filter import VCF_freebayes
        self.vcf = VCF_freebayes(filename, **kwargs)
    else:
        # Report what was detected through the logger (like the branches
        # above) rather than printing bare values on stdout.
        logger.warning(
            "Unrecognised VCF file (version: %s, source: %s)",
            vcf.version, vcf.source)
        if force is True:
            logger.warning("VCF version %s not tested", vcf.version)
            # Fall back on the generic reader; specific filters may not
            # be available.
            self.vcf = vcf
        else:
            msg = (
                "This VCF file is not recognised. So far we handle version "
                "v4.1 with mpileup and v4.2 with freebayes. You may use the "
                "force option but not all filters will be recognised")
            raise ValueError(msg)
def check(self):
    """Check the raw freebayes VCF written under ``self.wk``.

    Every record is summarised through ``Variant(...)._resume``; the number
    of summaries and the fields of the variant at position 276 are then
    compared with reference values.
    """
    from sequana.freebayes_vcf_filter import VCF_freebayes, Variant

    raw_vcf = (
        self.wk
        + "/report_vc_Hm2_GTGAAA_L005/outputs/Hm2_GTGAAA_L005.raw.vcf")
    reader = VCF_freebayes(raw_vcf)
    reader.rewind()
    summaries = [Variant(record)._resume for record in reader]

    expected = {
        'alternative': 'G',
        'chr': 'ENA|K01711|K01711.1',
        'depth': 5,
        'freebayes_score': 126.901,
        'frequency': '1.00',
        'position': '276',
        'reference': 'C',
        'strand_balance': '0.40',
    }
    at_276 = [entry for entry in summaries if entry['position'] == "276"]
    observed = at_276[0]

    # 85 in freebayes 1.0 and 93 in freebayes 1.2
    assert len(summaries) in (85, 93)

    # 'freebayes_score' is part of the reference record but is not compared.
    compared_fields = (
        "depth", "chr", "frequency", "position",
        "reference", "alternative", "strand_balance",
    )
    for field in compared_fields:
        assert observed[field] == expected[field]
def test_to_csv():
    """Filter the expected VCF and export the result to a temporary CSV."""
    thresholds = dict(
        freebayes_score=200,
        frequency=0.85,
        min_depth=10,
        forward_depth=3,
        reverse_depth=3,
        strand_ratio=0.3,
    )
    reader = VCF_freebayes(sequana_data('JB409847.expected.vcf'))
    filtered = reader.filter_vcf(thresholds)
    # The test only requires that the export runs without raising.
    with TempFile(suffix='.csv') as tmp:
        filtered.to_csv(tmp.name)
def test_vcf_filter():
    """Filter ``JB409847.vcf`` and compare the output with the expected VCF."""
    vcf_output_expected = sequana_data('JB409847.expected.vcf')
    v = VCF_freebayes(sequana_data('JB409847.vcf'))
    filter_dict = {
        'freebayes_score': 200,
        'frequency': 0.85,
        'min_depth': 10,
        'forward_depth': 3,
        'reverse_depth': 3,
        'strand_ratio': 0.3,
    }
    filter_v = v.filter_vcf(filter_dict)
    with TempFile(suffix='.vcf') as ft:
        filter_v.to_vcf(ft.name)
        # shallow=False forces a comparison of the file contents; the
        # default shallow mode may declare two files equal from their
        # os.stat() signatures (type, size, mtime) alone.
        assert filecmp.cmp(ft.name, vcf_output_expected, shallow=False)
def test_to_csv():
    """Run the CSV export of a filtered VCF into a temporary file."""
    expected_vcf = sequana_data('JB409847.expected.vcf')
    criteria = {}
    criteria['freebayes_score'] = 200
    criteria['frequency'] = 0.85
    criteria['min_depth'] = 10
    criteria['forward_depth'] = 3
    criteria['reverse_depth'] = 3
    criteria['strand_ratio'] = 0.3

    selection = VCF_freebayes(expected_vcf).filter_vcf(criteria)
    # No assertion on the content: the export just has to succeed.
    with TempFile(suffix='.csv') as handle:
        selection.to_csv(handle.name)
def test_vcf_filter():
    """Check that filtering ``JB409847.vcf`` reproduces the expected VCF."""
    vcf_output_expected = sequana_data('JB409847.expected.vcf')
    v = VCF_freebayes(sequana_data('JB409847.vcf'))
    filter_dict = {
        'freebayes_score': 200,
        'frequency': 0.85,
        'min_depth': 10,
        'forward_depth': 3,
        'reverse_depth': 3,
        'strand_ratio': 0.3,
    }
    filter_v = v.filter_vcf(filter_dict)
    with TempFile(suffix='.vcf') as ft:
        filter_v.to_vcf(ft.name)
        # Compare the actual bytes: with the default shallow=True,
        # filecmp.cmp may return True as soon as the os.stat() signatures
        # of the two files match, without reading them.
        same_content = filecmp.cmp(
            ft.name, vcf_output_expected, shallow=False)
        assert same_content
def check(self):
    """Check the raw freebayes VCF written under ``self.wk``.

    The number of variants and the content of the variant found at
    position 276 are compared with reference values.
    """
    from sequana.freebayes_vcf_filter import VCF_freebayes, Variant

    vcf = VCF_freebayes(
        self.wk + "/report_vc_Hm2_GTGAAA_L005/outputs/Hm2_GTGAAA_L005.raw.vcf")
    vcf.rewind()
    vv = [Variant(v)._resume for v in vcf]

    # 85 in freebayes 1.0 and 93 in freebayes 1.2
    assert len(vv) in (85, 93)

    event276 = {
        'alternative': 'G',
        'chr': 'ENA|K01711|K01711.1',
        'depth': 5,
        'freebayes_score': 126.901,
        'frequency': '1.00',
        'position': '276',
        'reference': 'C',
        'strand_balance': '0.40',
    }

    # Look the event up by position: its index in the list depends on how
    # many variants were called, so a fixed index (29) is not reliable.
    matches = [v for v in vv if v['position'] == "276"]
    assert matches, "no variant found at position 276"
    event = matches[0]

    # The previous bare `vv[29] == {...}` expression was never asserted and
    # therefore checked nothing. Compare field by field instead.
    # NOTE(review): 'freebayes_score' is left out of the comparison, as an
    # exact float match is presumably version dependent -- confirm.
    for key in ("depth", "chr", "frequency", "position", "reference",
                "alternative", "strand_balance"):
        assert event[key] == event276[key]
def __init__(self, bases, freebayes=None):
    """Store the inputs and the default thresholds.

    :param bases: kept as ``self.filename_bases``.
    :param freebayes: optional input handed to ``VCF_freebayes``. When it
        is given, each truthy record is wrapped in a ``Variant`` and the
        result is stored in ``self.variants``; otherwise
        ``self.variants`` is an empty list.
    """
    self.filename_bases = bases
    self.min_depth = 10
    self.min_score = 1

    # Nothing to parse without a freebayes input.
    if freebayes is None:
        self.variants = []
        return

    collected = []
    for record in VCF_freebayes(freebayes):
        if not record:
            continue
        collected.append(Variant(record))
    self.variants = collected
def check(self):
    """Validate the variants of the raw freebayes VCF found under ``self.wk``.

    The variant count and a subset of the fields of the variant located at
    position 276 are compared with reference values.
    """
    from sequana.freebayes_vcf_filter import VCF_freebayes, Variant

    variants_file = VCF_freebayes(
        self.wk + "/report_vc_Hm2_GTGAAA_L005/outputs/Hm2_GTGAAA_L005.raw.vcf")
    variants_file.rewind()

    resumes = []
    for item in variants_file:
        resumes.append(Variant(item)._resume)

    reference_event = dict([
        ('alternative', 'G'),
        ('chr', 'ENA|K01711|K01711.1'),
        ('depth', 5),
        ('freebayes_score', 126.901),
        ('frequency', '1.00'),
        ('position', '276'),
        ('reference', 'C'),
        ('strand_balance', '0.40'),
    ])
    found = [resume for resume in resumes if resume['position'] == "276"]
    first_found = found[0]

    # 85 in freebayes 1.0 and 93 in freebayes 1.2
    n_variants = len(resumes)
    assert n_variants in (85, 93)

    # 'freebayes_score' belongs to the reference but is not compared.
    keys_to_compare = [
        "depth", "chr", "frequency", "position",
        "reference", "alternative", "strand_balance",
    ]
    for name in keys_to_compare:
        assert first_found[name] == reference_event[name]
class VCF(object):
    """A factory to read and filter VCF files for different formats

    VCF provides a way of storing variants. However, the format is very
    flexible and leads to different versions (e.g. 4.1, 4.2). It can also
    be generated by different tools (e.g. mpileup, freebayes), leading to
    heterogeneous VCF files.

    VCF files have a header, a list of INFO (here below only one called DP)
    and a list of FORMATS (here GT, GQ, GL and PL)::

        ##fileformat=VCFv4.1
        ##samtoolsVersion=0.1.19-44428cd
        ##reference=file:///Vibrio_cholerae_O1_biovar_eltor_str_N16961_v2.fasta
        ##contig=<ID=AE003852,length=2961182>
        ##contig=<ID=AE003853,length=1072319>
        ##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
        ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
        ##FORMAT=<ID=GL,Number=3,Type=Float,Description="Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)">
        ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
        #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  data.bam
        AE003852  5414  .  G  A  222  .  DP=179  GT:PL:GQ  1/1:255,255,0:99
        AE003852  20799  .  T  G  17.1  .  DP=172  GT:PL:GQ  0/1:47,0,255:50

    You can apply filters based on the INFO fields.
    """

    def __init__(self, filename, force=False, **kwargs):
        """.. rubric:: constructor

        The header of *filename* is first inspected with ``VCFBase``; the
        detected version and source then select the reader stored in
        ``self.vcf``.

        :param filename: VCF file to read.
        :param force: even though the file format is not recognised, you
            can force the instantiation. Then, you can use your own
            filters.
        :param kwargs: passed on to ``VCFBase`` and to the selected reader.
        :raises ValueError: if the format is not recognised and *force* is
            not ``True``.
        """
        vcf = VCFBase(filename, verbose=False, **kwargs)

        if vcf.version == "4.1":
            logger.info("Reading VCF v 4.1")
            self.vcf = VCF_mpileup_4dot1(filename, **kwargs)
        elif vcf.version == "4.2" and vcf.source.startswith("freeBayes"):
            logger.info("Reading VCF v 4.2 (freebayes)")
            # Imported only when a freebayes file is actually encountered.
            from sequana.freebayes_vcf_filter import VCF_freebayes
            self.vcf = VCF_freebayes(filename, **kwargs)
        else:
            # Report what was detected through the logger (like the
            # branches above) rather than printing bare values on stdout.
            logger.warning(
                "Unrecognised VCF file (version: %s, source: %s)",
                vcf.version, vcf.source)
            if force is True:
                logger.warning("VCF version %s not tested", vcf.version)
                # Fall back on the generic reader; specific filters may
                # not be available.
                self.vcf = vcf
            else:
                msg = (
                    "This VCF file is not recognised. So far we handle "
                    "version v4.1 with mpileup and v4.2 with freebayes. \n"
                    "You may use the force option but not all filters "
                    "will be recognised")
                raise ValueError(msg)

    def hist_qual(self, fontsize=16, bins=100):
        """Plot a histogram of the QUAL values of the variants.

        This uses the QUAL information to be found in the VCF and should
        work for all VCF with version 4.1 (at least)

        :param fontsize: font size of the x-axis label.
        :param bins: forwarded as the ``bins`` argument of ``pylab.hist``.
        """
        # TODO: could be moved to VCFBase
        # Restart from the first record before collecting the qualities.
        self.vcf.rewind()
        data = [x.QUAL for x in self.vcf]
        pylab.hist(data, bins=bins)
        pylab.grid(True)
        pylab.xlabel("Variant quality", fontsize=fontsize)
def test_constructor():
    """A non-existing input file must raise ``FileNotFoundError``."""
    try:
        VCF_freebayes('dummy')
    except FileNotFoundError:
        # Expected outcome: the file does not exist.
        pass
    else:
        # The constructor returned normally, which is a failure.
        assert False