Esempio n. 1
0
    def __init__(self, filename, force=False, **kwargs):
        """.. rubric:: constructor

        :param filename:
        :param force: even though the file format is not recognised,
            you can force the instanciation. Then, you can use your own
            filters.



        """
        vcf = VCFBase(filename, verbose=False, **kwargs)

        if vcf.version == "4.1":
            logger.info("Reading VCF v 4.1")
            self.vcf = VCF_mpileup_4dot1(filename, **kwargs)
        elif vcf.version == "4.2" and vcf.source.startswith("freeBayes"):
            logger.info("Reading VCF v 4.2 (freebayes)")
            from sequana.freebayes_vcf_filter import VCF_freebayes
            self.vcf = VCF_freebayes(filename, **kwargs)
        else:
            print(vcf.version)
            print(vcf.source)
            msg = """This VCF file is not recognised. So far we handle version
v4.1 with mpileup and v4.2 with freebayes. You may use the force option but not
all filters will be recognised"""
            if force is True:
                print("VCF version %s not tested" % vcf.version)
                self.vcf = vcf
            else:
                raise ValueError(msg)
Esempio n. 2
0
 def check(self):
     """Check the raw freebayes VCF found under ``self.wk``.

     Every record is summarised through ``Variant(...)._resume``; the
     number of variants and the summary of the event at position 276 are
     then asserted.
     """
     from sequana.freebayes_vcf_filter import VCF_freebayes, Variant

     vcf_path = (self.wk +
                 "/report_vc_Hm2_GTGAAA_L005/outputs/Hm2_GTGAAA_L005.raw.vcf")
     variants = VCF_freebayes(vcf_path)
     variants.rewind()
     resumes = [Variant(entry)._resume for entry in variants]

     expected = {
         'alternative': 'G',
         'chr': 'ENA|K01711|K01711.1',
         'depth': 5,
         'freebayes_score': 126.901,
         'frequency': '1.00',
         'position': '276',
         'reference': 'C',
         'strand_balance': '0.40'
     }

     # First summary reported at position 276 (IndexError if there is none).
     candidates = list(filter(lambda r: r['position'] == "276", resumes))
     observed = candidates[0]

     # 85 in freebayes 1.0 and 93 in freebayes 1.2
     assert len(resumes) in (85, 93)

     # Only these fields are compared; 'freebayes_score' is not.
     compared = ("depth", "chr", "frequency", "position", "reference",
                 "alternative", "strand_balance")
     for field in compared:
         assert observed[field] == expected[field]
def test_to_csv():
    """Smoke test: filter a VCF and export the result to a temporary CSV.

    Nothing is asserted on the content; the test passes if no call raises.
    """
    thresholds = {
        'freebayes_score': 200,
        'frequency': 0.85,
        'min_depth': 10,
        'forward_depth': 3,
        'reverse_depth': 3,
        'strand_ratio': 0.3,
    }
    vcf_path = sequana_data('JB409847.expected.vcf')
    filtered = VCF_freebayes(vcf_path).filter_vcf(thresholds)
    with TempFile(suffix='.csv') as tmp:
        filtered.to_csv(tmp.name)
def test_vcf_filter():
    """Filter ``JB409847.vcf`` and compare the output with the expected VCF.

    The filtered variants are written to a temporary file, which
    ``filecmp.cmp`` must report as equal to ``JB409847.expected.vcf``.
    """
    expected_path = sequana_data('JB409847.expected.vcf')
    reader = VCF_freebayes(sequana_data('JB409847.vcf'))
    thresholds = {
        'freebayes_score': 200,
        'frequency': 0.85,
        'min_depth': 10,
        'forward_depth': 3,
        'reverse_depth': 3,
        'strand_ratio': 0.3,
    }
    filtered = reader.filter_vcf(thresholds)
    with TempFile(suffix='.vcf') as tmp:
        filtered.to_vcf(tmp.name)
        # Compare while the temporary file is still inside the context.
        assert filecmp.cmp(tmp.name, expected_path)
Esempio n. 5
0
def test_to_csv():
    """Check that filtered variants can be written to a CSV file.

    This is a smoke test: it only requires that filtering and the CSV
    export run without raising.
    """
    cutoffs = {'freebayes_score': 200, 'frequency': 0.85, 'min_depth': 10,
               'forward_depth': 3, 'reverse_depth': 3, 'strand_ratio': 0.3}
    source_vcf = sequana_data('JB409847.expected.vcf')
    reader = VCF_freebayes(source_vcf)
    kept = reader.filter_vcf(cutoffs)
    with TempFile(suffix='.csv') as handle:
        csv_name = handle.name
        kept.to_csv(csv_name)
Esempio n. 6
0
def test_vcf_filter():
    """Check that filtering ``JB409847.vcf`` reproduces the expected VCF.

    The filtered output is written to a temporary ``.vcf`` file and
    compared with ``JB409847.expected.vcf`` using ``filecmp.cmp``.
    """
    reference_vcf = sequana_data('JB409847.expected.vcf')
    input_vcf = sequana_data('JB409847.vcf')
    cutoffs = {'freebayes_score': 200, 'frequency': 0.85, 'min_depth': 10,
               'forward_depth': 3, 'reverse_depth': 3, 'strand_ratio': 0.3}
    kept = VCF_freebayes(input_vcf).filter_vcf(cutoffs)
    with TempFile(suffix='.vcf') as handle:
        out_name = handle.name
        kept.to_vcf(out_name)
        same = filecmp.cmp(out_name, reference_vcf)
        assert same
Esempio n. 7
0
 def check(self):
     """Check the raw freebayes VCF found under ``self.wk``.

     Every record is summarised through ``Variant(...)._resume``; the
     number of variants and the summary of the event at position 276 are
     then asserted.

     :raises AssertionError: if the number of variants is unexpected, if
         no variant is reported at position 276, or if one of the compared
         fields differs.
     """
     from sequana.freebayes_vcf_filter import VCF_freebayes, Variant
     vcf = VCF_freebayes(
         self.wk +
         "/report_vc_Hm2_GTGAAA_L005/outputs/Hm2_GTGAAA_L005.raw.vcf")
     vcf.rewind()
     vv = [Variant(v)._resume for v in vcf]
     # 85 in freebayes 1.0 and 93 in freebayes 1.2
     assert len(vv) in (85, 93)
     event276 = {
         'alternative': 'G',
         'chr': 'ENA|K01711|K01711.1',
         'depth': 5,
         'freebayes_score': 126.901,
         'frequency': '1.00',
         'position': '276',
         'reference': 'C',
         'strand_balance': '0.40'
     }
     # The previous code evaluated ``vv[29] == {...}`` and discarded the
     # result, so nothing was verified. The event is now looked up by
     # position, since a fixed index cannot hold for both variant counts.
     events = [v for v in vv if v['position'] == "276"]
     assert events, "no variant found at position 276"
     event = events[0]
     # 'freebayes_score' is left out of the comparison -- presumably the
     # float score differs between freebayes versions; TODO confirm.
     for k in [
             "depth", "chr", "frequency", "position", "reference",
             "alternative", "strand_balance"
     ]:
         assert event[k] == event276[k]
Esempio n. 8
0
    def __init__(self, bases, freebayes=None):
        """Store the input file name and default thresholds, load variants.

        :param bases: stored as is in ``self.filename_bases``.
        :param freebayes: optional input passed to ``VCF_freebayes``; each
            truthy record it yields is wrapped in a ``Variant``. When
            ``None``, ``self.variants`` is an empty list.
        """
        self.filename_bases = bases
        self.min_depth = 10
        self.min_score = 1

        # An empty tuple stands in for the reader when no VCF is provided.
        records = VCF_freebayes(freebayes) if freebayes is not None else ()
        self.variants = [Variant(record) for record in records if record]
Esempio n. 9
0
 def check(self):
     """Validate the variants of the raw freebayes VCF below ``self.wk``.

     The ``_resume`` of each ``Variant`` is collected, then the variant
     count and the fields of the event at position 276 are asserted.
     """
     from sequana.freebayes_vcf_filter import VCF_freebayes, Variant

     reader = VCF_freebayes(
         self.wk + "/report_vc_Hm2_GTGAAA_L005/outputs/Hm2_GTGAAA_L005.raw.vcf"
     )
     reader.rewind()

     summaries = []
     for record in reader:
         summaries.append(Variant(record)._resume)

     reference_event = {
         'alternative': 'G',
         'chr': 'ENA|K01711|K01711.1',
         'depth': 5,
         'freebayes_score': 126.901,
         'frequency': '1.00',
         'position': '276',
         'reference': 'C',
         'strand_balance': '0.40',
     }

     # Take the first summary at position 276 (IndexError if absent).
     at_276 = [item for item in summaries if item['position'] == "276"]
     found = at_276[0]

     # 85 in freebayes 1.0 and 93 in freebayes 1.2
     assert len(summaries) in (85, 93)

     # 'freebayes_score' is not part of the compared fields.
     fields = ("depth", "chr", "frequency", "position", "reference",
               "alternative", "strand_balance")
     for name in fields:
         assert found[name] == reference_event[name]
Esempio n. 10
0
class VCF(object):
    """A factory to read and filter VCF files for different formats


    VCF provides a way of storing variants. However, the format is very
    flexible and leads to different versions (e.g. 4.1, 4.2) and can be
    generated by different tools (e.g. mpileup, freebayes) leading to
    heterogeneous VCF files.

    VCF files have a header, a list of INFO (here below only one called DP)
    and a list of FORMATS (here GT, GQ, GL)::

        ##fileformat=VCFv4.1
        ##samtoolsVersion=0.1.19-44428cd
        ##reference=file:///Vibrio_cholerae_O1_biovar_eltor_str_N16961_v2.fasta
        ##contig=<ID=AE003852,length=2961182>
        ##contig=<ID=AE003853,length=1072319>
        ##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
        ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
        ##FORMAT=<ID=GL,Number=3,Type=Float,Description="Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)">
        ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
        #CHROM    POS    ID REF ALT    QUAL   FILTER    INFO    FORMAT    data.bam
        AE003852  5414    .   G   A   222          .  DP=179  GT:PL:GQ    1/1:255,255,0:99
        AE003852  20799   .   T   G    17.1        .  DP=172  GT:PL:GQ    0/1:47,0,255:50

    You can apply filters based on the INFO.

    The reader selected by the constructor is available as the ``vcf``
    attribute.

    """
    def __init__(self, filename, force=False, **kwargs):
        """.. rubric:: constructor

        The header of *filename* is first read with ``VCFBase``; the reader
        matching the detected version/source is then stored in ``self.vcf``.

        :param filename: VCF file to read; forwarded to the reader(s).
        :param force: even though the file format is not recognised,
            you can force the instantiation. The generic ``VCFBase``
            instance is then kept and you can use your own filters.
        :param kwargs: forwarded unchanged to ``VCFBase`` and to the
            specialised reader.
        :raises ValueError: if the format is not recognised and *force*
            is not set.

        """
        vcf = VCFBase(filename, verbose=False, **kwargs)

        if vcf.version == "4.1":
            logger.info("Reading VCF v 4.1")
            self.vcf = VCF_mpileup_4dot1(filename, **kwargs)
        elif vcf.version == "4.2" and vcf.source.startswith("freeBayes"):
            logger.info("Reading VCF v 4.2 (freebayes)")
            # Imported here rather than at module level (as in the original
            # code) -- presumably to avoid a circular import; TODO confirm.
            from sequana.freebayes_vcf_filter import VCF_freebayes
            self.vcf = VCF_freebayes(filename, **kwargs)
        else:
            # Report what was detected through the logger (like the branches
            # above) instead of writing to stdout with print().
            logger.warning("Unrecognised VCF: version=%s, source=%s",
                           vcf.version, vcf.source)
            msg = """This VCF file is not recognised. So far we handle version
v4.1 with mpileup and v4.2 with freebayes. You may use the force option but not
all filters will be recognised"""
            # Any truthy value enables the fallback (not only the True
            # singleton).
            if force:
                logger.warning("VCF version %s not tested", vcf.version)
                self.vcf = vcf
            else:
                raise ValueError(msg)

    def hist_qual(self, fontsize=16, bins=100):
        """Plot a histogram of the QUAL field of the variants.

        This uses the QUAL information to be found in the VCF and should
        work for all VCF with version 4.1 (at least)

        :param fontsize: font size of the x-axis label.
        :param bins: forwarded to ``pylab.hist``.

        """
        # TODO: could be moved to VCFBase
        # Rewind the reader before iterating over it.
        self.vcf.rewind()
        data = [x.QUAL for x in self.vcf]
        pylab.hist(data, bins=bins)
        pylab.grid(True)
        pylab.xlabel("Variant quality", fontsize=fontsize)
Esempio n. 11
0
def test_constructor():
    """Building ``VCF_freebayes`` on a missing file raises FileNotFoundError."""
    try:
        VCF_freebayes('dummy')
    except FileNotFoundError:
        # Expected outcome: nothing else to verify.
        pass
    else:
        # The constructor accepted a file that does not exist.
        assert False