Beispiel #1
0
    def select(self, jar, outvcf, ref, snp=False, pass_only=False,
               indel=False):
        """ Subset the VCF held by this object with GATK SelectVariants

        Parameters
        ----------
        jar: str
            GATK jar path
        outvcf: str
            output vcf, handed to ``-o``
        ref: str
            reference, handed to ``-R``
        snp: bool
            append ``-selectType SNP`` to the command
        pass_only: bool
            append ``-ef`` to the command
        indel: bool
            append ``-selectType INDEL`` to the command

        Returns
        -------
        self

        Raises
        ------
        ValueError
            when both ``snp`` and ``indel`` are set, or when none of
            ``snp``, ``pass_only`` and ``indel`` is set

        """
        if snp and indel:
            raise ValueError("Cannot select both SNPs and InDels")
        if not (snp or pass_only or indel):
            raise ValueError("At least select one type of variants")

        # NOTE: the blank after '-jar' is kept so the emitted command line
        # is exactly what it has always been.
        args = ['java -jar ', jar, '-T SelectVariants', '-R', ref,
                '-V', self._vcf, '-o', outvcf]
        if pass_only:
            args.append('-ef')
        if snp:
            args.append('-selectType SNP')
        if indel:
            args.append('-selectType INDEL')
        run(' '.join(args))
        return self
Beispiel #2
0
 def cal_miss(self, name='miss'):
     """ Run ``plink --missing`` on the plink fileset of this object

     ``get_plink`` is called first, then plink is run on ``self._plink``
     with ``self._prefix`` as ``--out``.

     :param name: accepted but not used by the current body (TODO)
     :returns: self
     """
     self.get_plink()
     plink_cmd = ''.join([
         'plink --bfile ', self._plink,
         ' --missing --allow-extra-chr --out ', self._prefix])
     run(plink_cmd)
     return self
Beispiel #3
0
    def import_vcf(self, info=('AF', 'AN', 'AC')):
        """ Import site-level INFO fields from the VCF into a data frame

        Description
        -----------
        Runs ``bcftools query`` on ``self._vcf``, redirects the result to
        ``self._tsv`` and loads that table into ``self._df``.
        Caveat: This module assumes the VCF's coming from GATK, with AF as the
        field for allele frequencies, and AC for Allele Count, and AN for
        Allelic Number.

        Parameters
        ----------
        info: list
            Names of the INFO fields of interest; a single name given as a
            string is accepted as well.

        Returns
        -------
        self
            ``self._df`` has the columns ``CHR``, ``ID`` followed by the
            requested fields.

        """
        info = [info] if isinstance(info, str) else list(info)
        header = ['CHR', 'ID'] + info
        fields = ['%CHROM', '%CHROM-%POS-%REF-%ALT{0}']
        fields += ['%' + i for i in info]
        # bcftools emits no record separator unless the format asks for one,
        # so end the format with the "\n" escape (kept as backslash + n so
        # bcftools, not the shell, expands it).
        query_string = "'" + '\t'.join(fields) + '\\n' + "'"

        cmd = ' '.join([
            "bcftools query -f", query_string, self._vcf, '>', self._tsv])
        run(cmd)
        # Read back the file that was just written (was an undefined name).
        self._df = pd.read_csv(self._tsv, sep='\t', header=None, names=header)
        return self
Beispiel #4
0
 def get_plink(self):
     """ Get plink format files

     Converts ``self._vcf`` with ``plink --vcf`` using ``self._prefix`` as
     the output prefix, then records that prefix in ``self._plink``.

     :returns: self
     """
     # str.join takes ONE iterable; the pieces must be wrapped in a list.
     cmd = ' '.join([
         'plink --vcf', self._vcf, '--allow-extra-chr', '--out',
         self._prefix])
     run(cmd)
     self._plink = self._prefix
     return self
Beispiel #5
0
def pilon(fa, bam, prefix, ram, threads, jar):
    """ Build and run a pilon command line

    Parameters
    ----------
        fa: :obj:`str` fasta file, passed as ``--genome``
        bam: :obj:`str` input bam path, passed as ``--frags``
        prefix: :obj:`str` output prefix; also names ``<prefix>.pilon.log``
        ram: :obj:`int` java heap size in GB (``-Xmx<ram>g``)
        threads: :obj:`int` threads for pilon
        jar: :obj:`str` jar handed to ``java -jar``

    Returns
    -------
        :obj:`str` the command line that was run

    """
    argv = ['java -Xmx' + str(ram) + 'g']
    argv += ['-jar', jar]
    argv += ['--genome', fa]
    argv += ['--frags', bam]
    argv += ['--output', prefix]
    argv += ['--threads', str(threads)]
    # stdout and stderr of pilon both end up in the log file
    argv.append(
        '--vcf --changes --tracks --verbose > ' + prefix + '.pilon.log 2>&1')
    cmd = ' '.join(argv)
    run(cmd)
    return cmd
Beispiel #6
0
 def combine_var(self, vcf_dict, option, priority=None):
     '''
     Merge several VCFs with GATK CombineVariants.

     :param vcf_dict: dictionary of vcf files, with key abbreviation of
                      each vcf
     :param option: merging options, one of UNIQUIFY, PRIORITIZE, UNSORTED
     :param priority: priority order of the vcf abbreviations, required for
                      PRIORITIZE; either a comma separated string or an
                      iterable of names
     :returns: path of the merged vcf (``self.prefix + '.vcf.gz'``)
     :raises ValueError: on an unknown option, or PRIORITIZE without
                         priority
     '''
     out_vcf = self.prefix + '.vcf.gz'
     valid_options = ('UNIQUIFY', 'PRIORITIZE', 'UNSORTED')
     if option not in valid_options:
         raise ValueError('Merge option not valid.\n')
     prioritize = option == 'PRIORITIZE'
     if prioritize and priority is None:
         raise ValueError('Need to specify priority.\n')

     merge_opt = option
     if option == 'UNSORTED':
         merge_opt += ' --assumeIdenticalSamples'
     # '-o' matches the other '-T' style GATK calls in this code base.
     parts = [
         self.cmd, '-T CombineVariants', '-genotypeMergeOptions', merge_opt,
         '-o', out_vcf
     ]
     if prioritize:
         # The priority list was validated above but never reached the
         # command line; hand it to GATK.
         rank = priority if isinstance(priority, str) else ','.join(priority)
         parts += ['-priority', rank]
     for name, vcf in vcf_dict.items():
         if prioritize:
             # tagged input so the name can be referenced by -priority
             parts.append('--variant:' + name + ' ' + vcf)
         else:
             parts.append('--variant ' + vcf)
     run(' '.join(parts))
     return out_vcf
Beispiel #7
0
def fa2phylip(fa, output, jar):
    ''' transfer fasta file to phylip with java tool readSeq
    :param fa: fasta file
    :param output: path of the phylip file to write
    :param jar: path to readseq.jar
    :returns: output
    '''
    # Hand the requested output path to readseq; previously it was returned
    # but never passed to the tool.
    cmd = ' '.join(['java -cp', jar, 'run -f 12', '-o', output, fa])
    run(cmd)
    return output
Beispiel #8
0
def process_pilon_out(log, outdir, prefix):
    """ Run ``pilon_metrics`` on a pilon log
        log: logfile, passed as ``-l``
        outdir: output directory, passed as ``-d``
        prefix: passed as ``--out_prefix``
        returns the command line that was run
    """
    argv = ['pilon_metrics']
    argv += ['-d', outdir]
    argv += ['-l', log]
    argv += ['--out_prefix', prefix]
    cmd = ' '.join(argv)
    run(cmd)
    return cmd
Beispiel #9
0
def FastTreeDP(in_fa, out_prefix):
    ''' Run ``FastTreeDP -nt`` and redirect its stdout to a newick file
    :param in_fa: input fasta file
    :param out_prefix: output file prefix
    :returns: ``out_prefix + '.nwk'``
    '''
    out_nwk = out_prefix + '.nwk'
    run(' '.join(['FastTreeDP -nt', in_fa, '>', out_nwk]))
    return out_nwk
Beispiel #10
0
def fasttree(fa, prefix):
    ''' Run FastTreeDP
    :param fa: fasta file
    :param prefix: output prefix
    :returns: path of the newick file, ``prefix + '.nwk'``
    '''
    out_nwk = prefix + '.nwk'
    # The redirect target used to reference an undefined name (out_prefix).
    cmd = ' '.join([
        'FastTreeDP -nt', fa, '>', out_nwk
    ])
    run(cmd)
    return out_nwk
Beispiel #11
0
def tabix(file, type=None):
    """ Run ``tabix`` on a file
    :param file: input file
    :param type: value for ``-p`` (e.g. vcf); the flag is left out when
                 this is empty or None
    :returns: ``file + '.tbi'``
    """
    pieces = ['tabix ' + file]
    if type:
        pieces.append(' -p ' + type)
    run(''.join(pieces))
    return file + '.tbi'
Beispiel #12
0
 def cal_dos(self, haploid=True):
     """ Load the per-sample GT calls of the VCF into a data frame

     ``bcftools query`` writes one line per record to
     ``<self._prefix>.dos.tsv``; the table is read into
     ``self._dosage_matrix`` with '.' treated as missing.

     :param haploid: must be true; anything else is rejected
     :returns: self
     :raises ValueError: when ``haploid`` is false
     """
     dos_file = self._prefix + '.dos.tsv'
     if not haploid:
         raise ValueError("Not yet support polyploid.")
     query = "bcftools query -f '[%GT ]\\n' " + self._vcf + '>' + dos_file
     run(query)
     self._dosage_matrix = pd.read_csv(
         dos_file, sep=r'\s+', header=None, na_values='.')
     return self
Beispiel #13
0
def ramxl(phylip, output, threads):
    ''' Run RAxML (``raxmlHPC-PTHREADS-SSE3``)
    :param phylip: input phylip format file, passed as ``-s``
    :param output: run name, passed as ``-n``
    :param threads: number of threads, passed as ``-T``
    :returns: output
    '''
    fixed_opts = ('raxmlHPC-PTHREADS-SSE3 -p 78960 -f a -x 12345 '
                  '-N 1000 -m GTRCAT')
    argv = [fixed_opts, '-T', str(threads), '-n', output, '-s', phylip]
    run(' '.join(argv))
    return output
Beispiel #14
0
 def get_info(self, info=('AF',)):
     """ Get variant site level info of interest

     Runs ``bcftools query`` on ``self._vcf``, redirects the result to
     ``self._site_info_tsv`` and loads it into ``self._site_info`` with the
     columns ``CHR``, ``ID`` followed by the requested fields.

     :param info: names of the INFO fields to extract; a single name given
                  as a string is accepted as well
     """
     info = [info] if isinstance(info, str) else list(info)
     header = ['CHR', 'ID'] + info
     fields = ['%CHROM', '%CHROM-%POS-%REF-%ALT{0}']
     fields += ['%' + i for i in info]
     # bcftools emits no record separator unless the format asks for one,
     # so end the format with the "\n" escape (kept as backslash + n so
     # bcftools, not the shell, expands it).
     query_string = "'" + '\t'.join(fields) + '\\n' + "'"
     cmd = ' '.join([
         "bcftools query -f", query_string, self._vcf, '>',
         self._site_info_tsv])
     run(cmd)
     self._site_info = pd.read_csv(self._site_info_tsv, sep='\t',
                                   header=None, names=header)
Beispiel #15
0
def filterGatkGenotypes(vcf, out_prefix):
    """ Run ``filterGatkGenotypes.py`` with fixed cutoffs
    (GQ 50, alt fraction in AD 0.8, total DP 10)
    :param vcf: input vcf file
    :param out_prefix: output prefix
    :returns: ``out_prefix + '_GQ50_AD08_DP10.vcf'``, the redirect target
    """
    outfile = out_prefix + '_GQ50_AD08_DP10.vcf'
    script = 'filterGatkGenotypes.py --min_GQ 50 --min_percent_alt_in_AD 0.8'
    run(' '.join([script, '--min_total_DP 10', vcf, '>', outfile]))
    return outfile
Beispiel #16
0
 def create_snpeff_db(gff3, dir, genome, config, prefix, ram, jar, ref_fa):
     """ Create snpEff database by calling ``snpeff_db.sh``
     gff3: gff file of gene annotation
     dir: first argument handed to snpeff_db.sh
     genome: name of the reference genome
     config: snpEff config files (not used by the body)
     prefix: output Prefix (not used by the body)
     ram: RAM in GB
     jar: snpEff jar
     ref_fa: reference fasta file
     returns the command line that was run
     """
     # Build the command first so it can be returned (it used to be an
     # undefined name), and stringify ram so a numeric value can be joined.
     cmd = ' '.join(['snpeff_db.sh', dir, jar, genome, ref_fa, gff3,
                     str(ram)])
     run(cmd)
     return cmd
Beispiel #17
0
def vcf_snp_to_fasta(invcf, prefix, max_amb=10):
    ''' snp only vcf to fasta file
    :param invcf: input vcf file
    :param prefix: output file prefix
    :param max_amb: maximum number of samples with ambiguous calls for a site
                    to be included, recommended number of samples 10%, use
                    a very large number to disable this function 100000 (
                    legacy options and will not be maintained.)
    :returns: path of the fasta file, ``prefix + '.fasta'``
    '''
    out_fa = prefix + '.fasta'
    # max_amb is numeric by default; str.join only accepts strings.
    cmd = ' '.join(['vcfSnpsToFasta.py --max_amb_samples', str(max_amb),
                    invcf, '>', out_fa])
    run(cmd)
    return out_fa
Beispiel #18
0
def filter_variants(invcf, outvcf, min_GQ=50, AD=0.8, DP=10):
    """ Run ``filterGatkGenotypes.py`` with GQ, AD and DP cutoffs
    :param invcf: input vcf
    :param outvcf: output vcf (redirect target)
    :param min_GQ: value for ``--min_GQ``
    :param AD: value for ``--min_percent_alt_in_AD``
    :param DP: value for ``--min_total_DP``
    :returns: outvcf
    """
    cutoffs = (
        ('--min_GQ', min_GQ),
        ('--min_percent_alt_in_AD', AD),
        ('--min_total_DP', DP),
    )
    argv = ['filterGatkGenotypes.py']
    for flag, value in cutoffs:
        argv += [flag, str(value)]
    argv += [invcf, '>', outvcf]
    run(' '.join(argv))
    return outvcf
Beispiel #19
0
 def genotype_concordance(self, comp, eval, hap=False):
     ''' Run GATK GenotypeConcordance on two VCFs
     :param comp: VCF file for comparison
     :param eval: VCF file for evaluation
     :param hap: whether input is haploid VCF (not used by the body)
     :returns: path of the result file,
               ``self.out_dir + '/' + self.prefix + '.txt'``
     '''
     out = self.out_dir + '/' + self.prefix + '.txt'
     argv = [self.cmd, '-T GenotypeConcordance']
     argv += ['--comp', comp]
     argv += ['--eval', eval]
     argv += ['--out', out]
     run(' '.join(argv))
     return out
Beispiel #20
0
def snpeff_db(gff3, dir, genome, config, prefix, ram, jar, ref_fa):
    """ Create snpEff database by calling ``sh snpeff_db.sh``
    gff3: gff file of gene annotation
    dir: first argument handed to snpeff_db.sh
    genome: name of the reference genome
    config: snpEff config files (not used by the body)
    prefix: output Prefix (not used by the body)
    ram: RAM in GB
    jar: snpEff jar; its directory is what gets passed to the script
    ref_fa: reference fasta file
    returns the command line that was run
    """
    snpeff_dir = os.path.dirname(jar)
    # ram is stringified so a numeric value does not break the join,
    # matching how the other wrappers treat it.
    cmd = ' '.join(['sh snpeff_db.sh', dir, snpeff_dir, genome, ref_fa, gff3,
                    str(ram)])
    run(cmd)
    return cmd
Beispiel #21
0
 def snpeff_annot(self, jar, config, genome, ram):
     """ run SNPEFF on the vcf of this object
     jar: snpeff jar
     config: configuration file
     genome: tag of genome name
     ram: memory in GB

     The annotated output is piped through bgzip into ``self.ann_vcf``, a
     file name derived from the base name of ``self._vcf``.
     returns self
     """
     base = os.path.basename(self._vcf)
     # Only the LAST 'vcf' (the extension) is rewritten; a plain replace
     # also mangled names that contain 'vcf' elsewhere.
     head, ext, tail = base.rpartition('vcf')
     if ext:
         self.ann_vcf = head + 'snpeff.vcf' + tail
     else:
         # No 'vcf' in the name: never reuse the input's own name as output.
         self.ann_vcf = base + '.snpeff.vcf.gz'
     run(' '.join([
         'java -Xmx'+str(ram)+'g', '-jar', jar, 'ann', '-v',
         '-c', config, '-i vcf -o vcf', genome,
         self._vcf, '| bgzip >', self.ann_vcf]))
     return self
Beispiel #22
0
 def select_var(self, in_vcf, xl=None, il=None):
     ''' select variants with GATK SelectVariants
     :param in_vcf: input vcf
     :param xl: intervals to exclude (``-XL``)
     :param il: intervals to include (``-L``)
     :returns: path of the output vcf, ``self.prefix + '.vcf.gz'``
     '''
     output = self.prefix + '.vcf.gz'
     cmd = ' '.join([
         self.cmd, '-T SelectVariants', '--variant', in_vcf, '-o', output
     ])
     # Each optional flag needs its own leading blank; without it the flag
     # was glued onto the output file name.
     if xl is not None:
         cmd += ' -XL ' + xl
     if il is not None:
         cmd += ' -L ' + il
     run(cmd)
     return output
def coverage_barplot(cov_tsv, prefix, color_csv, legacy, no_sub):
    """ Run ``coverage_barplot.R`` to draw a coverage barplot
    :param cov_tsv: coverage profile list, passed as ``-i``
    :param prefix: output prefix, passed as ``-p``
    :param color_csv: passed as ``-c``
    :param legacy: if output tsv in legacy mode, compatible with matlab code
                   (adds ``-l``)
    :param no_sub: boolean, whether input genome has a subgenome
                   (adds ``--nosub``)
    :returns: 1
    """
    argv = ['coverage_barplot.R', '-i', cov_tsv, '-p', prefix,
            '-c', color_csv]
    if legacy:
        argv.append('-l')
    if no_sub:
        argv.append('--nosub')
    run(' '.join(argv))
    print(' - Finish generating coverage plot.')
    return 1
Beispiel #24
0
def snpeff(invcf, outvcf, jar, config, genome, ram):
    """ Run the snpEff ``eff`` command on a vcf
    invcf: input vcf
    outvcf: output vcf (redirect target of stdout)
    jar: snpeff jar
    config: configuration file
    genome: tag of genome name
    ram: memory in GB
    returns the command line that was run
    """
    argv = ['java -Xmx' + str(ram) + 'g']
    argv += ['-jar', jar]
    argv += ['eff', '-v']
    argv += ['-c', config]
    argv.append('-onlyCoding False')
    argv += ['-i vcf -o vcf', genome, invcf, '>', outvcf]
    cmd = ' '.join(argv)
    run(cmd)
    return cmd
Beispiel #25
0
 def variant_eval(self, vcf, titv=True, samp=True, indel=True, multi=True):
     ''' Run GATK VariantEval with CountVariants plus optional modules
     :param vcf: input vcf
     :param titv: use TiTv Evaluator
     :param samp: stratify by samples
     :param indel: use InDel Evaluator
     :param multi: summarize multiallelic sites
     :returns: path of the report, ``<self.out_dir>/<self.prefix>.eval``
     '''
     out = os.path.join(self.out_dir, self.prefix + '.eval')
     base = ' '.join([
         self.cmd, '-T VariantEval', '--eval', vcf, '-o', out,
         '-noEV -noST -EV CountVariants'
     ])
     # (switch, flag) pairs, in the order the flags are appended
     optional = (
         (titv, ' -EV TiTvVariantEvaluator'),
         (samp, ' -ST Sample'),
         (indel, ' -EV IndelSummary'),
         (multi, ' -EV MultiallelicSummary'),
     )
     cmd = base + ''.join(flag for wanted, flag in optional if wanted)
     run(cmd)
     return out
Beispiel #26
0
 def import_snpeff(self, snpeff_tsv=None):
     """ Load snpEff annotations into ``self._site_info``

     :param snpeff_tsv: table to load; when None it is first produced from
                        ``self._vcf`` with ``bcftools query`` and written to
                        ``<self._prefix>.snpeff.tsv``
     :returns: self
     """
     # Defined up front: the column names are needed for read_csv even when
     # the caller supplies a ready-made table.
     info_fields = [
         'AF', 'AN', 'AC',
         'SNPEFF_AMINO_ACID_CHANGE',
         'SNPEFF_CODON_CHANGE',
         'SNPEFF_EFFECT',
         'SNPEFF_EXON_ID',
         'SNPEFF_FUNCTIONAL_CLASS',
         'SNPEFF_GENE_BIOTYPE',
         'SNPEFF_GENE_NAME',
         'SNPEFF_IMPACT',
         'SNPEFF_TRANSCRIPT_ID'
     ]
     if snpeff_tsv is None:
         snpeff_tsv = self._prefix+'.snpeff.tsv'
         query = ('\'%CHROM\t%POS\t%REF\t%ALT\t'
                  + '\t'.join(['%INFO/'+i for i in info_fields])
                  + '\n\'')
         run('bcftools query -f {} '.format(query)
             + self._vcf+'> '+snpeff_tsv)
     self._site_info = pd.read_csv(
         snpeff_tsv, sep='\t', header=None,
         names=['CHR', 'POS', 'REF', 'ALT']+info_fields)
     return self
Beispiel #27
0
 def af(self):
     """ get allele frequencies using vcftools

     Runs ``vcftools --freq2`` on ``self._vcf`` with ``tmp`` as the output
     prefix, loads ``tmp.frq`` into ``self._af`` and removes the file.

     :returns: self
     """
     # 'self' was missing from the signature although the body relies on it.
     run("vcftools --gzvcf " + self._vcf + " --freq2 --out tmp")
     self._af = pd.read_csv('tmp.frq', sep='\t', header=0)
     rm('tmp.frq')
     return self