def ligate_shapeitchunks(self, vcf_f, scaffolded_samples, chunk_str, output_prefix, verbose=False):
    '''
    Ligate the haplotype chunks produced by SHAPEIT by running ligateHAPLOTYPES
    (see https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#haplegsample)

    Parameters
    ----------
    vcf_f : filename
        VCF file with the genotype likelihoods (passed as --vcf)
    scaffolded_samples : filename
        File listing, one per line, the samples that have been scaffolded
        (passed as --scaffold)
    chunk_str : str
        Space-separated paths of the files generated by SHAPEIT for the
        different chromosome chunks
        (i.e. 's2.chunk1.hap.gz s2.chunk2.hap.gz s2.chunk3.hap.gz')
    output_prefix : str
        Prefix for the output files. '.ligated.haps.gz' and
        '.ligated.haps.sample' are appended to it
    verbose : bool, optional
        If True, print the command line before running it. Default=False

    Returns
    -------
    dict
        A dict with the path to the 2 output files, under the keys
        'hap_gz' and 'hap_sample'

    Raises
    ------
    Exception
        If self.ligateHAPLOTYPES_folder is None
    '''
    if self.ligateHAPLOTYPES_folder is None:
        raise Exception("ligateHAPLOTYPES_folder must be defined")

    haps_gz = '{0}.ligated.haps.gz'.format(output_prefix)
    haps_sample = '{0}.ligated.haps.sample'.format(output_prefix)

    Arg = namedtuple('Argument', 'option value')
    ligate = RunProgram(
        path=self.ligateHAPLOTYPES_folder,
        program='ligateHAPLOTYPES',
        args=[
            Arg('--vcf', vcf_f),
            Arg('--scaffold', scaffolded_samples),
            Arg('--chunks', chunk_str),
            # both output files are given to a single --output option
            Arg('--output', haps_gz + ' ' + haps_sample),
        ])

    if verbose is True:
        print("Command line is: {0}".format(ligate.cmd_line))

    ligate.run_checkoutput()

    return {'hap_gz': haps_gz, 'hap_sample': haps_sample}
def drop_info(self, outfile, verbose=False):
    '''
    Strip the INFO annotation from self.vcf by running
    'bcftools annotate --remove INFO'

    Parameters
    ----------
    outfile : filename
        File where the output VCF will be written (bgzip-compressed, -O z)
    verbose : bool, optional
        If True, print the command line before running it. Default=False

    Returns
    -------
    filename
        Path to the vcf.gz file without the INFO annotation (same as 'outfile')
    '''
    Arg = namedtuple('Argument', 'option value')

    annotate = RunProgram(path=self.bcftools_folder,
                          program='bcftools annotate --remove INFO',
                          args=[Arg('-o', outfile), Arg('-O', 'z')],
                          parameters=[self.vcf])

    if verbose is True:
        print("Command line is: {0}".format(annotate.cmd_line))

    annotate.run_checkoutput()

    return outfile
def run_vtnormalize(self, outprefix, reference, compress=False, verbose=False, outdir=None, n=False):
    '''
    Normalize self.vcf with 'vt normalize'

    Parameters
    ----------
    outprefix : str, required
        Prefix for the output file. '.norm.vcf' (plus '.gz' when compressing)
        is appended to it
    reference : str, required
        Path to Fasta file with reference
    compress : boolean, optional
        If True, pipe the normalized VCF through bgzip. If False or None,
        vt writes the file itself through its -o option. Default=False
    verbose : bool, optional
        If True, print the command line before running it
    outdir : str, optional
        If provided, then put output files in this folder
    n : bool, optional
        Pass -n to vt: warns but does not exit when REF is inconsistent
        with reference sequence for non SNPs. Default=False

    Returns
    -------
    A string with path to normalized file

    Raises
    ------
    Exception
        If self.vt_folder is None
    '''
    if self.vt_folder is None:
        raise Exception("Provide a vt_folder containing the vt binary")

    Arg = namedtuple('Argument', 'option value')

    base = "{0}/{1}".format(outdir, outprefix) if outdir else outprefix
    outfile = base + ".norm.vcf"

    vt_args = [Arg('-r', reference)]
    vt_params = [self.vcf] + (['-n'] if n is True else [])

    downstream = None
    if compress is True:
        # vt writes to stdout and bgzip redirects into the final file
        outfile += ".gz"
        bgzip = RunProgram(path=self.bgzip_folder, program='bgzip',
                           parameters=['-c', '>', outfile])
        downstream = [bgzip]
    elif compress is None or compress is False:
        vt_args.append(Arg('-o', outfile))

    vt = RunProgram(path=self.vt_folder, program='vt normalize',
                    args=vt_args, parameters=vt_params, downpipe=downstream)

    if verbose is True:
        print("Command line for running vt normalize is: {0}".format(vt.cmd_line))

    vt.run_checkoutput()

    return outfile
def run_CollectVariantCallingMetrics(self, outprefix, truth_vcf, intervals=None, verbose=None):
    '''
    Run Picard's CollectVariantCallingMetrics on self.vcf

    Parameters
    ----------
    outprefix : str
        Prefix for outfiles: prefix.variant_calling_detail_metrics
        and prefix.variant_calling_summary_metrics.
    truth_vcf : str
        Reference VCF file (passed as DBSNP).
    intervals : str, optional
        Target intervals to restrict analysis to (passed as TI).
    verbose : bool, optional
        If True, print the command line before running it.

    Returns
    -------
    CollectVCallingMetrics object
        Built from the paths of the two metrics files

    Raises
    ------
    Exception
        If self.picard_folder is None
    '''
    if self.picard_folder is None:
        raise Exception("Provide a picard folder")

    Arg = namedtuple('Argument', 'option value')

    picard_args = [Arg('I', self.vcf), Arg('O', outprefix), Arg('DBSNP', truth_vcf)]
    if intervals:
        picard_args += [Arg('TI', intervals)]

    picard_cmd = 'java -jar {0}/picard.jar CollectVariantCallingMetrics'.format(
        self.picard_folder)
    # Picard takes OPTION=value pairs, hence the '=' separator
    picard = RunProgram(program=picard_cmd, args=picard_args, arg_sep="=")

    if verbose is True:
        print("Command line is: {0}".format(picard.cmd_line))

    picard.run_checkoutput()

    detail_file = outprefix + ".variant_calling_detail_metrics"
    summary_file = outprefix + ".variant_calling_summary_metrics"

    return CollectVCallingMetrics(vc_detail_metrics_file=detail_file,
                                  vc_summary_metrics_file=summary_file)
def run_bcftoolsnorm(self, outprefix, reference, multiallelics=None, type=None, outdir=None, verbose=False):
    '''
    Normalize self.vcf with 'bcftools norm'

    Parameters
    ----------
    outprefix : str, required
        Prefix for the output file. '.norm.vcf.gz' is appended to it
    reference : str, required
        Path to Fasta file with reference
    multiallelics : str, optional
        Operate on multiallelic variants and either split or merge them.
        Possible values are: 'split'/'merge'
    type : str, optional
        Required when 'multiallelics' is defined: operate on this type of
        variant. Possible values are: snps|indels|both|any
    outdir : str, optional
        If provided, then put output files in this folder
    verbose : bool, optional
        If True, print the command line before running it

    Returns
    -------
    A string with path to normalized file

    Raises
    ------
    Exception
        If 'multiallelics' is not None/'split'/'merge', or if it is
        'split'/'merge' and 'type' is None
    '''
    Arg = namedtuple('Argument', 'option value')

    base = "{0}/{1}".format(outdir, outprefix) if outdir else outprefix
    outfile = base + ".norm.vcf.gz"

    norm_args = [Arg('-f', reference), Arg('-o', outfile)]

    if multiallelics == "split" or multiallelics == "merge":
        if type is None:
            raise Exception("'multiallelics' option is defined, so please provide a 'type' value")
        # bcftools norm -m takes '-<type>' to split and '+<type>' to merge
        sign = "-" if multiallelics == "split" else "+"
        norm_args.append(Arg('-m', "'" + sign + "{0}'".format(type)))
    elif multiallelics is not None:
        raise Exception("'multiallelics' value is not recognized: {0}".format(multiallelics))

    norm = RunProgram(path=self.bcftools_folder, program='bcftools norm',
                      args=norm_args, parameters=[self.vcf, '-Oz'])

    if verbose is True:
        print("Command line for running bcftools norm is: {0}".format(norm.cmd_line))

    norm.run_checkoutput()

    return outfile
def prepare_Gen_From_Beagle4(self, prefix_in, outprefix, threshold=0.995, verbose=False):
    '''
    Method that uses prepareGenFromBeagle4 in order to convert the different Beagle chunks
    generated by 'self.make_beagle_chunks' into a single concatenated output that can be used
    with SHAPEIT.
    see https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#gettingstarted

    Parameters
    ----------
    prefix_in : str
        Prefix used in the output of the different Beagle chunks after running
        method 'self.run_beagle'. i.e. output.beagle4.22.
        All files matching '<prefix_in>*.vcf.gz' are passed as posteriors
    outprefix : str
        Prefix used for output files. i.e. If prefix 'input.shapeit.chr22' is used,
        then it will generate the following files:
        input.shapeit.chr22.gen.gz
        input.shapeit.chr22.gen.sample
        input.shapeit.chr22.hap.gz
        input.shapeit.chr22.hap.sample
    threshold : float, optional
        Threshold meaning that all genotypes with a posterior above it are directly
        fixed and will only need phasing in the SHAPEIT step. Passed to the tool as
        --threshold. Default: 0.995
    verbose : bool, optional
        If True, print the command line used for running this tool. Default=False

    Returns
    -------
    dict
        A dict with the path to the 4 output files (keys 'gen_gz', 'gen_sample',
        'hap_gz', 'hap_sample') that can be used with SHAPEIT

    Raises
    ------
    Exception
        If self.prepareGenFromBeagle4_folder is None
    '''
    if self.prepareGenFromBeagle4_folder is None:
        raise Exception("Provide the folder for the prepareGenFromBeagle4 binary")

    # glob pattern, left for the shell to expand
    posteriors = "{0}*.vcf.gz".format(prefix_in)

    Arg = namedtuple('Argument', 'option value')

    args = [
        Arg('--likelihoods', self.vcf),
        Arg('--posteriors', posteriors),
        # 'threshold' used to be accepted but silently dropped
        Arg('--threshold', threshold),
        Arg('--output', outprefix)
    ]

    runner = RunProgram(path="{0}/".format(self.prepareGenFromBeagle4_folder),
                        program='prepareGenFromBeagle4',
                        args=args)

    if verbose is True:
        print("Command line for running prepareGenFromBeagle4 is: {0}".format(runner.cmd_line))

    runner.run_checkoutput()

    return {
        'gen_gz': '{0}.gen.gz'.format(outprefix),
        'gen_sample': '{0}.gen.sample'.format(outprefix),
        'hap_gz': '{0}.hap.gz'.format(outprefix),
        'hap_sample': '{0}.hap.sample'.format(outprefix)
    }
def select_variants(self, outprefix, uncalled=None, threads=1, verbose=None):
    '''
    Run 'bcftools view' on self.vcf and write a bgzip-compressed VCF

    Parameters
    ----------
    outprefix : str
        Prefix used for the output file. '.onlyvariants.vcf.gz' is appended to it
    uncalled : {'exclude','include'}, optional.
        'exclude' passes -U and 'include' passes -u to bcftools view, to
        exclude/select sites with an uncalled genotype. Any other value adds
        no flag
    threads : int, optional
        Number of output compression threads to use in addition to main thread.
        Default=1
    verbose : Boolean, optional
        If True, print the command line before running it

    Returns
    -------
    filename
        Returns the path for the VCF with the selected variants
    '''
    outfile = outprefix + ".onlyvariants.vcf.gz"

    Arg = namedtuple('Argument', 'option value')

    if uncalled == 'exclude':
        uncalled_flag = ['-U']
    elif uncalled == 'include':
        uncalled_flag = ['-u']
    else:
        uncalled_flag = []

    view = RunProgram(
        path=self.bcftools_folder,
        program='bcftools view',
        args=[Arg('-o', outfile), Arg('-O', 'z'), Arg('--threads', threads)],
        # the input VCF goes last, after any flag
        parameters=uncalled_flag + [self.vcf])

    if verbose is True:
        print("Command line is: {0}".format(view.cmd_line))

    view.run_checkoutput()

    return outfile
def make_beagle_chunks(self, window, overlap, outfile, verbose=True):
    '''
    Method to generate the chromosome chunks for Beagle
    see https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#gettingstarted

    Parameters
    ----------
    window : int
        The chunk size (--window) in number of variant sites
    overlap : int
        The overlap size (--overlap) in number of variant sites
    outfile : filename
        Output file name. i.e. 'chunk.coordinates'
    verbose : bool, optional
        If True, then print the command line used for running this tool. Default=True

    Returns
    -------
    filename
        Path to file with the coordinates of the chunk (same as 'outfile')

    Raises
    ------
    Exception
        If self.makeBGLCHUNKS_folder is None
    '''
    if self.makeBGLCHUNKS_folder is None:
        raise Exception("Provide the folder for the makeBGLCHUNKS binary")

    Arg = namedtuple('Argument', 'option value')

    args = [
        Arg('--vcf', self.vcf),
        Arg('--window', window),
        Arg('--overlap', overlap),
        Arg('--output', outfile)
    ]

    runner = RunProgram(path="{0}/".format(self.makeBGLCHUNKS_folder),
                        program='makeBGLCHUNKS',
                        args=args)

    # The command line is only reported on request; a stray unconditional
    # print used to emit it even with verbose=False (and twice with verbose=True)
    if verbose is True:
        print("Command line for running makeBGLCHUNKS is: {0}".format(
            runner.cmd_line))

    runner.run_checkoutput()

    return outfile
def run_gatk_VariantsToAllelicPrimitives(self, outprefix, reference, outdir=None, compress=None, verbose=None):
    '''
    Run GATK VariantsToAllelicPrimitives in order to decompose
    MNPs into more basic/primitive alleles

    Parameters
    ----------
    outprefix : str, required
        Prefix for output files. '.aprimitives.vcf' (plus '.gz' when
        compressing) is appended to it
    reference : str, required
        Path to fasta file containing the reference
    outdir : str, optional
        If provided, then put output files in this folder
    compress : boolean, optional
        If True, bgzip the decomposed VCF and delete the uncompressed file
        and its '.idx' index. False or None (the default) keep the plain VCF
    verbose : bool, optional
        If True, print the command line before running it

    Returns
    -------
    A string with path to decomposed file

    Raises
    ------
    Exception
        If self.gatk_folder is None, or if 'compress' is not a bool/None
    '''
    if self.gatk_folder is None:
        raise Exception("Error. I need that the folder containing the GATK "
                        "jar file is defined!")

    # Validate up front so an invalid argument does not cost a full GATK run.
    # None is accepted as "do not compress": it is this method's own default.
    if compress is not None and not isinstance(compress, bool):
        raise Exception("'compress' parameter is not valid")

    if outdir:
        outprefix = "{0}/{1}".format(outdir, outprefix)

    outfile = outprefix + ".aprimitives.vcf"

    Arg = namedtuple('Argument', 'option value')

    args = [
        Arg('-T', 'VariantsToAllelicPrimitives'),
        Arg('-R', reference),
        Arg('-V', self.vcf),
        Arg('-o', outfile)
    ]

    runner = RunProgram(
        program='java -jar {0}/GenomeAnalysisTK.jar'.format(self.gatk_folder),
        args=args)

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    # The result is not needed here, so it is not unpacked: the other callers
    # of run_popen in this file unpack three values, not two
    runner.run_popen()

    if compress is not True:
        return outfile

    compressed = outfile + ".gz"
    compressRunner = RunProgram(path=self.bgzip_folder, program='bgzip',
                                parameters=['-c', outfile, '>', compressed])
    compressRunner.run_checkoutput()

    # delete tmp files
    os.remove(outfile)
    os.remove(outfile + ".idx")

    return compressed
def reheader(self, newheader, outprefix, samplefile=None, verbose=False):
    '''
    Replace the header of self.vcf by running 'bcftools reheader'

    Parameters
    ----------
    newheader : string
        Path to the file containing the new header (passed as -h)
    outprefix : string
        Prefix for the output file. '.reheaded.vcf.gz' is appended to it
    samplefile : string, optional
        Path to the file with the sample names that will be included
        in the new header (passed as -s)
    verbose : bool, optional
        If True, print the command line before running it. Default=False

    Returns
    -------
    filename
        Path to the VCF with the modified header
    '''
    outfile = outprefix + ".reheaded.vcf.gz"

    Arg = namedtuple('Argument', 'option value')

    sample_args = [] if samplefile is None else [Arg('-s', samplefile)]
    reheader_args = [Arg('-h', newheader), Arg('-o', outfile)] + sample_args

    bcftools = RunProgram(path=self.bcftools_folder,
                          program='bcftools reheader',
                          args=reheader_args,
                          parameters=[self.vcf])

    if verbose is True:
        print("Command line is: {0}".format(bcftools.cmd_line))

    bcftools.run_checkoutput()

    return outfile
def filter(self, name, expression, verbose=None):
    '''
    Run 'bcftools filter' on self.vcf

    Parameters
    ----------
    name : str
        Annotate FILTER column with <str> (passed as -s)
    expression : str
        Exclude sites for which expression is true (passed as -e, wrapped in
        single quotes). i.e. 'INFO/DP>24304 | MQ<34'
    verbose : bool, optional
        If True, print the command line before running it

    Returns
    -------
    filename
        Path to the filtered VCF file, which is self.vcf + '.filtered.vcf.gz'
    '''
    Arg = namedtuple('Argument', 'option value')

    outfile = self.vcf + ".filtered.vcf.gz"
    # single quotes protect the expression (it can contain '|', '<', '>')
    quoted_expression = "'{0}'".format(expression)

    bcftools = RunProgram(
        path=self.bcftools_folder,
        program='bcftools filter',
        args=[
            Arg('-s', name),
            Arg('-e', quoted_expression),
            Arg('-o', outfile),
            Arg('-O', 'z'),
        ],
        parameters=[self.vcf])

    if verbose is True:
        print("Command line is: {0}".format(bcftools.cmd_line))

    bcftools.run_checkoutput()

    return outfile
def number_variants_in_region(self, region, outprefix, verbose=None):
    '''
    Count the variants of self.vcf in each region of a BED file by running
    'bedtools coverage -counts'

    Parameters
    ----------
    region : str
        String with path to the BED file containing the regions for which
        the number will be calculated (passed as -a).
    outprefix : str
        Prefix for outfile. '.counts' is appended to it.
    verbose : bool, optional
        If True, print the command line before running it.

    Returns
    -------
    filename
        File with the number of variants for each particular region.
    '''
    counts_file = outprefix + ".counts"

    Arg = namedtuple('Argument', 'option value')

    bedtools = RunProgram(
        path=self.bedtools_folder,
        program='bedtools coverage',
        args=[Arg('-a', region), Arg('-b', self.vcf)],
        # the command's stdout is redirected into the counts file
        parameters=['-counts', '>', counts_file])

    if verbose is True:
        print("Command line is: {0}".format(bedtools.cmd_line))

    bedtools.run_checkoutput()

    return counts_file
def convert_PL2GL(self, outfile, threads=1, verbose=False):
    '''
    Convert the PL fields of self.vcf into GL by running the
    'bcftools +tag2tag' plugin with '-r --pl-to-gl'

    Parameters
    ----------
    outfile : filename
        File where the output VCF will be written (bgzip-compressed, -Oz)
    threads : int, optional
        Number of threads to use. Default=1
    verbose : bool, optional
        If True, print the command line before running it. Default=False

    Returns
    -------
    filename
        Path to the vcf.gz file with the PL fields converted (same as 'outfile')
    '''
    Arg = namedtuple('Argument', 'option value')

    bcftools_args = [Arg('--threads', threads), Arg('-o', outfile)]
    # everything after '--' is handed to the plugin itself
    plugin_params = [self.vcf, '-Oz', '--', '-r', '--pl-to-gl']

    tag2tag = RunProgram(path=self.bcftools_folder,
                         program='bcftools +tag2tag',
                         args=bcftools_args,
                         parameters=plugin_params)

    if verbose is True:
        print("Command line is: {0}".format(tag2tag.cmd_line))

    tag2tag.run_checkoutput()

    return outfile
def run_variantrecalibrator(self, resources, mode, max_gaussians=None, intervals=None,
                            annotations=None, tranches=None, outprefix="recalibrate",
                            verbose=None, log_file=None):
    '''
    Run GATK's VariantRecalibrator on a VcfFilter object

    Parameters
    ----------
    resources : filename
        JSON file with resources to add using the -resource option, Required.
        It must hold a 'resources' list whose items have the keys
        'resource', 'known', 'training', 'truth', 'prior' and 'path'
    mode : {'SNP','INDEL'}
        Recalibration mode to employ
    max_gaussians : int, optional
        Max number of Gaussians for the positive model (passed as --maxGaussians)
    intervals : chr1:1-1000, optional
        One or more genomic intervals over which to operate (passed as -L)
    annotations : list, optional
        List of annotations to be used.
        Default=['DP', 'QD', 'FS', 'SOR', 'MQ', 'MQRankSum',
        'ReadPosRankSum', 'InbreedingCoeff']
    tranches : list, optional
        Each element in the list will correspond to the levels of truth
        sensitivity at which to slice the data (in percent, that is 1.0 for
        1 percent). A string holding a Python list literal is also accepted.
        Default=[100.0,99.9,99.0,90.0]
    outprefix : str, optional
        Out prefix used for -recalFile, -tranchesFile, -rscriptFile;
        '_<mode>' is appended to it. Default=recalibrate
    verbose : bool, optional
        If True, print the command line before running it
    log_file : filename, optional
        Path to file that will be used for logging the GATK stderr and stdout

    Returns
    -------
    dict
        Dictionary with location of tranches and recal files, under the
        keys 'recal_f' and 'tranches_f'

    Raises
    ------
    Exception
        If self.caller is not 'UG', if 'mode' is not valid, or if the run
        did not leave exactly one *.recal and one *.tranches file
    '''
    if annotations is None:
        annotations = [
            'DP', 'QD', 'FS', 'SOR', 'MQ', 'MQRankSum', 'ReadPosRankSum',
            'InbreedingCoeff'
        ]
    if tranches is None:
        tranches = [100.0, 99.9, 99.0, 90.0]

    if self.caller != 'UG':
        raise Exception("VCF caller %s is incompatible" % self.caller)

    if mode not in ('SNP', 'INDEL'):
        # The message used to be %-formatted without any placeholder, which
        # raised a TypeError instead of reporting the invalid mode
        raise Exception("VariantRecalibrator mode '{0}' is not valid. "
                        "Valid values are 'SNP','INDEL'".format(mode))

    # prepare the prefix used for output files
    outprefix += "_%s" % mode

    Arg = namedtuple('Argument', 'option value')

    args = [
        Arg('-T', 'VariantRecalibrator'),
        Arg('-R', self.reference),
        Arg('-input', self.vcf),
        Arg('-mode', mode),
        Arg('-recalFile', "{0}.recal".format(outprefix)),
        Arg('-tranchesFile', "{0}.tranches".format(outprefix)),
        Arg('-rscriptFile', "{0}_plots.R".format(outprefix))
    ]

    # Read-in the different resources from the resource JSON file
    with open(resources) as data_file:
        data = json.load(data_file)

    for dic in data['resources']:
        args.append(
            Arg("-resource:%s,known=%s,training=%s,"
                "truth=%s,prior=%.1f" %
                (dic['resource'], str(dic['known']).lower(),
                 str(dic['training']).lower(),
                 str(dic['truth']).lower(), dic['prior']),
                dic['path']))

    # prepare the -an options
    args.extend(Arg('-an', elt) for elt in annotations)

    # prepare the list of -tranche option
    if isinstance(tranches, str):
        tranches = ast.literal_eval(tranches)
    args.extend(Arg('-tranche', elt) for elt in tranches)

    # optional settings, previously accepted but never forwarded to GATK
    if intervals is not None:
        args.append(Arg('-L', intervals))
    if max_gaussians is not None:
        args.append(Arg('--maxGaussians', max_gaussians))

    runner = RunProgram(
        program='java -jar {0}/GenomeAnalysisTK.jar'.format(self.gatk_folder),
        args=args,
        log_file=log_file)

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    runner.run_popen()

    recal_f = glob.glob("{0}*.recal".format(outprefix))
    tranches_f = glob.glob("{0}*.tranches".format(outprefix))

    if not recal_f:
        raise Exception(
            "No *.recal files were retrieved after running VariantRecalibrator"
        )
    if not tranches_f:
        raise Exception(
            "No *.tranches files were retrieved after running VariantRecalibrator"
        )
    if len(recal_f) > 1:
        raise Exception("More than one *.recal file was retrieved")
    if len(tranches_f) > 1:
        raise Exception("More than one *.tranches file was retrieved")

    return {'recal_f': recal_f[0], 'tranches_f': tranches_f[0]}
def filter_by_variant_type(self, outprefix, v_type="snps", compress=True, biallelic=False,
                           action="select", verbose=None):
    '''
    Method to filter a VCF file by variant type. For example, to extract
    only the SNPs

    Parameters
    ----------
    outprefix : str
        Prefix used for the output files. The output name is built by joining
        with '.': the prefix, the variant type ('<v_type>' when selecting,
        'no<v_type>' when excluding, nothing for 'both'), 'biallelic' if
        requested, and the extension ('vcf.gz' or 'vcf')
    v_type : {'snps','indels','mnps','other','both'}
        Default=snps
        Extract/Filter (depending on the value of the 'action'
        argument) a certain variant type. With 'both' no -v/-V option is
        passed to bcftools view
    compress : bool, optional
        If True then generate a vcf.gz file. Default=True
    biallelic : bool, optional
        Select only biallelic variants (-m2 -M2). Default=False
    action : {'select', 'exclude'}
        Default=select
    verbose : bool, optional
        If True, print the command line before running it

    Returns
    -------
    filename
        Path to the filtered VCF

    Raises
    ------
    Exception
        If 'v_type' or 'action' is not one of the valid values, or if
        'compress' is None or not a bool
    '''
    if v_type not in ("snps", "indels", "mnps", "other", "both"):
        raise Exception("type value is not valid. Valid values are 'snps'/"
                        "'indels'/'mnps'/'other'/'both'")
    if action not in ("select", "exclude"):
        raise Exception(
            "action value is not valid. Valid values are 'select' or 'exclude'"
        )
    if compress is None:
        raise Exception("'compress' parameter can't be None")
    if not isinstance(compress, bool):
        # any other value used to build a command with neither input nor output
        raise Exception("'compress' parameter must be True or False")

    Arg = namedtuple('Argument', 'option value')

    args = []
    params = []

    # Joining the name parts with '.' keeps the separator even when v_type is
    # 'both' (which previously produced names such as '<prefix>vcf.gz')
    name_parts = [outprefix]

    if v_type != 'both':
        if action == "select":
            name_parts.append(v_type)
            args.append(Arg('-v', v_type))
        else:
            name_parts.append("no{0}".format(v_type))
            args.append(Arg('-V', v_type))

    if biallelic is True:
        name_parts.append("biallelic")
        params.extend(['-m2', '-M2'])

    name_parts.append("vcf.gz" if compress else "vcf")
    outfile = ".".join(name_parts)

    args.extend([Arg('-o', outfile), Arg('-O', 'z' if compress else 'v')])
    params.append(self.vcf)

    runner = RunProgram(path=self.bcftools_folder,
                        program='bcftools view',
                        args=args,
                        parameters=params)

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    runner.run_checkoutput()

    return outfile
def run_bcftools(self, outprefix, E=False, p=False, annots=None, P="ILLUMINA", F=0.002,
                 C=50, m_pileup=1, m_call=False, d=250, v=False, O='z', ploidy="GRCh38",
                 threads=1, S=None, r=None, verbose=True):
    '''
    Run BCFTools mpileup and then pipe to BCFTools call in order to do the variant calling

    Parameters
    ----------
    outprefix : str, Required
        Prefix for output VCF file. i.e. /path/to/file/test
        '.vcf.gz' is appended to it, preceded by the region (with ':' and
        '-' replaced by '_') when 'r' is given
    E : bool, Optional
        mpileup parameter
        Recalculate BAQ on the fly, ignore existing BQ tags. Default=False
    p : bool, Optional
        mpileup parameter
        Apply -m and -F thresholds per sample to increase sensitivity of calling.
        By default both options are applied to reads pooled from all samples.
    annots : list, Optional
        mpileup parameter
        Annotations used to decorate the VCF; each one is passed with its
        own -a option. Default=['DP', 'SP', 'AD']
    P : str, Optional
        mpileup parameter
        Comma-delimited list of platforms (determined by @RG-PL) from which
        indel candidates are obtained. Default= ILLUMINA
    F : float, Optional
        mpileup parameter
        Minimum fraction of gapped reads. Default=0.002
    C : int, Optional
        mpileup parameter
        Coefficient for downgrading mapping quality for reads containing
        excessive mismatches. Default=50
    m_pileup : int, Optional
        mpileup parameter
        Minimum number gapped reads for indel candidates. Default=1
    m_call : boolean, Optional
        call parameter
        Alternative model for multiallelic and rare-variant calling designed to
        overcome known limitations in -c calling model
    d : int, Optional
        mpileup parameter
        At a position, read maximally INT reads per input file. Default=250
    v : bool, Optional
        call parameter
        Output variant sites only
    O : str, Optional
        call parameter
        Output type. Default= 'z'
        Possible values are: BCF (b), uncompressed BCF (u), compressed VCF (z),
        uncompressed VCF (v)
    ploidy : str, Optional
        Predefined ploidy. Default: GRCh38
    threads : int, Optional
        Number of extra output compression threads. Default=1
    S : str, Optional
        call parameter
        File of sample names to include or exclude if prefixed with "^"
    r : str, Optional
        Region used for doing the variant calling in the format chr20:10000-20000
    verbose : bool, Optional
        Increase verbosity. Default= True

    Returns
    -------
    A VCF file with variants
    '''
    # None stands for the documented default; a list literal in the signature
    # would be shared between calls
    if annots is None:
        annots = ['DP', 'SP', 'AD']

    Arg = namedtuple('Argument', 'option value')

    arguments_mpileup = [Arg('-f', self.reference)]
    arguments_mpileup.extend(Arg('-a', a) for a in annots)
    arguments_mpileup.extend([
        Arg('-P', P),
        Arg('-F', F),
        Arg('-C', C),
        Arg('-d', d),
        Arg('-m', m_pileup),
        Arg('--threads', threads)
    ])

    if r is not None:
        region_str = re.sub(':|-', '_', r)
        outprefix += ".{0}".format(region_str)
        arguments_mpileup.append(Arg('-r', r))

    params_mpileup = []
    if E is True:
        params_mpileup.append('-E')
    if p is True:
        params_mpileup.append('-p')
    params_mpileup.append(self.bam)

    params_call = []
    if m_call is True:
        params_call.append('-m')
    if v is True:
        params_call.append('-v')

    arguments_call = [Arg('-O', O), Arg('--ploidy', ploidy)]
    if S is not None:
        arguments_call.append(Arg('-S', S))

    outprefix += ".vcf.gz"
    arguments_call.append(Arg('-o', outprefix))

    # NOTE(review): 'bcftools call' is run without self.bcftools_folder,
    # unlike mpileup below - confirm that relying on PATH is intended
    bcftools_callRunner = RunProgram(program='bcftools call',
                                     args=arguments_call,
                                     parameters=params_call)

    runner = RunProgram(
        program='{0}/bcftools mpileup'.format(self.bcftools_folder),
        args=arguments_mpileup,
        parameters=params_mpileup,
        downpipe=[bcftools_callRunner])

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    # NOTE(review): run with raise_exc=False and the returned error flag is
    # ignored, so the path is returned even if the pipeline failed - confirm
    runner.run_popen(raise_exc=False)

    return outprefix
def run_ug(self, outprefix, glm='SNP', compress=True, nt=1, verbose=None, intervals=None,
           log_file=None, **kwargs):
    '''
    Run GATK UnifiedGenotyper

    Parameters
    ----------
    outprefix : str, Required
        Prefix for output VCF file. i.e. /path/to/file/test
    glm : str, Required
        Genotype likelihoods calculation model to employ -- SNP is the default option,
        while INDEL is also available for calling indels and BOTH is available for
        calling both together. Default= SNP
    compress : boolean, Default= True
        Compress the output VCF (GATK output is piped through bgzip)
    nt : int, Optional
        Number of data threads to allocate to UG
    verbose : bool, optional
        If True, then print the command line used for running this program
    intervals : list, Optional
        List in which each of the elements is a path to file with genomic
        intervals to operate with. Also coordinates can be set directly on the
        command line. For example: ['chr1:100-200', 'chr2:200-300'].
        If the list contains more than one interval, then it is useful to set
        the --interval_set_rule option
    log_file : str, Optional
        Path to file that will be used for logging the GATK stderr and stdout
    **kwargs
        Every keyword whose value is not None is passed to GATK as
        '--<keyword> <value>'. For example:
        alleles : str
            Path to VCF. When --genotyping_mode is set to
            GENOTYPE_GIVEN_ALLELES mode, the caller will genotype the samples
            using only the alleles provided in this callset
        genotyping_mode : str
            Specifies how to determine the alternate alleles to use for genotyping
            Possible values are: DISCOVERY, GENOTYPE_GIVEN_ALLELES
        output_mode : str
            Which type of calls we should output.
            Possible values are: EMIT_VARIANTS_ONLY, EMIT_ALL_CONFIDENT_SITES,
            EMIT_ALL_SITES. Default: EMIT_VARIANTS_ONLY

    Returns
    -------
    A VCF file

    Raises
    ------
    Exception
        With the GATK stderr as message, if the first run fails for any
        reason other than an empty interval intersection
    '''
    Arg = namedtuple('Argument', 'option value')

    arguments = [
        Arg('-T', 'UnifiedGenotyper'),
        Arg('-R', self.reference),
        Arg('-I', self.bam),
        Arg('-glm', glm),
        Arg('-nt', nt)
    ]

    if intervals is not None:
        arguments.extend(Arg('--intervals', i) for i in intervals)

    arguments.extend(
        Arg("--{0}".format(k), v) for k, v in kwargs.items() if v is not None)

    pipelist = None
    if compress is True:
        outprefix += ".vcf.gz"
        compressRunner = RunProgram(path=self.bgzip_folder,
                                    program='bgzip',
                                    parameters=['-c', '>', outprefix])
        pipelist = [compressRunner]
    else:
        outprefix += ".vcf"
        arguments.append(Arg('-o', outprefix))

    gatk_cmd = 'java -jar {0}/GenomeAnalysisTK.jar'.format(self.gatk_folder)

    runner = RunProgram(program=gatk_cmd,
                        args=arguments,
                        downpipe=pipelist,
                        log_file=log_file)

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    _, stderr, is_error = runner.run_popen(raise_exc=False)

    if is_error is True:
        # GATK crashes when the intersection between the genomic chunk and the
        # alleles passed in the VCF is calculated and there are no sites.
        # If that's the case then GATK is run again without the interval
        # intersection
        patt = re.compile(
            '##### ERROR MESSAGE: Bad input: '
            'The INTERSECTION of your -L options produced no intervals.')

        interval_error_seen = any(
            patt.match(line) for line in stderr.split('\n'))

        if not interval_error_seen:
            raise Exception(stderr)

        # None when no --alleles was passed; nothing is then matched against it
        alleles = next(
            (arg.value for arg in arguments if arg.option == '--alleles'),
            None)

        # Build a filtered copy: deleting from the list while enumerating it
        # skipped the element that followed each deletion
        arguments = [
            arg for arg in arguments
            if not (arg.option == '--interval_set_rule' or
                    (arg.option == '--intervals' and alleles is not None
                     and arg.value == alleles))
        ]

        runner = RunProgram(program=gatk_cmd,
                            downpipe=pipelist,
                            args=arguments,
                            log_file=log_file)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        runner.run_popen(raise_exc=True)

    return outprefix
def convert2vcf(self, input_prefix, output_prefix, compress=False, verbose=False, logfile=None):
    '''
    Function to use SHAPEIT's -convert in order to convert the *.haps.gz
    & *.haps.sample files into VCF

    Parameters
    ----------
    input_prefix : str
        Prefix for the files in HAPS/SAMPLE format. The inputs are
        '<input_prefix>.gz' and '<input_prefix>.sample'
    output_prefix : str
        String with the output prefix for the VCF file. '.vcf' is appended to it
    compress : bool, optional
        If True, bgzip the VCF, delete the uncompressed file and return the
        '.vcf.gz' path. Default=False
    verbose : bool, optional
        If True, then print the command line used for running this program
    logfile : filename, optional
        Path to log file (passed as --output-log)

    Returns
    -------
    filename
        A VCF file

    Raises
    ------
    Exception
        If self.shapeit_folder is None
    '''
    if self.shapeit_folder is None:
        raise Exception("shapeit_folder must be defined")

    outfile = "{0}.vcf".format(output_prefix)

    Arg = namedtuple('Argument', 'option value')

    args = [
        Arg('--input-haps', '{0}.gz {0}.sample'.format(input_prefix)),
        Arg('--output-vcf', outfile)
    ]

    if logfile is not None:
        args.append(Arg('--output-log', logfile))

    # The conversion is the same whether or not the result is compressed.
    # The uncompressed branch used to pass an undefined name ('arguments')
    # and failed with a NameError
    runner = RunProgram(path=self.shapeit_folder,
                        program='shapeit -convert',
                        args=args)

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    runner.run_checkoutput()

    if compress is True:
        compressed = outfile + ".gz"
        compressRunner = RunProgram(
            path=self.bgzip_folder,
            program='bgzip',
            parameters=['-c', outfile, '>', compressed])
        compressRunner.run_checkoutput()
        # only the compressed copy is kept
        os.remove(outfile)
        outfile = compressed

    return outfile
def calc_concordance(self, truth_vcf, truth_sample, call_sample, outprefix, outdir=None,
                     intervals=None, verbose=None):
    '''
    Calculate the genotype concordance between self.vcf and a truth VCF by
    running Picard's GenotypeConcordance

    Parameters
    ----------
    truth_vcf : str
        The VCF containing the truth sample.
    truth_sample : str
        The name of the truth sample within the truth VCF.
    call_sample : str
        The name of the call sample within the call VCF.
    outprefix : str
        String used as the prefix in the output file.
    outdir : str, optional
        If provided, then put output files in this folder.
    intervals : str
        One or more interval list files that will be used to limit the
        genotype concordance.
    verbose : bool, optional
        If True, then print the command line used for running this program.

    Returns
    -------
    GTPconcordance object
        Built from '<outprefix>.genotype_concordance_summary_metrics'

    Raises
    ------
    Exception
        If self.picard_folder is None
    '''
    if self.picard_folder is None:
        raise Exception("Folder containing Picard jar file is required")

    out_base = "{0}/{1}".format(outdir, outprefix) if outdir else outprefix

    Arg = namedtuple('Argument', 'option value')

    picard_args = [
        Arg('TRUTH_VCF', truth_vcf),
        Arg('CALL_VCF', self.vcf),
        Arg('TRUTH_SAMPLE', truth_sample),
        Arg('CALL_SAMPLE', call_sample),
        Arg('O', out_base),
    ]
    if intervals:
        picard_args += [Arg('INTERVALS', intervals)]

    picard_cmd = 'java -jar {0}/picard.jar GenotypeConcordance'.format(
        self.picard_folder)
    # Picard takes OPTION=value pairs, hence the '=' separator
    picard = RunProgram(program=picard_cmd, args=picard_args, arg_sep='=')

    if verbose is True:
        print("Command line is: {0}".format(picard.cmd_line))

    picard.run_checkoutput()

    summary_file = out_base + ".genotype_concordance_summary_metrics"

    return GTPconcordance(summary_metrics_file=summary_file)
def stats(self, outpath, filter_str=None, region=None, region_file=None, verbose=None):
    '''
    Run bcftools stats on the VCF file and parse its output

    Parameters
    ----------
    outpath : str
        Output path. '.<region>' (if 'region' is given), '.filter_str'
        (if 'filter_str' is given) and '.stats' are appended to it
    filter_str : str, optional. Example: PASS,.
        Apply filters when calculating the stats (passed as -f).
    region : str, optional. Example: chr20
        Region used for calculating the stats (passed as -r).
    region_file : filename, optional
        BED file with the regions that will be analyzed (passed as -R).
    verbose : bool, optional
        If True, print the command line before running it

    Returns
    -------
    BcftoolsStats object
        With 'summary_numbers' holding the 'SN' records as a dict
        {label: int}, and 'ts_tv', 'ts_tv_1stalt' and 'no_singleton_snps'
        set (as strings) when the matching 'TSTV'/'SiS' records are present
    '''
    Arg = namedtuple('Argument', 'option value')

    args = []

    if region is not None:
        outpath = "{0}.{1}".format(outpath, region)
        args.append(Arg('-r', region))

    if region_file is not None:
        args.append(Arg('-R', region_file))

    if filter_str is not None:
        # NOTE(review): the literal suffix '.filter_str' is appended, not the
        # value of the filter - confirm this is intended
        outpath = outpath + ".filter_str"
        args.append(Arg('-f', filter_str))

    outpath = outpath + ".stats"

    # the report is written by redirecting the command's stdout
    runner = RunProgram(path=self.bcftools_folder,
                        program='bcftools stats',
                        args=args,
                        parameters=[self.vcf, '>', outpath])

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    runner.run_checkoutput()

    stats_obj = BcftoolsStats(filename=outpath)
    summary_numbers = {}

    with open(outpath) as fi:
        for line in fi:
            line = line.rstrip('\n')
            # records are tab-separated and identified by their first field
            fields = line.split('\t')
            if line.startswith('SN\t'):
                summary_numbers[fields[2]] = int(fields[3])
            elif line.startswith('TSTV\t'):
                stats_obj.ts_tv = fields[4]
                stats_obj.ts_tv_1stalt = fields[7]
            elif line.startswith('SiS\t'):
                stats_obj.no_singleton_snps = fields[3]

    stats_obj.summary_numbers = summary_numbers

    return stats_obj
def run_applyrecalibration(self, mode, recal_file, tranches_file, outprefix,
                           ts_filter_level=99.0, num_threads=1,
                           compress=True, verbose=None, log_file=None):
    '''
    Run GATK's ApplyRecalibration on a VcfFilter object

    Parameters
    ----------
    mode : {'SNP','INDEL'}
        Recalibration mode to employ
    recal_file : filename
        The input recal file used by ApplyRecalibration
    tranches_file : filename
        The input tranches file describing where to cut the data
    outprefix : str
        Prefix used for the output
    ts_filter_level : float, optional
        The truth sensitivity level at which to start filtering.
        Default=99.0
    num_threads : int, optional
        Number of data threads to allocate to this analysis. Default=1
    compress : bool
        Compress the recalibrated VCF. When True the GATK output is piped
        through bgzip and the result is indexed with tabix. Default=True
    verbose : bool, optional
        Increase verbosity
    log_file : filename, optional
        Path to file that will used for logging the GATK stderr and stdout

    Returns
    -------
    filename
        Path to filtered VCF file

    Raises
    ------
    Exception
        If self.caller is not 'UG' or if mode is not 'SNP' or 'INDEL'
    '''
    if self.caller != 'UG':
        raise Exception("VCF type %s is incompatible" % self.caller)

    if mode not in ('SNP', 'INDEL'):
        # the message used to be %-formatted without a placeholder, which
        # raised TypeError instead of reporting the offending mode
        raise Exception("ApplyRecalibration mode '{0}' is not valid. "
                        "Valid values are 'SNP','INDEL'".format(mode))

    # generate output file name
    if mode == 'SNP':
        outfile = "%s.recalibrated_snps_raw_indels.vcf" % outprefix
    else:
        outfile = "%s.recalibrated_variants.vcf" % outprefix

    Arg = namedtuple('Argument', 'option value')

    args = [
        Arg('-jar', '{0}/GenomeAnalysisTK.jar'.format(self.gatk_folder)),
        Arg('-T', 'ApplyRecalibration'),
        Arg('-R', self.reference),
        Arg('-input', self.vcf),
        Arg('-mode', mode),
        Arg('--ts_filter_level', ts_filter_level),
        Arg('-recalFile', recal_file),
        Arg('--num_threads', num_threads),
        Arg('-tranchesFile', tranches_file)
    ]

    pipelist = None
    if compress is True:
        # no '-o' is passed in this case: the output is piped into bgzip
        outfile += ".gz"
        compressRunner = RunProgram(path=self.bgzip_folder,
                                    program='bgzip',
                                    parameters=['-c', '>', outfile])
        pipelist = [compressRunner]
    else:
        args.append(Arg('-o', outfile))

    if self.tmp_dir is not None:
        program_str = "java -Djava.io.tmpdir={0}".format(self.tmp_dir)
    else:
        program_str = "java"

    runner = RunProgram(program=program_str,
                        args=args,
                        downpipe=pipelist,
                        log_file=log_file)

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    runner.run_popen()

    # create an index for the recalibrated file
    if compress is True:
        tabixRunner = RunProgram(path=self.tabix_folder,
                                 program='tabix',
                                 parameters=[outfile])
        tabixRunner.run_checkoutput()

    return outfile
def get_chros(self, filter_str=None, chr_f=None, verbose=None):
    '''
    Method to get a list of chromosomes present in a file

    Parameters
    ----------
    filter_str : str, optional
        If defined, apply this filter string so bcftools view
        apply it before fetching the chros.
    chr_f : str, optional
        Path to file with a list of chromosomes (one per line).
        If provided, the chros in the file will be compared with the
        chromosomes in self.vcf.
    verbose : bool, optional
        Increase verbosity.

    Returns
    -------
    dict
        Dict with 4 keys:
        'in_vcf' whose values are the chros reported for self.vcf, in the
                 order they were printed
        'both'   whose values are the chros present in self.vcf and in
                 'chr_f'
        'in_A'   whose values are the chros PRESENT in self.vcf and NOT in
                 'chr_f'
        'in_B'   whose values are the chros NOT present in self.vcf and
                 PRESENT in 'chr_f'.
        When 'chr_f' is not provided the comparison is made against an
        empty list, so 'both' and 'in_B' are empty.
    '''
    # first column of every record, with consecutive duplicates collapsed
    params = ['--no-header', self.vcf, "|cut -f1 |uniq"]

    Arg = namedtuple('Argument', 'option value')

    args = []
    if filter_str is not None:
        args.append(Arg('-f', filter_str))

    runner = RunProgram(path=self.bcftools_folder,
                        program='bcftools view',
                        args=args,
                        parameters=params)

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    out = runner.run_checkoutput()
    out_str = out.decode("utf-8").rstrip('\n')
    list_of_chros = out_str.split("\n")

    chr_list_f = []
    if chr_f is not None:
        # parse file with chros; 'with' guarantees the handle is closed
        with open(chr_f, 'r') as chr_file:
            chr_list_f = chr_file.read().splitlines()

    vcf_chros = set(list_of_chros)
    file_chros = set(chr_list_f)

    return {
        'in_vcf': list_of_chros,
        'both': list(vcf_chros & file_chros),
        'in_A': list(vcf_chros - file_chros),
        'in_B': list(file_chros - vcf_chros)
    }
def subset_vcf(self, outprefix, bed=None, region=None, outdir=None,
               create_index=False, verbose=None, action='exclude',
               apply_filters=None, threads=1):
    '''
    Subset the vcf file using a BED file/region having
    the coordinates of the variants to exclude/include

    Parameters
    ----------
    outprefix : str
        Prefix for outputfiles. When 'region' is set it must contain a
        'vcf' component (i.e. 'out.vcf.gz'): the component right before
        it gets '_<region>' appended (plus '.filt' when apply_filters is
        set).
    bed : str, optional
        BED file with coordinates to exclude/include. Takes precedence
        over 'region' for selecting the variants.
    region : str, optional
        String with region to consider: chr1, chr1:1000-1500, etc...
    outdir : str, optional
        If provided, then put output files in this folder
    create_index : bool, optional
        Accepted for interface compatibility; it is not used by this
        method, so no index is generated here. Default=False
    verbose : bool, optional
        If true, then print the command line used for running this program
    action : str, optional
        Exclude or include variants from the bed file passed through the
        bed option. Default= exclude
    apply_filters : str, optional
        Apply a filter string: i.e. "PASS,."
    threads : int, optional
        Number of output compression threads to use in addition to main
        thread. Default=1

    Returns
    -------
    filename
        Path to gzipped VCF file that will have
        the desired variants excluded/included

    Raises
    ------
    Exception
        If action is neither 'include' nor 'exclude'
    ValueError
        If region is set and outprefix has no 'vcf' component
    '''
    if action != 'include' and action != 'exclude':
        raise Exception(
            "action argument should be either include or exclude")

    if region:
        bits = outprefix.split(".")
        if "vcf" not in bits:
            # same exception type list.index raised, with a usable message
            raise ValueError(
                "outprefix '{0}' must contain a 'vcf' component when a "
                "region is used".format(outprefix))
        vcf_ix = bits.index("vcf")
        # NOTE: if 'vcf' is the first component, index -1 wraps around and
        # the last component is the one that gets tagged
        new = bits[vcf_ix - 1] + "_" + region
        if apply_filters is not None:
            new += ".filt"
        bits[vcf_ix - 1] = new
        outprefix = ".".join(bits)

    if outdir:
        outprefix = "%s/%s" % (outdir, outprefix)

    Arg = namedtuple('Argument', 'option value')

    args = []
    if bed:
        if action == 'exclude':
            args.append(Arg('-T', '^{0}'.format(bed)))
        elif action == 'include':
            args.append(Arg('-T', '{0}'.format(bed)))
    elif region:
        if action == 'exclude':
            args.append(Arg('-t', '^{0}'.format(region)))
        elif action == 'include':
            args.append(Arg('-r', '{0}'.format(region)))

    args.extend(
        [Arg('-o', outprefix),
         Arg('-O', 'z'),
         Arg('--threads', threads)])

    if apply_filters is not None:
        # the filter string is wrapped in double quotes on the command line
        args.append(Arg('-f', '"{0}"'.format(apply_filters)))

    runner = RunProgram(path=self.bcftools_folder,
                        program='bcftools view',
                        args=args,
                        parameters=[self.vcf])

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    runner.run_checkoutput()

    return outprefix
def run_hc(self, outprefix, compress=True, verbose=None, log_file=None,
           intervals=None, **kwargs):
    '''
    Run GATK HaplotypeCaller

    Parameters
    ----------
    outprefix : str, Required
        Prefix for output VCF file. i.e. /path/to/file/test
    compress : boolean, Default= True
        Compress the output VCF
    verbose : bool, optional
        if true, then print the command line used for running this program
    log_file : str, Optional
        Path to file that will used for logging the GATK stderr and stdout
    intervals : list, Optional
        List in which each of the elements is a path to file with genomic
        intervals to operate with. Also coordinates can be set directly on
        the command line. For example: ['chr1:100-200', 'chr2:200-300'].
        If the list contains more than one interval, then it is useful to
        set the --interval_set_rule option
    **kwargs
        Every keyword whose value is not None is passed to GATK as
        '--<keyword> <value>'. For example:

        num_cpu_threads_per_data_thread : int, Optional
            controls the number of CPU threads allocated to each data
            thread
        standard_min_confidence_threshold_for_calling : int, Optional
            The minimum phred-scaled confidence threshold at which
            variants should be called. Default: 10
        genotyping_mode : str, Optional
            Specifies how to determine the alternate alleles to use for
            genotyping.
            Possible values are: DISCOVERY, GENOTYPE_GIVEN_ALLELES
        alleles : str, Optional
            Path to VCF. When --genotyping_mode is set to
            GENOTYPE_GIVEN_ALLELES mode, the caller will genotype the
            samples using only the alleles provide in this callset
        emitRefConfidence : str, Optional
            Mode for emitting reference confidence scores.
            Possible values are: NONE, BP_RESOLUTION, GVCF

    Returns
    -------
    A VCF file

    Raises
    ------
    Exception
        With the GATK stderr as message, when GATK fails for a reason
        other than the empty interval intersection, or when that error is
        seen but no '--alleles' argument was passed (so there is nothing
        to retry with)
    '''
    Arg = namedtuple('Argument', 'option value')

    arguments = [
        Arg('-T', 'HaplotypeCaller'),
        Arg('-R', self.reference),
        Arg('-I', self.bam)
    ]

    if intervals is not None:
        arguments.extend(Arg('--intervals', i) for i in intervals)

    arguments.extend(
        Arg("--{0}".format(k), v) for k, v in kwargs.items()
        if v is not None)

    pipelist = None
    if compress is True:
        outprefix += ".vcf.gz"
        compressRunner = RunProgram(path=self.bgzip_folder,
                                    program='bgzip',
                                    parameters=['-c', '>', outprefix])
        pipelist = [compressRunner]
    else:
        outprefix += ".vcf"
        arguments.append(Arg('-o', outprefix))

    program = 'java -jar {0}/GenomeAnalysisTK.jar'.format(self.gatk_folder)

    runner = RunProgram(program=program,
                        downpipe=pipelist,
                        args=arguments,
                        log_file=log_file)

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    stdout, stderr, is_error = runner.run_popen(raise_exc=False)

    if is_error is True:
        # This piece of code is necessary as GATK crashes when the
        # intersection between the genomic chunk and the alleles passed in
        # the VCF are calculated and there are no sites. If that's the case
        # then GATK will be run without the interval intersection
        patt = re.compile(
            '##### ERROR MESSAGE: Bad input: The INTERSECTION of your'
            ' -L options produced no intervals.')

        interval_error_seen = any(
            patt.match(line) for line in stderr.split('\n'))
        if not interval_error_seen:
            raise Exception(stderr)

        allele_files = [
            arg.value for arg in arguments if arg.option == '--alleles'
        ]
        if not allele_files:
            # nothing to drop from the intervals: report the GATK error
            raise Exception(stderr)
        alleles = allele_files[0]

        # Build a new list instead of deleting while iterating: in-place
        # deletion shifts the remaining items and skips the element that
        # follows each removed one
        arguments = [
            arg for arg in arguments
            if not (arg.option == '--interval_set_rule' or
                    (arg.option == '--intervals' and arg.value == alleles))
        ]

        runner = RunProgram(program=program,
                            downpipe=pipelist,
                            args=arguments,
                            log_file=log_file)

        if verbose is True:
            print("Command line is: {0}".format(runner.cmd_line))

        stdout, stderr, is_error = runner.run_popen(raise_exc=True)

    return outprefix
def run_beagle(self, outprefix, outdir=None, region=None, verbose=False,
               correct=False, **kwargs):
    '''
    Method that wraps Beagle (see
    https://faculty.washington.edu/browning/beagle/beagle.html)
    and will be used to call genotypes on a VCF file containing GT
    likelihoods

    Parameters
    ----------
    outprefix: str, required
        Prefix used for output file
    outdir : str, optional
        outdir for output files
    region : str, optional
        chr or chr interval that will be analyzed. i.e. chr20 or
        chr20:10000000-11000000. It is also added to the output file name
        with ':' and '-' replaced by '.'
    verbose : bool, optional
        if true, then print the command line used for running Beagle
    correct : bool, optional
        Note: that it seems there is an incompatibility between zlib
        libraries used in Beagle4 and in BOOST on some platforms.
        This involves either the last line of the file being skipped or a
        segfault. If correct=True, then this function will fix this issue
        by recompressing the Beagle4 output files. Default=False
    **kwargs
        Every keyword is passed to Beagle as '<keyword>=<value>'.
        For example:

        window : int, optional
            number of markers to include in each sliding window.
            Default: 50000
        overlap : int, optional
            specifies the number of markers of overlap between sliding
            windows. Default: 3000
        niterations : int, optional
            specifies the number of phasing iterations.
            Default: niterations=5
        nthreads : int, optional
            number of threads. If not specified then the nthreads
            parameter will be set equal to the number of CPU cores on the
            host machine

    Returns
    -------
    Compressed VCF file with the genotype calls

    Raises
    ------
    Exception
        If self.beagle_folder or self.beagle_jar is None
    '''
    if self.beagle_folder is None or self.beagle_jar is None:
        raise Exception(
            "Provide the folder containing the Beagle jar file and the Beagle jar file name"
        )

    Arg = namedtuple('Argument', 'option value')

    args = []

    if outdir is not None:
        outfile = "{0}/{1}.".format(outdir, outprefix)
    else:
        outfile = "{0}.".format(outprefix)

    if region is not None:
        region_str = re.sub(":|-", ".", region)
        outfile += "{0}.".format(region_str)
        args.append(Arg('chrom', region))

    outfile += "beagle"

    args.extend([Arg('gl', self.vcf), Arg('out', outfile)])

    for k, v in kwargs.items():
        args.append(Arg(k, v))

    runner = RunProgram(program='java -jar {0}/{1}'.format(
        self.beagle_folder, self.beagle_jar),
                        args=args,
                        arg_sep="=")

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    runner.run_checkoutput()

    outfile = outfile + ".vcf.gz"

    if correct is True:
        # creating temp file in order to perform the correction
        temp = tempfile.NamedTemporaryFile(delete=False)
        # only the name is needed (the shell redirection writes the
        # content), so release our own handle right away
        temp.close()

        gzipRunner = RunProgram(program='gzip',
                                parameters=['-c', '>', temp.name])
        zcatRunner = RunProgram(program='zcat',
                                parameters=[outfile],
                                downpipe=[gzipRunner])

        if verbose is True:
            print(
                "Command line for vcf.gz correction partA is: {0}".format(
                    zcatRunner.cmd_line))

        # run zcat file | gzip -c > tmp.file
        zcatRunner.run_checkoutput()

        # mv tmp.file back to outfile
        mvRunner = RunProgram(program='mv',
                              parameters=[temp.name, outfile])

        if verbose is True:
            print(
                "Command line for vcf.gz correction partB is: {0}".format(
                    mvRunner.cmd_line))

        mvRunner.run_checkoutput()

    return outfile
def run_vcfallelicprimitives(self, outprefix, compress=True, outdir=None,
                             keepinfo=True, keepgeno=True,
                             downstream_pipe=None, verbose=None):
    '''
    Decompose the complex variants of self.vcf with vcfallelicprimitives

    The program is used to decompose complex variants into a canonical
    SNP and indel representation, generating phased genotypes for
    available samples.

    Parameters
    ----------
    outprefix : str, required
        prefix for outputfiles
    compress : boolean, optional
        bgzip compress the normalized VCF
    outdir : str, optional
        If provided, then put output files in this folder
    keepinfo : bool, optional. Default=True
        Maintain site and allele-level annotations when decomposing.
        Note that in many cases, such as multisample VCFs, these won't
        be valid post-decomposition. For biallelic loci in single-sample
        VCFs, they should be usable with caution
    keepgeno : bool, optional. Default=True
        Maintain genotype-level annotations when decomposing.
        Similar caution should be used for this as for keep-info.
    downstream_pipe : str, optional
        If defined, then pipe the output VCF to other tools.
        i.e. "~/bin/vt/vt sort - | ~/bin/vt/vt uniq -"
    verbose : bool, optional
        if true, then increase verbosity

    Returns
    -------
    A string with path to decomposed file
    '''
    target = outprefix
    if outdir:
        target = "{0}/{1}".format(outdir, target)
    target = target + ".aprimitives.vcf"

    params = [self.vcf]
    for enabled, flag in ((keepinfo, '--keep-info'),
                          (keepgeno, '--keep-geno')):
        if enabled is True:
            params.append(flag)

    if downstream_pipe is not None:
        params.append("| {0}".format(downstream_pipe))

    pipelist = None
    if compress is True:
        # compressed output: bgzip sits at the end of the pipe and writes
        # the final file
        target += ".gz"
        bgzip_runner = RunProgram(path=self.bgzip_folder,
                                  program='bgzip',
                                  parameters=['-c', '>', target])
        pipelist = [bgzip_runner]
    elif compress is None or compress is False:
        # plain output: redirect straight into the final file
        params.extend(['>', target])

    runner = RunProgram(path=self.vcflib_folder,
                        program='vcfallelicprimitives',
                        parameters=params,
                        downpipe=pipelist)

    if verbose is True:
        print("Command line for running vcfallelicprimitives is: {0}".format(runner.cmd_line))

    runner.run_checkoutput()

    return target
def run_shapeit(self, output_prefix, input_gen=None, input_init=None,
                input_scaffold=None, input_bed=None, duohmm=False,
                input_map=None, verbose=False, **kwargs):
    '''
    Run Shapeit

    Parameters
    ----------
    output_prefix : str
        Prefix used for the 2 output files estimated by SHAPEIT,
        i.e. 'output.shapeit.20.haps.gz output.shapeit.20.haps.sample'.
        It is also used for the '<output_prefix>.log' log file
    input_gen : str, optional
        Specifies the genotype/GL input data that you obtain from Beagle4,
        i.e. 'input.shapeit.20.gen.gz input.shapeit.20.gen.sample'
    input_init : str, optional
        Specifies the haplotypes that you obtain from Beagle4,
        i.e. 'input.shapeit.20.hap.gz input.shapeit.20.hap.sample'
    input_scaffold : str, optional
        SNP-array derived haplotype scaffold used by SHAPEIT. It has to
        be in Impute2 format. i.e. 'scaffold.haps.gz scaffold.haps.sample'
    input_bed : str, optional
        Unphased genotypes in Plink Binary BED/BIM/FAM format.
        i.e. 'file.bed file.bim file.fam'. Only used when input_gen is
        not given
    duohmm : bool, optional
        If true, then activate the --duohmm option. Default: False
    input_map : filename, optional
        Path to the file with the genetic map
    verbose : bool, optional
        If true, then print the command line used for running this program
    **kwargs
        Every keyword is passed to SHAPEIT as '--<keyword> <value>'

    Returns
    -------
    dict
        A dict with the path to the 2 output files
        (*.haps.gz and *.haps.sample) that can be used with SHAPEIT

    Raises
    ------
    Exception
        If neither input_gen nor input_bed is given
    '''
    if input_gen is None and input_bed is None:
        raise Exception(
            "Error! Either --input-gen or --input-bed need to be specified as input for SHAPEIT"
        )

    Arg = namedtuple('Argument', 'option value')

    # main input: genotype likelihoods win over the Plink BED files
    if input_gen is not None:
        args = [Arg('-call --input-gen', input_gen)]
    else:
        args = [Arg('--input-bed', input_bed)]

    optional_inputs = (
        ('--input-init', input_init),
        ('--input-scaffold', input_scaffold),
        ('--input-map', input_map),
    )
    args += [Arg(opt, val) for opt, val in optional_inputs
             if val is not None]

    args += [Arg('--{0}'.format(key), val) for key, val in kwargs.items()]

    haps_gz = '{0}.haps.gz'.format(output_prefix)
    haps_sample = '{0}.haps.sample'.format(output_prefix)

    args.append(
        Arg('--output-max',
            '{0}.haps.gz {0}.haps.sample'.format(output_prefix)))
    args.append(Arg('--output-log', '{0}.log'.format(output_prefix)))

    params = ['--duohmm'] if duohmm is True else []

    runner = RunProgram(path=self.shapeit_folder,
                        program='shapeit',
                        args=args,
                        parameters=params)

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    runner.run_checkoutput()

    return {'hap_gz': haps_gz, 'hap_sample': haps_sample}
def combine(self, labels, reference, outprefix, compress=False, outdir=None,
            ginterval=None, genotypemergeoption=None,
            filteredrecordsmergetype=None, threads=1, options=None,
            verbose=False):
    '''
    Combine VCFs using GATK's CombineVariants into a single VCF

    Parameters
    ----------
    labels : list
        List of labels used for each of the VCFs in self.vcflist. The
        order of the labels should be the same that the VCFs in the list
        and there must be exactly one label per VCF
    reference : str
        Path to Fasta file with reference
    outprefix : str
        Prefix used for output file
    compress : bool, optional
        Compress the output VCF with bgzip. Default=False
    outdir : str, optional
        Path to folder used to write the results to
    ginterval : str, optional
        Genomic interval used to restrict the analysis.
        i.e. chr20:1000-2000
    genotypemergeoption : {'UNIQUIFY', 'PRIORITIZE', 'UNSORTED',
                           'REQUIRE_UNIQUE'}, optional
        Determines how we should merge genotype records for samples
        shared across the ROD files
    filteredrecordsmergetype : {'KEEP_IF_ANY_UNFILTERED',
                                'KEEP_IF_ALL_UNFILTERED',
                                'KEEP_UNCONDITIONAL'}, optional
        Determines how we should handle records seen at the same site in
        the VCF, but with different FILTER fields
    threads : int, optional
        Number of threads to use. Default=1
    options : list, optional
        List of options. i.e. ['-env','--filteredAreUncalled']
    verbose : bool, optional
        increase the verbosity, default=False

    Returns
    -------
    filename
        Path to the merged VCF

    Raises
    ------
    Exception
        If the number of labels differs from the number of VCFs in
        self.vcflist or if any of the VCFs does not exist
    '''
    Arg = namedtuple('Argument', 'option value')

    args = [
        Arg('-T', 'CombineVariants'),
        Arg('-R', reference),
        Arg('-nt', threads)
    ]

    vcfs = list(self.vcflist)
    labels = list(labels)
    if len(vcfs) != len(labels):
        # zip() would silently drop the VCFs (or labels) in excess and
        # produce an incomplete merge
        raise Exception(
            "Number of labels ({0}) does not match the number of VCFs "
            "({1})".format(len(labels), len(vcfs)))

    for path, label in zip(vcfs, labels):
        if not os.path.isfile(path):
            print("Error reading from {0}".format(path))
            raise Exception("File does not exist")
        args.append(Arg('-V:{0}'.format(label), path))

    outfile = ""
    if outdir:
        outfile = "{0}/".format(outdir)
    outfile += "{0}.vcf".format(outprefix)

    if ginterval is not None:
        args.append(Arg('-L', ginterval))

    if genotypemergeoption is not None:
        args.append(Arg('--genotypemergeoption', genotypemergeoption))

    if filteredrecordsmergetype is not None:
        args.append(
            Arg('--filteredrecordsmergetype', filteredrecordsmergetype))

    params = list(options) if options else []

    pipelist = None
    if compress is True:
        # no '-o' is passed in this case: the output is piped into bgzip
        outfile += ".gz"
        compressRunner = RunProgram(path=self.bgzip_folder,
                                    program='bgzip',
                                    parameters=['-c', '>', outfile])
        pipelist = [compressRunner]
    else:
        args.append(Arg('-o', outfile))

    runner = RunProgram(
        path=self.java_folder,
        program='java -jar {0}/GenomeAnalysisTK.jar'.format(
            self.gatk_folder),
        args=args,
        parameters=params,
        downpipe=pipelist)

    if verbose is True:
        print("Command line is: {0}".format(runner.cmd_line))

    runner.run_popen()

    return outfile