Beispiel #1
0
def convert_sam_to_bam(sam):
    """Convert a SAM file to BAM with samtools and delete the SAM file.

    Args:
        sam: Path of the SAM file to convert.

    Returns:
        Path of the BAM file samtools was asked to write: ``sam`` with a
        trailing ".sam" replaced by ".bam" (".bam" is simply appended when
        ``sam`` does not end in ".sam").
    """
    util.info('Converting %s to bam format so to save disk space...' % sam)
    suffix = '.sam'
    # str.strip('.sam') treats its argument as a set of characters and trims
    # them from BOTH ends, so e.g. "sample.sam" became "ple". Remove the
    # suffix explicitly instead.
    root = sam[:-len(suffix)] if sam.endswith(suffix) else sam
    bam_file = root + '.bam'
    util.call(['samtools', 'view', '-bh', sam, '-o', bam_file])
    os.remove(sam)
    return bam_file
def gatk_merge_vcfs(dir_name,
                    strain_vcf_paths,
                    genome_fasta_path,
                    num_cpu=util.MAX_CORES):
    """Merge per-strain VCF files with GATK GenotypeGVCFs.

    Args:
        dir_name: Passed to ``_get_merged_vcf_path`` to build the output path.
        strain_vcf_paths: Mapping of strain name to that strain's VCF path.
        genome_fasta_path: Reference FASTA handed to GATK via ``-R``.
        num_cpu: Currently unused; the ``-nt`` option was disabled because it
            seemed to fail with multiple CPU threads.

    Returns:
        Path of the merged VCF file. An existing file is never overwritten.
    """
    merged_path = _get_merged_vcf_path(dir_name, strain_vcf_paths.keys(),
                                       CALLER_GATK)

    # Guard clause: nothing to do when a previous run already produced it.
    if os.path.exists(merged_path):
        util.info("%s already exists and won't be overwritten..." %
                  merged_path)
        return merged_path

    gatk_cmd = list(util.JAVA)
    gatk_cmd.extend([
        '-jar', exe.EXE[CALLER_GATK],
        '-T', 'GenotypeGVCFs',
        '-R', genome_fasta_path,
        '-o', merged_path,
    ])

    # One -V option per input VCF, in the mapping's iteration order.
    for vcf_path in strain_vcf_paths.values():
        gatk_cmd.extend(['-V', vcf_path])

    util.call(gatk_cmd)
    return merged_path
def call_genotype_gatk(strain_bam_paths, genome_fasta_path, num_cpu, out_dir,
                       sub_dir_name):
    """Run the GATK pipeline: one haplotype job per strain, then a merge.

    Args:
        strain_bam_paths: Mapping of strain name to BAM file path.
        genome_fasta_path: Reference genome FASTA file.
        num_cpu: Passed to ``util.parallel_split_job`` and to the merge step.
        out_dir: Directory argument forwarded to ``gatk_merge_vcfs``.
        sub_dir_name: Sub-directory name forwarded to each haplotype job.

    Returns:
        Path of the merged multi-sample VCF returned by ``gatk_merge_vcfs``.
    """
    fai_path = genome_fasta_path + '.fai'
    dict_path = os.path.splitext(genome_fasta_path)[0] + '.dict'

    # Create the FASTA index and sequence dictionary only when missing.
    if not os.path.exists(fai_path):
        util.info('Making index for genome FASTA file %s' % genome_fasta_path)
        util.call([exe.EXE['samtools'], 'faidx', genome_fasta_path])

    if not os.path.exists(dict_path):
        util.info('Making dict filr for genome FASTA file %s' %
                  genome_fasta_path)
        util.call([exe.EXE['samtools'], 'dict', genome_fasta_path, '-o',
                   dict_path])

    # Sorted strain order fixes the correspondence between BAM inputs and
    # the VCF paths collected from the parallel jobs.
    strain_names = sorted(strain_bam_paths)
    ordered_bams = [strain_bam_paths[name] for name in strain_names]
    shared_args = [genome_fasta_path, sub_dir_name]

    gvcf_paths = util.parallel_split_job(gatk_haplotype_job,
                                         ordered_bams,
                                         shared_args,
                                         num_cpu,
                                         collect_output=True)

    per_strain_vcfs = {}
    for idx, gvcf_path in enumerate(gvcf_paths):
        per_strain_vcfs[strain_names[idx]] = gvcf_path

    return gatk_merge_vcfs(out_dir,
                           per_strain_vcfs,
                           genome_fasta_path,
                           num_cpu=num_cpu)
def gatk_haplotype_job(bam_file_path, genome_fasta_path, sub_dir_name):
    """Create a GVCF for one BAM file with GATK HaplotypeCaller.

    The output is written to ``<BAM dir>/<sub_dir_name>/<BAM root>_hap.g.vcf``.
    If that file already exists the GATK call is skipped.

    Args:
        bam_file_path: Input BAM file.
        genome_fasta_path: Reference FASTA handed to GATK via ``-R``.
        sub_dir_name: Name of the output sub-directory beside the BAM file.

    Returns:
        Path of the GVCF file.
    """
    bam_dir, sample_root = os.path.split(os.path.splitext(bam_file_path)[0])
    out_dir = os.path.join(bam_dir, sub_dir_name)
    gvcf_path = os.path.join(out_dir, '%s_hap.g.vcf' % (sample_root))

    if not os.path.exists(gvcf_path):
        util.makedirs(out_dir, exist_ok=True)

        util.info('Creating GVCF file for %s using GATK' % sample_root)

        gatk_cmd = list(util.JAVA)
        gatk_cmd += ['-jar', exe.EXE[CALLER_GATK]]
        gatk_cmd += ['-T', 'HaplotypeCaller']
        gatk_cmd += ['-R', genome_fasta_path]
        gatk_cmd += ['-I', bam_file_path]
        gatk_cmd += ['-o', gvcf_path]
        gatk_cmd += ['-ERC', 'GVCF']
        # The two variant_index options are deprecated for GATK 4.0.
        gatk_cmd += ['-variant_index_type', 'LINEAR']
        gatk_cmd += ['-variant_index_parameter', '128000']
        util.call(gatk_cmd)

    else:
        util.info(
            "VCF file %s already exists. Skipping haplotype calling for %s" %
            (gvcf_path, sample_root))

    return gvcf_path
Beispiel #5
0
 def sam_parser(sam_file,aligner,remove_sam = True,convert_to_bam = None):
     """Count guide reads in an alignment file using the helper shell script.

     Runs ``sam_parser_to_guide_counts.sh`` (located in the same directory as
     this file) with the alignment file, the counts file path and the aligner
     name; the script's stderr is directed to a log file.

     Args:
         sam_file: Path of the alignment file. It is treated as BAM when the
             path contains ".bam", otherwise as SAM.
         aligner: Aligner name, forwarded to the counting script.
         remove_sam: When True and the input is a SAM file, delete it after
             counting.
         convert_to_bam: Whether the header and unaligned reads must first be
             removed with ``samtools view -F 4``. The function body always
             read this name but it was never defined; it is now an optional
             trailing parameter. ``None`` (the default) enables the step
             exactly when the input is a BAM file.
             NOTE(review): the default is an inference - confirm with callers.

     Returns:
         Path of the guide counts file (``<root>_lib_guidecounts.txt``).
     """
     ext = '.bam' if '.bam' in sam_file else '.sam'
     # str.strip(ext) removes a *set of characters* from both ends, which can
     # eat parts of the file name; drop the extension explicitly instead.
     root = sam_file[:-len(ext)] if sam_file.endswith(ext) else sam_file
     counts_file = root + '_lib_guidecounts.txt'
     counts_log = root + '_lib_guidecounts.log'
     sam_parser_to_guide_counts = os.path.dirname(os.path.realpath(__file__)) + '/sam_parser_to_guide_counts.sh'
     if convert_to_bam is None:
         convert_to_bam = (ext == '.bam')
     if convert_to_bam:
         util.info('Removing sam header from %s in order to proceed to read counting...' % sam_file)
         temp = root + '_temp.sam'
         # Remove unaligned reads if there are any.
         # This is particularly important when using bowtie because the
         # --no-unal flag doesn't really work.
         cmdArgs = ['samtools','view','-F','4',sam_file,'-o',temp]
         util.call(cmdArgs)
         util.info('Counting reads from %s...' % sam_file)
         cmdArgs = [sam_parser_to_guide_counts,temp,counts_file,aligner]
         util.call(cmdArgs,stderr=counts_log)
         os.remove(temp)
     else:
         util.info('Counting reads from %s...' % sam_file)
         cmdArgs = [sam_parser_to_guide_counts,sam_file,counts_file,aligner]
         util.call(cmdArgs,stderr=counts_log)
     # Compare strings by value; "is" only tests object identity.
     if remove_sam is True and ext == '.sam':
         os.remove(sam_file)
     return counts_file
Beispiel #6
0
def plot_coverage(coverage_files,output=None):
    """Collect per-strain coverage values into one table and plot it with R.

    For every coverage file, the strain name is the part of the file name
    before the first "_"; the genome coverage is the 5th space-separated
    field of the first line and the exon coverage is the 8th space-separated
    field of the second line. The table is written two directory levels above
    the first coverage file's name and then handed to the R script registered
    as ``exe.EXE['phc']``.

    Args:
        coverage_files: Non-empty list of coverage file paths using "/" as
            the separator.
        output: Prefix for the output file names. A random 8-character string
            is used when None.
    """
    outdir = "/".join(coverage_files[0].split("/")[0:-2])

    if output is None:
        output = util.get_rand_string(8)
        # The "%" formatting must be applied to the message itself; it used to
        # be applied to the return value of util.info().
        util.info("Header for output files has not been specified. Random string %s will be used instead..." % output)

    all_strains_cov = "%s/%s_all_strains_cov.txt" % (outdir,output)

    with open(all_strains_cov,"w") as table:
        # Two header names for three data columns is kept as in the original
        # - presumably so R reads the strain column as row names (TODO
        # confirm against the plotting script).
        table.write("Genome_cov\tExon_cov\n")

        for cov_path in coverage_files:
            strain = cov_path.split("/")[-1].split("_")[0]
            with open(cov_path,"r") as cov:
                genome_cov = cov.readline().split(" ")[4].rstrip()
                exon_cov = cov.readline().split(" ")[7].rstrip()
            table.write("\t".join([strain,genome_cov,exon_cov]) + "\n")

    util.info("File saved as %s..." % all_strains_cov)

    cmdArgs = ['Rscript','--vanilla',  exe.EXE['phc'],
               all_strains_cov,output]

    util.call(cmdArgs)
def freebayes_genotype_job(region, genome_fasta_path, bam_paths):
    """Run freebayes on one genome region for all given BAM files.

    Args:
        region: Region string passed to freebayes via ``-r``.
        genome_fasta_path: Reference FASTA passed via ``-f``.
        bam_paths: BAM file paths appended to the command line.

    Returns:
        Path of the region VCF (``temp_<region>_freebayes.vcf`` relative to
        the current working directory). An existing file is reused.
    """
    region_vcf = 'temp_%s_freebayes.vcf' % region

    if os.path.exists(region_vcf):
        return region_vcf

    # Possible future options: '--no-mnps', '--no-complex', '--ploidy 2'.
    freebayes_cmd = [exe.EXE['freebayes'], '-f', genome_fasta_path]
    freebayes_cmd.extend(['-r', region, '-v', region_vcf])
    freebayes_cmd.extend(bam_paths)

    util.call(freebayes_cmd)

    return region_vcf
def gatk_select_vars(strain_name,
                     merged_vcf_path,
                     genome_fasta_path,
                     file_tag='extracted',
                     homozygous=True):
    """Extract one strain's variants from a merged VCF via GATK SelectVariants.

    The result is written next to the merged VCF as
    ``<strain_name>_<file_tag>.vcf``; the sample selected is
    ``sample_<strain_name>``.

    Args:
        strain_name: Strain whose sample is selected.
        merged_vcf_path: Multi-sample VCF to select from.
        genome_fasta_path: Reference FASTA handed to GATK via ``-R``.
        file_tag: Suffix used in the output file name.
        homozygous: When true, keep only homozygous-variant genotypes;
            otherwise use the broader expression that also admits
            heterozygous genotypes.
    """
    out_dir = os.path.dirname(merged_vcf_path)
    strain_vcf = os.path.join(out_dir, '%s_%s.vcf' % (strain_name, file_tag))

    util.info('Creating VCF file for %s' % strain_name)

    if homozygous:
        select_expr = "vc.getGenotype('sample_%s').isHomVar()" % strain_name
    else:
        select_expr = (
            "vc.getGenotype('sample_%s').isHomVar() || vc.getGenotype('sample_%s').isHet() && ! vc.getGenotype('sample_%s').isHomRef()"
            % ((strain_name, ) * 3))

    gatk_cmd = list(util.JAVA)
    gatk_cmd.extend(['-jar', exe.EXE[CALLER_GATK], '-T', 'SelectVariants'])
    gatk_cmd.extend(['-R', genome_fasta_path, '-V', merged_vcf_path])
    gatk_cmd.extend(['-o', strain_vcf, '-sn', 'sample_%s' % strain_name])
    gatk_cmd.extend(['-select', select_expr])

    util.call(gatk_cmd)
    util.info('All done for strain %s. VCF file can be found in %s' %
              (strain_name, strain_vcf))
def call_genotype_freebayes(strain_bam_paths, genome_fasta_path, num_cpu,
                            out_dir, sub_dir_name):
    """Run the FreeBayes pipeline on all strains at once, split by region.

    Every chromosome reported for the first BAM file is cut into roughly
    ``num_cpu`` regions, freebayes is run per region in parallel, the region
    VCFs are concatenated (header lines taken from the first region only) and
    the result is passed through ``vcfuniq`` into the merged VCF.

    Args:
        strain_bam_paths: Non-empty mapping of strain name to BAM file path.
        genome_fasta_path: Reference genome FASTA file.
        num_cpu: Number of regions per chromosome and parallel job count.
        out_dir: Passed to ``_get_merged_vcf_path`` to build the output path.
        sub_dir_name: Unused here.

    Returns:
        Path of the merged VCF. An existing file is never overwritten.
    """
    strain_names, bam_file_paths = zip(*list(strain_bam_paths.items()))

    merge_file_path = _get_merged_vcf_path(out_dir, bam_file_paths,
                                           CALLER_FREEBAYES)

    if os.path.exists(merge_file_path):
        util.info("%s exists and won't be overwritten. Skipping..." %
                  merge_file_path)
        return merge_file_path

    # Only one temporary file is needed (the second one requested by the
    # earlier version was never used).
    temp_file_path = util.get_temp_path(merge_file_path)

    # Make regions for parallelisation, splitting all chromos according to
    # the number of CPUs.
    chromo_sizes = util.get_bam_chromo_sizes(bam_file_paths[0])

    regions = []
    region_fmt = '%s:%d-%d'

    for chromo, size in chromo_sizes:
        step = int(size / num_cpu) + 1  # will be rounded up

        start = 0
        end = step

        while end < size:
            regions.append(region_fmt % (chromo, start, end))
            start = end
            end += step

        # Final (possibly shorter) region up to the chromosome end.
        regions.append(region_fmt % (chromo, start, size))

    # Call haplotype for all strains at once, split into parallel regions.
    common_args = [genome_fasta_path, bam_file_paths]
    region_vcf_paths = util.parallel_split_job(freebayes_genotype_job,
                                               regions,
                                               common_args,
                                               num_cpu,
                                               collect_output=True)

    # Combine the regions which were run in parallel. The context managers
    # guarantee the handles are closed even if an error aborts the loop.
    util.info('Combining freebayes regions')

    with open(temp_file_path, 'w') as out_file_obj:
        write = out_file_obj.write

        for i, region_vcf in enumerate(region_vcf_paths):
            with open(region_vcf) as file_obj:
                for line in file_obj:
                    if '\n' not in line:
                        # A missing newline means the region file is truncated.
                        util.critical('No end of line in %s. Exiting...' %
                                      region_vcf)

                    elif line[0] != '#' or i == 0:
                        # Data lines always; header lines from region 0 only.
                        write(line)

    cmd_args = [exe.EXE['vcfuniq']]
    util.call(cmd_args, stdin=temp_file_path, stdout=merge_file_path)

    # Cleanup temp files
    os.unlink(temp_file_path)

    for file_path in region_vcf_paths:
        os.unlink(file_path)

    return merge_file_path
Beispiel #10
0
def genome_map(aligner,
               strain_name,
               strain_num,
               fastq_paths,
               genome_index_path,
               genome_fasta_path,
               num_cpu=util.MAX_CORES):
    """Align a strain's FASTQ reads to the genome, producing a SAM file.

    The SAM file is written beside the first FASTQ file as
    ``<strain_name>.sam``. If it already exists the aligner is not run.

    Args:
        aligner: ``ALIGNER_BWA``, ``ALIGNER_BT2``, or anything else for bbmap.
        strain_name: Used for the output file name and read-group ID/sample.
        strain_num: Integer used in the read-group library and unit fields.
        fastq_paths: One FASTQ path (single-end) or two (paired-end).
        genome_index_path: Index location for bowtie2 (``-x``) and bbmap
            (``path=``).
        genome_fasta_path: Reference FASTA used by BWA and bbmap (``ref=``).
        num_cpu: Number of aligner threads.

    Returns:
        Path of the SAM file.
    """
    dir_name = os.path.dirname(fastq_paths[0])
    sam_file_path = '%s.sam' % os.path.join(dir_name, strain_name)

    if os.path.exists(sam_file_path):
        util.info("SAM file %s already exists. Skipping genome mapping" %
                  sam_file_path)

    else:
        util.info("Running aligner %s on %s..." % (aligner, strain_name))

        if aligner == ALIGNER_BWA:
            # BWA expects literal "\t" sequences in the read-group header.
            rg_header = "@RG\\tID:%s\\tSM:sample_%s\\tPL:illumina\\tLB:lib%d\\tPU:unit%d" % (
                strain_name, strain_name, strain_num, strain_num)
            cmd_args = [
                exe.EXE[ALIGNER_BWA],
                'mem',
                '-t',
                str(num_cpu),
                '-M',
                '-R',
                rg_header,
                genome_fasta_path
            ] + list(fastq_paths)

            # BWA writes SAM to stdout; close the handle deterministically
            # instead of leaking an anonymous open() object.
            with open(sam_file_path, 'w') as sam_file_obj:
                util.call(cmd_args, stdout=sam_file_obj)

        elif aligner == ALIGNER_BT2:
            cmd_args = [
                exe.EXE[ALIGNER_BT2],
                '--sensitive',
                '-x',
                genome_index_path,
                '-p',
                str(num_cpu),
                '-q',  # FASTQ input
                '--rg-id',
                strain_name,
                '--rg',
                "SM:sample_%s\tPL:illumina\tLB:lib%d\tPU:unit%d" %
                (strain_name, strain_num, strain_num),
                '-S',
                sam_file_path
            ]

            if len(fastq_paths) > 1:
                cmd_args += ['-1', fastq_paths[0], '-2', fastq_paths[1]]
            else:
                cmd_args += ['-U', fastq_paths[0]]

            util.call(cmd_args)

        else:  # bbmap
            cmd_args = [
                exe.EXE[ALIGNER_BBMAP],
                'ref=%s' % genome_fasta_path,
                'path=%s' % genome_index_path, 'sam=1.3',
                'in=%s' % fastq_paths[0],
                'out=%s' % sam_file_path,
                't=%d' % num_cpu,
                'rgid=%s' % strain_name,
                'rgsm=sample_%s' % strain_name, 'rgpl=illumina',
                'rglb=lib%d' % strain_num,
                'rgpu=unit%d' % strain_num
            ]

            if len(fastq_paths) > 1:
                cmd_args += ['in2=%s' % fastq_paths[1]]

            util.call(cmd_args)

    util.info('Done %s genome alignment for strain %s' %
              (aligner, strain_name))

    return sam_file_path
Beispiel #11
0
def bedtools_coverage(bam_file_path, genome_fasta_path, exon_gff_file_path):
    """Compute genome and exon coverage for a BAM file with bedtools and R.

    All results go to a "coverage" directory beside the BAM file. If the final
    R output (``<bam name>_R_coverage.out``) already exists nothing is done.

    Args:
        bam_file_path: Input BAM file.
        genome_fasta_path: Unused in the current commands.
        exon_gff_file_path: Exon annotation passed to ``bedtools coverage -a``.

    Returns:
        None.
    """
    import shutil  # local import: only needed for temporary directory cleanup

    dir_name, base_name = os.path.split(bam_file_path)
    file_root = os.path.splitext(base_name)[0]
    dir_name = os.path.join(dir_name, 'coverage')

    util.makedirs(
        dir_name,
        exist_ok=True)  # Not the os version to be Python 2 and 3 compatible

    genome_cvr_file_path = os.path.join(dir_name, base_name + '.genomecov')
    exon_cvr_file_path = os.path.join(dir_name, base_name + '_exon.coverage')
    exon_cvr_temp_file_path = os.path.join(dir_name,
                                           base_name + '_exon.coverage.temp')
    R_cvr_file_path = os.path.join(dir_name, base_name + '_R_coverage.out')

    if os.path.exists(R_cvr_file_path):
        util.info(
            "Coverage file %s already exists. Skipping coverage calculations" %
            (R_cvr_file_path, ))
        return

    bedtools_exe = exe.EXE['bedtools']

    util.info("Running bedtools genomecov...")

    cmd_args = [bedtools_exe, 'genomecov', '-pc', '-ibam', bam_file_path]
    util.call(cmd_args, stdout=genome_cvr_file_path)

    util.info("Done... Results saved in: %s" % genome_cvr_file_path)
    util.info("Converting %s into a sorted bed file..." % bam_file_path)

    # Uniquely named scratch directory for the intermediate BED files.
    temp_dir = os.path.join(dir_name, 'TEMP_%s' % uuid.uuid4())
    os.makedirs(temp_dir)

    try:
        temp_bed_file1 = os.path.join(temp_dir, '%s.bed' % file_root)
        temp_bed_file2 = os.path.join(temp_dir, '%s_sortBed.bed' % file_root)

        cmd_args = [bedtools_exe, 'bamtobed', '-i', bam_file_path]
        util.call(cmd_args, stdout=temp_bed_file1)

        # GNU sort rather than "bedtools sort" to reduce RAM usage.
        cmd_args = ['sort', '-k1,1', '-k2,2n', '--batch-size=5',
                    temp_bed_file1]
        util.call(cmd_args, stdout=temp_bed_file2)

        util.info("Done... Results saved in temporary directory: %s" %
                  temp_dir)
        util.info("Running bedtools coverage...")

        # "-sorted" reduces RAM usage; requires the sorted BED made above.
        cmd_args = [
            bedtools_exe, 'coverage', '-sorted', '-hist', '-a',
            exon_gff_file_path, '-b', temp_bed_file2
        ]
        util.call(cmd_args, stdout=exon_cvr_file_path)

        util.info("Done... Results saved in: %s" % exon_cvr_file_path)

        # In order to calculate exon coverage in R, we need to extract all
        # lines matching "all" from the exon.coverage file. The extract is
        # written beside the other coverage outputs (not in temp_dir).
        cmd_args = ['grep', 'all', exon_cvr_file_path]
        util.call(cmd_args, stdout=exon_cvr_temp_file_path)

        util.info(
            "Running R to compute mean genome coverage and mean exon coverage..")

        cmd_args = [
            'Rscript', '--vanilla', exe.EXE['mgcr'], genome_cvr_file_path,
            exon_cvr_temp_file_path
        ]
        util.call(cmd_args, stdout=R_cvr_file_path)

    finally:
        # The message used to be logged without anything being removed, so
        # every run left a TEMP_<uuid> directory behind. Cleanup is
        # best-effort so that it cannot mask an error raised above.
        util.info("Delete temporary directory and files...")
        shutil.rmtree(temp_dir, ignore_errors=True)
Beispiel #12
0
def sam_cleanup(sam_file_path, num_cpu=2):
    """Turn an aligner's SAM output into a sorted, filtered, indexed BAM.

    Steps: ``samtools sort`` to BAM; ``samtools view -f 3 -F 4 -q 1`` to
    filter reads; Picard ``MarkDuplicatesWithMateCigar``; ``samtools index``.
    If the final BAM already exists all steps are skipped.

    Args:
        sam_file_path: SAM file produced by the genome aligner.
        num_cpu: Currently unused (the ``-@`` sort option is disabled).

    Returns:
        Path of the final BAM file.
    """
    file_tag = util.FILE_TAG

    path_root = os.path.splitext(sam_file_path)[0]
    strain_name = os.path.basename(path_root)

    bam_file_path = '%s%ssrt.bam' % (path_root, file_tag)
    clean_bam_path = '%s%ssrt_%s.bam' % (path_root, file_tag, CLEAN_TAG)
    out_bam_path = '%s%ssrt_%s_%s.bam' % (path_root, file_tag, CLEAN_TAG,
                                          PICARD_TAG)
    metrics_file_path = '%s%ssrt_%s_%s_metrics.txt' % (path_root, file_tag,
                                                       CLEAN_TAG, PICARD_TAG)

    if os.path.exists(out_bam_path):
        util.info("BAM file %s already exists. Skipping SAM cleanup" %
                  out_bam_path)
        return out_bam_path

    samtools_exe = exe.EXE['samtools']

    util.info(
        "Converting SAM file from genome aligner output into sorted BAM...")

    cmd_args = [
        samtools_exe, 'sort', '-O', 'bam', '-o', bam_file_path, sam_file_path
    ]
    util.call(cmd_args)

    util.info(
        'Removing unmapped reads, PCR duplicates and low quality ones (MAPQ smaller than 1) keeping only paired reads which are properly mapped...'
    )  # Log strains individually

    # Use the configured samtools executable, like the sort and index steps,
    # and close the output handle once the filtered BAM has been written.
    cmd_args = [
        samtools_exe, 'view', '-b', '-f', '3', '-F', '4', '-q', '1',
        bam_file_path
    ]
    with open(clean_bam_path, 'wb') as clean_bam_obj:
        util.call(cmd_args, stdout=clean_bam_obj)

    util.info("Marking duplicate reads using Picard")

    cmd_args = list(util.JAVA)
    cmd_args += [
        '-jar', exe.EXE['picard'], 'MarkDuplicatesWithMateCigar',
        'I=%s' % clean_bam_path,
        'O=%s' % out_bam_path,
        'M=%s' % metrics_file_path
    ]

    cwd = os.getcwd()
    os.chdir('/')  # Picard picky about relative paths

    try:
        util.call(cmd_args)
    finally:
        # Always restore the working directory, even if Picard fails.
        os.chdir(cwd)

    util.info("Indexing %s" % out_bam_path)
    util.call([samtools_exe, 'index', out_bam_path])

    util.info('Done BAM clean-up for strain %s' % strain_name)

    return out_bam_path
Beispiel #13
0
 def execute_CAM(self):
     """Collect the GUI settings and start CAM, via qsub or locally.

     The samples file comes from the "Create" or "Upload" widget depending on
     ``self.csv_opt``. For an uploaded file, the contrast is the 4th
     tab-separated field of its first line. If neither option is selected an
     error message is shown and nothing is started.
     """
     if self.csv_opt == "Create":
         self.csv_file = self.csv_create.csv_file
         self.contrast = 'Condition'
     elif self.csv_opt == "Upload":
         self.csv_file = self.csv_upload.csv_file
         with open(self.csv_file, 'r') as fileObj:
             header = fileObj.readline().rstrip('\n').split('\t')
         self.contrast = header[3]
     else:
         show_error_message('Please provide a samples file.')
         self.csv_file = None
         self.contrast = None
         # Without a samples file the command line cannot be built (the
         # argument list would contain None), so stop here instead of
         # failing later.
         return
     self.soft = self.soft_opt.selected
     self.lib = self.lib_opt.selected
     self.seq = self.seq_opt.selected
     self.al = self.al_opt.selected
     self.fa_file = self.fa_file_frame.lbox.text()
     self.tgalore_args = self.tgalore.lbox.text()
     self.fastqc_args = self.fastqc.lbox.text()
     self.al_args = self.aligner.lbox.text()
     self.cpu_args = self.cpu.lbox.text()
     self.flags = []
     # Arguments to run CAM
     args = [self.csv_file, self.fa_file]
     soft_dict = {'MAGeCK': 'mageck', 'Bagel': 'bagel'}
     dict_aux = {'Bowtie': 'bowtie', 'Bowtie2': 'bowtie2'}
     dict_guides = {'Bassik': 'bassik', 'Other': 'other'}
     dict_args = {
         'al': dict_aux[self.al],
         'crispr_software': soft_dict[self.soft],
         'guide_library': dict_guides[self.lib]
     }
     # Free-text tool options are passed on wrapped in double quotes.
     if len(self.tgalore_args) > 0:
         dict_args['trim_galore'] = '"%s"' % self.tgalore_args
     if len(self.fastqc_args) > 0:
         dict_args['fastqc_args'] = '"%s"' % self.fastqc_args
     if len(self.al_args) > 0:
         dict_args['aligner_args'] = '"%s"' % self.al_args
     if self.seq == 'single-end':
         self.flags.append('-se')
     if len(self.cpu_args) > 0:
         dict_args['cpu'] = self.cpu_args
     # Each option becomes "-key=value" on the CAM command line.
     for key, item in dict_args.items():
         key = '-' + key
         aux = '='.join([key, str(item)])
         args.append(aux)
     args += self.flags
     # Run CAM on the LMB cluster as a qsub job
     if self.qsub.isChecked():
         if self.seq == 'paired-end':
             args = args + ['-pe', self.pe_tags]
         command = ' '.join(args)
         command = 'module load python3/3.7.1\nmodule load multiqc\npython3 /net/nfs1/public/genomics/CAM/CAM.py %s ' % (
             command)
         # The job script is written to the current working directory and
         # removed again once it has been submitted.
         temp = 'job_' + util.get_rand_string(5) + ".sh"
         with open(temp, 'w') as tempObj:
             tempObj.write(command)
         qsubArgs = ['qsub', '-cwd', '-j', 'y', '-V']
         if self.node.isChecked():
             qsubArgs = qsubArgs + ['-l', 'dedicated=24', temp]
         else:
             if len(self.cpu_args) > 0:
                 cpu = dict_args['cpu']
             else:
                 cpu = '4'
             qsubArgs = qsubArgs + ['-pe', 'smp', cpu, temp]
         util.call(qsubArgs)
         show_pop_up(msg='Job submitted to LMB cluster!')
         os.remove(temp)
     # Run CAM on local machine
     else:
         if self.seq == 'paired-end':
             args = args + ['-pe'] + self.pe_tags.split(' ')
         CAM = '%s/CAM.py' % os.path.dirname(os.path.realpath(__file__))
         args = ['python3', CAM] + args
         util.call(args, shell=True)
Beispiel #14
0
def run_aligner(trimmed_fq,fastq_dirs,aligner='bowtie2',guide_library='bassik',reference_fasta=None,genome_index=None,num_cpu=util.MAX_CORES, is_single_end=True,pair_tags=['r_1','r_2'],aligner_args=None,convert_to_bam=True):
    """Align trimmed single-end reads with bowtie or bowtie2.

    Args:
        trimmed_fq: FASTQ files to align.
        fastq_dirs: Output directories; only the first entry is used.
        aligner: 'bowtie' or 'bowtie2'. Any other value does nothing.
        guide_library: 'bassik' adds ``-5 1`` to the default aligner options.
        reference_fasta: Reference FASTA; only required when ``genome_index``
            is None (the index is then looked for, or built, in a
            ``bt-genome/`` / ``bt2-genome/`` directory beside the FASTA).
        genome_index: Index prefix handed to the aligner.
        num_cpu: Number of aligner threads (``-p``).
        is_single_end: Must be true; paired-end alignment is not implemented.
        pair_tags: Unused. The list default is never mutated.
        aligner_args: List of aligner options replacing the defaults.
        convert_to_bam: Convert every SAM file to BAM afterwards.

    Returns:
        List of BAM paths (``convert_to_bam is True``) or SAM paths, in the
        order of ``trimmed_fq``; None for an unsupported aligner.

    Raises:
        NotImplementedError: If ``is_single_end`` is false. This used to end
            in an opaque ``TypeError`` when iterating over ``None``.
    """
    if aligner not in ('bowtie', 'bowtie2'):
        # Unchanged behaviour: other aligners are silently ignored.
        return None

    if not is_single_end:
        raise NotImplementedError(
            'run_aligner only supports single-end reads; paired-end '
            'alignment is not implemented')

    is_bowtie = (aligner == 'bowtie')

    # Generate genome indexes if not provided. The default location is only
    # derived from reference_fasta when it is actually needed, so callers
    # supplying genome_index no longer have to pass a FASTA path as well.
    if genome_index is None:
        index_dir = os.path.dirname(reference_fasta) + ('/bt-genome/' if is_bowtie else '/bt2-genome/')
        index_builder = 'bowtie-build' if is_bowtie else 'bowtie2-build'
        util.warn('Folder where %s indices are located hasn\'t been specified. Program will default to %s...' % (aligner,index_dir))
        fasta_stem = '.'.join(os.path.basename(reference_fasta).split('.')[:-1])
        index_prefix = index_dir + fasta_stem
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
            # Name the aligner actually in use (was always "Bowtie2").
            util.info('%s indices not found. Generating indices...' % ('Bowtie' if is_bowtie else 'Bowtie2'))
            util.call([index_builder,reference_fasta,index_prefix])
        genome_index = index_prefix

    # Alignment
    util.info('Aligning reads using %s...' % aligner)

    if is_bowtie:
        ext = 'bt'
        sam_args = ['-S','--no-unal'] if convert_to_bam else []
        if aligner_args is None:
            # Allow no mismatches and report reads that align only once.
            aligner_args = ['-v', '0', '-m', '1', '--strata', '--best']
            if guide_library == 'bassik':
                aligner_args = aligner_args + ['-5','1']
    else:
        ext = 'bt2'
        if aligner_args is None:
            # The SAM header is only kept when the file is converted to BAM.
            header_opt = [] if convert_to_bam else ['--no-hd']
            # Allow no mismatches and no pre-alignment before the multiseed
            # heuristic.
            aligner_args = ['-N','0','--no-1mm-upfront','--score-min', 'L,0,0', '--no-unal'] + header_opt
            if guide_library == 'bassik':
                aligner_args = aligner_args + ['-5','1']

    def bam_path_for(sam):
        # BAM path derived from a SAM path: same directory, last extension
        # replaced by ".bam".
        stem = '.'.join(os.path.basename(sam).split('.')[:-1])
        return os.path.dirname(sam) + '/' + stem + '.bam'

    # One (input FASTQ, output SAM, log file) triple per input file.
    # NOTE(review): as before, every output goes to fastq_dirs[0]; the
    # original index counter was never incremented - confirm whether one
    # directory per FASTQ file was intended.
    jobs = []
    for fq in trimmed_fq:
        out_root = fastq_dirs[0] + '/' + os.path.basename(fq)
        jobs.append((fq, out_root + '.%s.sam' % ext, out_root + '.%s.log' % ext))

    file_list = []
    for fq, sam, log in jobs:
        check_exists = bam_path_for(sam) if convert_to_bam else sam
        file_list.append(sam)
        # pragui.exists_skip() determines if the next step should go ahead.
        # It skips the next step if the file path provided exists, which
        # prevents overwriting files and also saves processing time.
        if pragui.exists_skip(check_exists):
            if is_bowtie:
                cmdArgs = [aligner] + aligner_args + ['-p',str(num_cpu), genome_index,fq] + sam_args + [sam]
            else:
                cmdArgs = [aligner] + aligner_args + ['-p',str(num_cpu),'-x', genome_index,'-U', fq, '-S', sam]
            util.call(cmdArgs,stderr=log)

    # Convert sam to bam
    if convert_to_bam is True:
        file_list = []
        for fq, sam, log in jobs:
            bam = bam_path_for(sam)
            if pragui.exists_skip(bam):
                file_list.append(convert_sam_to_bam(sam=sam))
            else:
                file_list.append(bam)

    return file_list
def cross_fil_background(strain_vcf_files, out_vcf_path=None, min_num_obs=3,
                         ref_allele=None):
    """Build a background VCF of variants seen in several strains.

    The input VCFs are combined with ``vcfcombine``, sorted with
    ``vcfstreamsort -a`` and then filtered: a data line is kept when its
    number of observations is at least ``min_num_obs``.

    Args:
        strain_vcf_files: Paths of the per-strain VCF files.
        out_vcf_path: Output path; defaults to ``bg_s<N>_m<min>.vcf`` in the
            current working directory.
        min_num_obs: Minimum number of observations for a line to be kept.
        ref_allele: The function body always read this name but it was never
            defined; it is now an optional trailing parameter. When None, the
            observation count is the number of samples minus the genotype
            fields equal to ".". Otherwise it is the number of samples minus
            the number of (element of ``ref_allele``, genotype field) pairs
            where the element occurs in the field.
    """
    for file_path in strain_vcf_files:
        is_ok, msg = util.check_regular_file(file_path)

        if not is_ok:
            util.critical(msg)

    if not out_vcf_path:
        out_vcf_path = 'bg_s%d_m%d.vcf' % (len(strain_vcf_files), min_num_obs)

    file_root, file_ext = os.path.splitext(out_vcf_path)

    comb_vcf_path = '%s_comb_input.vcf' % file_root
    temp_comb_vcf_path = util.get_temp_path(comb_vcf_path)

    util.info('Creating background VCF file for %d input files' % (len(strain_vcf_files)))

    # Combine each strain's diploid variants into a combined VCF
    # vcfcombine: Combine multiple VCF files together, handling samples when
    # alternate allele descriptions are identical
    # vcfintersect -u any better?

    cmd_args = [exe.EXE['vcfcombine']]
    cmd_args += strain_vcf_files
    util.call(cmd_args, stdout=temp_comb_vcf_path)

    # Check chromosome sorting

    cmd_args = [exe.EXE['vcfstreamsort'], '-a']
    util.call(cmd_args, stdin=temp_comb_vcf_path, stdout=comb_vcf_path)

    os.unlink(temp_comb_vcf_path)

    # Filter on number of samples represented in the genotype fields
    # i.e. for vars that occur at least a given number of times across strains

    num_samples = None  # Filled from header info

    # Both handles are closed on exit; the output file used to stay open.
    with open(out_vcf_path, 'w') as out_file_obj, open(comb_vcf_path) as file_obj:
        write = out_file_obj.write

        for line in file_obj:
            if line[0] == '#':
                if line[1:6] == 'CHROM':
                    header = line.split()

                    if len(header) < 10:
                        util.critical('Cannot filter sample genotypes in a VCF file without FORMAT and sample/genotype information')

                    else:
                        # Sample names could be used in future to track which
                        # strains are selected.
                        num_samples = len(header[9:])

                write(line)

            else:
                genotypes = line.split()[9:]

                if ref_allele is None:
                    num_obs = num_samples - genotypes.count('.')

                else:
                    matches = []
                    for allele in ref_allele:
                        matches.extend(allele in g for g in genotypes)
                    num_obs = num_samples - matches.count(True)

                if num_obs >= min_num_obs:
                    write(line)

    util.info('Background VCF file output at "%s"' % (out_vcf_path, ))
Beispiel #16
0
def subtract_background(strain_vcf_path, background_vcf_path,
                        genome_fasta_path, out_dir, genome_version,
                        interval_length, output_tag):
    """Remove background variants from a strain VCF and annotate the result.

    Runs GATK SelectVariants with ``--discordance`` against the background
    VCF, then SnpEff on the filtered VCF and SnpSift ``extractFields`` on the
    SnpEff output.

    Args:
        strain_vcf_path: VCF file of the strain to filter.
        background_vcf_path: VCF file holding the background variants.
        genome_fasta_path: Reference FASTA handed to GATK via ``-R``.
        out_dir: Output directory; when false-y, outputs go beside the
            strain VCF.
        genome_version: Genome name passed to SnpEff.
        interval_length: Value for SnpEff's ``-upDownStreamLen`` option.
        output_tag: Text placed between the strain and background file roots
            in the output file names.
    """
    bg_name = os.path.basename(os.path.splitext(background_vcf_path)[0])

    util.info('Filtering VCF file %s' % strain_vcf_path)

    out_root = os.path.splitext(strain_vcf_path)[0]
    if out_dir:
        out_root = os.path.join(out_dir, os.path.basename(out_root))

    # Combines background name and sample/strain name
    out_root = '%s%s%s' % (out_root, output_tag, bg_name)

    # Avoid overwrites; the root is re-derived afterwards because a
    # substitute name may have been returned.
    filtered_vcf = util.get_safe_file_path(out_root + '.vcf')
    out_root = os.path.splitext(filtered_vcf)[0]

    snpeff_vcf = out_root + '_SnpEff.vcf'
    snpeff_summary = out_root + '_summary.html'
    snpsift_table = out_root + '_SnpSift.tabular'

    java = list(util.JAVA)

    select_cmd = java + ['-jar', exe.EXE['gatk'], '-T', 'SelectVariants']
    select_cmd += ['-R', genome_fasta_path, '-V', strain_vcf_path]
    select_cmd += ['--discordance', background_vcf_path, '-o', filtered_vcf]
    util.call(select_cmd)

    util.info('Running SnpEff on %s' % filtered_vcf)

    # Run SnpEff on resulting VCF file
    snpeff_cmd = java + ['-jar', exe.EXE['snpeff'], '-v']
    snpeff_cmd += ['-upDownStreamLen', str(interval_length)]
    snpeff_cmd += ['-stats', snpeff_summary, genome_version, filtered_vcf]
    util.call(snpeff_cmd, stdout=snpeff_vcf)

    # Create tabular output from VCF file using SnpSift
    util.info('Running SnpSift on %s' % snpeff_vcf)

    # NOTE(review): 'ANN[*].IMPACT:' carries a trailing colon unlike the
    # other field names - confirm this is intended.
    fields = [
        'CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'DP',
        'ANN[*].ERRORS', 'ANN[*].GENEID', 'ANN[*].GENE', 'ANN[*].BIOTYPE',
        'ANN[*].TRID', 'ANN[*].RANK', 'ANN[*].EFFECT', 'ANN[*].IMPACT:',
        'ANN[*].HGVS_P', 'ANN[*].HGVS_C', 'ANN[*].CDS_POS', 'ANN[*].CDS_LEN',
        'ANN[*].DISTANCE',
    ]
    snpsift_cmd = java + ['-jar', exe.EXE['snpsift'], 'extractFields']
    snpsift_cmd += [snpeff_vcf, '-s', ',', '-e', '.']
    snpsift_cmd += fields
    util.call(snpsift_cmd, stdout=snpsift_table)

    util.info('Results saved to %s and similarly named analysis files' %
              filtered_vcf)