Example #1
def sam_parser_parallel(file_list, convert_to_bam, aligner, num_cpu=util.MAX_CORES, remove_sam=True):
  
  util.info('Parsing sam files to get guide counts...')
  
  def sam_parser(sam_file, aligner, remove_sam=True):
    ext = '.sam'
    if sam_file.endswith('.bam'):
      ext = '.bam'
    # Slice the extension off: str.strip() removes characters, not a suffix
    file_root = sam_file[:-len(ext)]
    counts_file = file_root + '_lib_guidecounts.txt'
    counts_log = file_root + '_lib_guidecounts.log'
    sam_parser_to_guide_counts = os.path.dirname(os.path.realpath(__file__)) + '/sam_parser_to_guide_counts.sh'
    if convert_to_bam:
      util.info('Removing sam header from %s in order to proceed to read counting...' % sam_file)
      # Remove header
      temp = file_root + '_temp.sam'
      # Remove unaligned reads if there are any. 
      # This is particularly important when using bowtie because the --no-unal flag doesn't really work.
      cmdArgs = ['samtools','view','-F','4',sam_file,'-o',temp] 
      util.call(cmdArgs)
      util.info('Counting reads from %s...' % sam_file)
      cmdArgs = [sam_parser_to_guide_counts,temp,counts_file,aligner]
      util.call(cmdArgs,stderr=counts_log)
      os.remove(temp)
    else:
      util.info('Counting reads from %s...' % sam_file)
      cmdArgs = [sam_parser_to_guide_counts,sam_file,counts_file,aligner]
      util.call(cmdArgs,stderr=counts_log)
    if remove_sam and ext == '.sam':
      os.remove(sam_file)
    return(counts_file)
 
  common_args = [aligner, remove_sam]
  counts_file_list = util.parallel_split_job(sam_parser,file_list,common_args,num_cpu)
  return(counts_file_list)
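
A note on the extension handling in sam_parser: the original code used str.strip(ext), which treats its argument as a set of characters to remove from both ends rather than as a suffix, which is why the slice above is used instead. A quick standalone demonstration:

import os

print('sample.sam'.strip('.sam'))         # -> 'ple' (characters stripped, wrong)
print('sample.sam'[:-len('.sam')])        # -> 'sample' (suffix sliced off)
print(os.path.splitext('sample.sam')[0])  # -> 'sample' (idiomatic equivalent)
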
def gatk_merge_vcfs(dir_name,
                    strain_vcf_paths,
                    genome_fasta_path,
                    num_cpu=util.MAX_CORES):

    merge_file_path = _get_merged_vcf_path(dir_name, strain_vcf_paths.keys(),
                                           CALLER_GATK)

    if os.path.exists(merge_file_path):
        util.info('%s already exists and won\'t be overwritten...' %
                  merge_file_path)

    else:
        cmd_args = list(util.JAVA) + [
            '-jar',
            exe.EXE[CALLER_GATK],
            '-T',
            'GenotypeGVCFs',
            '-R',
            genome_fasta_path,
            #'-nt', str(min(8, num_cpu)), # Seems to fail with multiple CPU threads...
            '-o',
            merge_file_path
        ]

        for strain in strain_vcf_paths:
            cmd_args += ['-V', strain_vcf_paths[strain]]

        util.call(cmd_args)

    return merge_file_path
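
For orientation, this is roughly the command line gatk_merge_vcfs assembles, sketched with placeholder paths and assuming util.JAVA is simply ['java'] (GATK 3.x syntax, matching the -T GenotypeGVCFs call above):

java = ['java']  # stand-in for util.JAVA
cmd = java + ['-jar', 'GenomeAnalysisTK.jar', '-T', 'GenotypeGVCFs',
              '-R', 'genome.fa', '-o', 'merged_A_B_comb_gatk.vcf']
for vcf in ('strain_A.g.vcf', 'strain_B.g.vcf'):  # hypothetical strain gVCFs
    cmd += ['-V', vcf]
print(' '.join(cmd))
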
Example #3
def convert_sam_to_bam(sam):
  util.info('Converting %s to BAM format to save disk space...' % sam)
  bam_file = os.path.splitext(sam)[0] + '.bam'  # splitext, since str.strip() drops characters rather than the suffix
  cmdArgs = ['samtools','view','-bh',sam,'-o',bam_file]
  util.call(cmdArgs)
  os.remove(sam)
  return(bam_file)
def _get_merged_vcf_path(dir_name, strain_names, tag):

    sort_strains = sorted(strain_names)

    if len(strain_names) < 7:
        strain_text = '_'.join(sort_strains)
        merge_file_path = os.path.join(
            dir_name, 'merged_%s_comb_%s.vcf' % (strain_text, tag))

    else:
        num_strains = len(sort_strains)
        merge_file_path = os.path.join(
            dir_name, 'merged_%d_comb_%s.vcf' % (num_strains, tag))

    util.info('Strains to be combined:\n%s\n' % ' '.join(sort_strains))
    """
  if os.path.exists(merge_file_path):
    util.warn('%s already exists and won\'t be overwritten...' % merge_file_path)
    
    i = 0
    merge_file_path = '%s_%03d.vcf' % (merge_file_path[:-4], i)
    
    while os.path.exists(merge_file_path):
      i += 1
      merge_file_path = '%s_%03d.vcf' % (merge_file_path[:-8], i)
      
    util.info('Results will be saved in %s' % merge_file_path)
 
  """

    return merge_file_path
def call_genotype_gatk(strain_bam_paths, genome_fasta_path, num_cpu, out_dir,
                       sub_dir_name):
    # GATK pipeline - parallelise strains in python

    genome_index_file = genome_fasta_path + '.fai'
    genome_dict_file = os.path.splitext(genome_fasta_path)[0] + '.dict'

    if not os.path.exists(genome_index_file):
        util.info('Making index for genome FASTA file %s' % genome_fasta_path)
        cmd_args = [exe.EXE['samtools'], 'faidx', genome_fasta_path]
        util.call(cmd_args)

    if not os.path.exists(genome_dict_file):
        util.info('Making dict file for genome FASTA file %s' %
                  genome_fasta_path)
        cmd_args = [
            exe.EXE['samtools'], 'dict', genome_fasta_path, '-o',
            genome_dict_file
        ]
        util.call(cmd_args)

    strains = sorted(strain_bam_paths)

    bam_paths = [strain_bam_paths[s] for s in strains
                 ]  # Each parallel call will be sent one of these
    common_args = [genome_fasta_path, sub_dir_name]  # All tasks share this

    vcf_paths = util.parallel_split_job(gatk_haplotype_job,
                                        bam_paths,
                                        common_args,
                                        num_cpu,
                                        collect_output=True)
    # BAM and VCF path will be in corresponding order

    # Multi-sample
    strain_vcf_paths = {strains[i]: p for i, p in enumerate(vcf_paths)}
    merged_vcf_path = gatk_merge_vcfs(out_dir,
                                      strain_vcf_paths,
                                      genome_fasta_path,
                                      num_cpu=num_cpu)

    return merged_vcf_path
Example #6
def cross_fil_subtract(strain_vcf_paths,
                       background_vcf_path,
                       genome_fasta_path,
                       genome_version,
                       out_dir=None,
                       num_cpu=util.MAX_CORES,
                       interval_length=DEFAULT_INTERVAL_LENGTH,
                       output_tag=OUTPUT_TAG):

    # Accepts multiple inputs, which can all be done in parallel
    # This function is just a parallelisation wrapper for subtract_background()

    common_args = [
        background_vcf_path, genome_fasta_path, genome_version, out_dir,
        interval_length, output_tag
    ]

    util.parallel_split_job(subtract_background, strain_vcf_paths, common_args,
                            num_cpu)

    util.info('cross_fil_subtract done!')
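
util.parallel_split_job is used throughout this module with the convention visible here: the first list is split across workers and common_args are appended to every call, so each worker effectively runs func(item, *common_args). A hypothetical stand-in with those assumed semantics (not the actual util implementation):

from multiprocessing import Pool

def parallel_split_job_sketch(func, items, common_args, num_cpu,
                              collect_output=False):
    # Each worker runs func(item, *common_args); outputs keep input order
    with Pool(num_cpu) as pool:
        async_results = [pool.apply_async(func, (item,) + tuple(common_args))
                         for item in items]
        outputs = [res.get() for res in async_results]
    return outputs if collect_output else None
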
def gatk_haplotype_job(bam_file_path, genome_fasta_path, sub_dir_name):

    path_root, file_ext = os.path.splitext(bam_file_path)
    dir_name, file_root = os.path.split(path_root)

    vcf_dir_name = os.path.join(dir_name, sub_dir_name)
    # vcf_file_path  = os.path.join(vcf_dir_name, '%s_hap.vcf' % (file_root))
    gvcf_file_path = os.path.join(vcf_dir_name, '%s_hap.g.vcf' % (file_root))

    if os.path.exists(gvcf_file_path):
        util.info(
            "VCF file %s already exists. Skipping haplotype calling for %s" %
            (gvcf_file_path, file_root))
        return gvcf_file_path

    util.makedirs(vcf_dir_name, exist_ok=True)

    util.info('Creating GVCF file for %s using GATK' % file_root)

    cmd_args = list(util.JAVA) + [
        '-jar',
        exe.EXE[CALLER_GATK],
        '-T',
        'HaplotypeCaller',
        '-R',
        genome_fasta_path,
        '-I',
        bam_file_path,
        '-o',
        gvcf_file_path,
        '-ERC',
        'GVCF',
        '-variant_index_type',
        'LINEAR',  # Deprecated for GATK 4.0
        '-variant_index_parameter',
        '128000'
    ]  # Deprecated for GATK 4.0
    util.call(cmd_args)

    return gvcf_file_path
Example #8
def plot_coverage(coverage_files, output=None):
  outdir = os.path.dirname(os.path.dirname(coverage_files[0]))
  if output is None:
    output = util.get_rand_string(8)
    util.info("Header for output files has not been specified. Random string %s will be used instead...") % output
  all_strains_cov = "%s/%s_all_strains_cov.txt" % (outdir,output)
  fileObj1 = open(all_strains_cov,"w")

  fileObj1.write("Genome_cov\tExon_cov\n")

  for f in coverage_files:
    strain = f.split("/")
    strain = strain[-1]
    strain = strain.split("_")
    strain = strain[0]
    fileObj = open(f,"r")
    line1 =  fileObj.readline()
    line1 = line1.split(" ")
    genome_cov = line1[4]
    genome_cov = genome_cov.rstrip()
    line2 = fileObj.readline()
    line2 = line2.split(" ")
    exon_cov = line2[7]
    exon_cov = exon_cov.rstrip()
    towrite = "\t".join([strain,genome_cov,exon_cov]) + "\n"
    fileObj1.write(towrite)
    fileObj.close()

  fileObj1.close()
  
  util.info("File saved as %s..." % all_strains_cov)
  
  cmdArgs = ['Rscript', '--vanilla', exe.EXE['phc'], all_strains_cov, output]
  
  util.call(cmdArgs)
def gatk_select_vars(strain_name,
                     merged_vcf_path,
                     genome_fasta_path,
                     file_tag='extracted',
                     homozygous=True):

    dir_name, file_name = os.path.split(merged_vcf_path)

    out_vcf_path = os.path.join(dir_name,
                                '%s_%s.vcf' % (strain_name, file_tag))
    # Original naming: "%s_sorted_f3_F4_q1_mark_dups_w_mate_cig_gatk_hap_call_extracted.vcf" % strain_name

    util.info('Creating VCF file for %s' % strain_name)

    cmd_args = list(util.JAVA) + [
        '-jar', exe.EXE[CALLER_GATK], '-T', 'SelectVariants', '-R',
        genome_fasta_path, '-V', merged_vcf_path, '-o', out_vcf_path, '-sn',
        'sample_%s' % strain_name
    ]

    if homozygous:
        cmd_args += [
            '-select',
            "vc.getGenotype('sample_%s').isHomVar()" % strain_name
        ]  # Check quotes

    else:
        # cmd_args += ['-select', "! vc.getGenotype('sample_%s').isHomRef()" % strain_name]
        cmd_args += [
            '-select',
            "vc.getGenotype('sample_%s').isHomVar() || vc.getGenotype('sample_%s').isHet() && ! vc.getGenotype('sample_%s').isHomRef()"
            % (strain_name, strain_name, strain_name)
        ]

    util.call(cmd_args)
    util.info('All done for strain %s. VCF file can be found in %s' %
              (strain_name, out_vcf_path))
Example #10
def sam_cleanup(sam_file_path, num_cpu=2):

    file_tag = util.FILE_TAG

    path_root, file_ext = os.path.splitext(sam_file_path)
    strain_name = os.path.basename(path_root)

    bam_file_path = '%s%ssrt.bam' % (path_root, file_tag)
    clean_bam_path = '%s%ssrt_%s.bam' % (path_root, file_tag, CLEAN_TAG)
    out_bam_path = '%s%ssrt_%s_%s.bam' % (path_root, file_tag, CLEAN_TAG,
                                          PICARD_TAG)
    metrics_file_path = '%s%ssrt_%s_%s_metrics.txt' % (path_root, file_tag,
                                                       CLEAN_TAG, PICARD_TAG)

    if os.path.exists(out_bam_path):
        util.info("BAM file %s already exists. Skipping SAM cleanup" %
                  out_bam_path)
        return out_bam_path

    util.info(
        "Converting SAM file from genome aligner output into sorted BAM...")

    cmd_args = [
        exe.EXE['samtools'],
        'sort',
        '-O',
        'bam',
        #             '-@', str(num_cpu),
        '-o',
        bam_file_path,
        sam_file_path
    ]

    util.call(cmd_args)

    util.info(
        'Removing unmapped reads, PCR duplicates and low-quality reads (MAPQ below 1), keeping only properly mapped read pairs...'
    )  # Log strains individually

    cmd_args = [
        exe.EXE['samtools'], 'view', '-b', '-f', '3', '-F', '4', '-q', '1',
        bam_file_path
    ]
    util.call(cmd_args, stdout=open(clean_bam_path, 'wb'))

    util.info("Marking duplicate reads using Picard")

    cwd = os.getcwd()
    os.chdir('/')  # Picard picky about relative paths

    cmd_args = list(util.JAVA)
    cmd_args += [
        '-jar', exe.EXE['picard'], 'MarkDuplicatesWithMateCigar',
        'I=%s' % clean_bam_path,
        'O=%s' % out_bam_path,
        'M=%s' % metrics_file_path
    ]

    util.call(cmd_args)

    os.chdir(cwd)

    util.info("Indexing %s" % out_bam_path)
    util.call([exe.EXE['samtools'], 'index', out_bam_path])

    util.info('Done BAM clean-up for strain %s' % strain_name)

    return out_bam_path
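
The samtools filter above (-f 3 -F 4 -q 1) keeps reads whose FLAG has both 0x1 (paired) and 0x2 (properly paired) set, whose 0x4 (unmapped) bit is clear, and whose MAPQ is at least 1. A small sketch of the same test applied to raw FLAG/MAPQ values:

def passes_cleanup_filter(flag, mapq):
    # Mirrors samtools view -b -f 3 -F 4 -q 1
    PAIRED, PROPER_PAIR, UNMAPPED = 0x1, 0x2, 0x4
    required = PAIRED | PROPER_PAIR  # -f 3: both bits must be set
    return (flag & required) == required and not flag & UNMAPPED and mapq >= 1

assert passes_cleanup_filter(flag=99, mapq=60)    # properly paired, mapped
assert not passes_cleanup_filter(flag=4, mapq=0)  # unmapped read
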
def call_genotype_freebayes(strain_bam_paths, genome_fasta_path, num_cpu,
                            out_dir, sub_dir_name):
    # FreeBayes pipeline

    strain_names, bam_file_paths = zip(*list(strain_bam_paths.items()))

    merge_file_path = _get_merged_vcf_path(out_dir, bam_file_paths,
                                           CALLER_FREEBAYES)

    if os.path.exists(merge_file_path):
        util.info("%s exists and won't be overwritten. Skipping..." %
                  merge_file_path)

    else:
        temp_file_path_a = util.get_temp_path(merge_file_path)
        temp_file_path_b = util.get_temp_path(merge_file_path)

        # Make regions for parallelisation, splitting all chromos according to number of CPUs

        chromo_sizes = util.get_bam_chromo_sizes(bam_file_paths[0])

        regions = []
        region_fmt = '%s:%d-%d'

        for chromo, size in chromo_sizes:
            step = int(size / num_cpu) + 1  # will be rounded up

            i = 0
            j = step

            while j < size:
                regions.append(region_fmt % (chromo, i, j))
                i = j
                j += step

            regions.append(region_fmt % (chromo, i, size))

        # Call haplotype for all strains at once, split into parallel regions

        common_args = [genome_fasta_path, bam_file_paths]
        region_vcf_paths = util.parallel_split_job(freebayes_genotype_job,
                                                   regions,
                                                   common_args,
                                                   num_cpu,
                                                   collect_output=True)

        # Combine the regions which were run in parallel

        util.info('Combining freebayes regions')
        out_file_obj = open(temp_file_path_a, 'w')
        write = out_file_obj.write

        for i, region_vcf in enumerate(region_vcf_paths):
            with open(region_vcf) as file_obj:
                for line in file_obj:
                    if '\n' in line:
                        if line[0] == '#':
                            if i == 0:
                                write(line)

                        else:
                            write(line)

                    else:
                        util.critical('No end of line in %s. Exiting...' %
                                      region_vcf)

        out_file_obj.close()
        cmd_args = [exe.EXE['vcfuniq']]
        util.call(cmd_args, stdin=temp_file_path_a, stdout=merge_file_path)

        # Cleanup temp files

        os.unlink(temp_file_path_a)

        for file_path in region_vcf_paths:
            os.unlink(file_path)

    return merge_file_path
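
The region construction above splits every chromosome into roughly num_cpu equal pieces so FreeBayes can run per region in parallel. The same chunking as a standalone sketch with a toy chromosome:

def split_regions(chromo_sizes, num_cpu):
    regions = []
    for chromo, size in chromo_sizes:
        step = int(size / num_cpu) + 1  # +1 so the step is effectively rounded up
        i, j = 0, step
        while j < size:
            regions.append('%s:%d-%d' % (chromo, i, j))
            i, j = j, j + step
        regions.append('%s:%d-%d' % (chromo, i, size))  # final partial piece
    return regions

print(split_regions([('chrI', 10)], 4))
# ['chrI:0-3', 'chrI:3-6', 'chrI:6-9', 'chrI:9-10']
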
Example #12
def cat_fastq(barcode_csv,
              fastq_paths_r1,
              fastq_paths_r2=None,
              out_top_dir=None,
              sub_dir_name=None,
              file_ext=None):

    if not sub_dir_name:
        sub_dir_name = 'strain'

    if fastq_paths_r2:
        fastq_paths_r2_list = fastq_paths_r2
    else:
        fastq_paths_r2_list = []

    for file_path in [barcode_csv] + fastq_paths_r1 + fastq_paths_r2_list:
        is_ok, msg = util.check_regular_file(file_path)

        if not is_ok:
            util.critical(msg)

    if not file_ext:
        file_ext = util.get_file_ext(fastq_paths_r1[0])

    if not out_top_dir:
        file_path = fastq_paths_r1[0]
        out_top_dir = os.path.dirname(file_path)

    # # Concatenate FASTQ files

    sample_barcodes = {}
    barcode_samples = {}

    # Read CSV
    with open(barcode_csv, 'rU') as file_obj:
        csv_data = csv.reader(file_obj)
        header = next(csv_data)

        for seq_run_id, barcode_name, barcode_seq, sample_name in csv_data:
            barcode_name = barcode_name.replace('-', '_')
            sample_name = sample_name.replace(' ', '_')

            if sample_name in barcode_samples:
                util.critical(
                    'Multiple samples/strains with name "%s" present in CSV file %s'
                    % (sample_name, barcode_csv))

            barcode_samples[sample_name] = barcode_name
            sample_barcodes[barcode_name] = (seq_run_id, sample_name)

    # Make subdirs
    for sample_name in barcode_samples:
        dir_name = os.path.join(out_top_dir, sub_dir_name, sample_name)
        util.makedirs(dir_name, exist_ok=True)

    # Concatenate FASTQ files with the same barcode for each read
    # and save results in corresponding strain folder
    # - now makes a symbolic link of only one file for a strain and not gzipped

    strain_fastq_paths = {}

    for barcode_name in sample_barcodes:
        seq_run_id, sample_name = sample_barcodes[barcode_name]
        file_pattern = '%s*%s*' % (seq_run_id, barcode_name)

        if fastq_paths_r2:
            out_file_name_1 = '%s_r_1%s' % (sample_name, file_ext)
            out_file_name_2 = '%s_r_2%s' % (sample_name, file_ext)
            in_fastq_paths_1 = util.match_files(
                fastq_paths_r1,
                file_pattern)  # Read pairs files already separated
            in_fastq_paths_2 = util.match_files(fastq_paths_r2, file_pattern)

            io_paths = [(in_fastq_paths_1, out_file_name_1),
                        (in_fastq_paths_2, out_file_name_2)]

        else:
            out_file_name = '%s%s' % (sample_name, file_ext)
            in_fastq_paths = util.match_files(fastq_paths_r1, file_pattern)

            io_paths = [(in_fastq_paths, out_file_name)]

        fastq_paths = []

        for in_fastq_paths, out_file_name in io_paths:
            if not in_fastq_paths:
                util.critical(
                    'No FASTQ read files found for run %s barcode %s' %
                    (seq_run_id, barcode_name))

            out_fastq_path = os.path.join(out_top_dir, sub_dir_name,
                                          sample_name, out_file_name)

            if os.path.exists(out_fastq_path):
                util.warn(
                    'FASTQ file %s already exists and won\'t be overwritten...'
                    % out_fastq_path)

            else:
                # Concatenate or sym link

                if len(in_fastq_paths
                       ) == 1 and not in_fastq_paths[0].endswith('.gz'):
                    util.info('Sym linking %s reads to %s' %
                              (barcode_name, out_fastq_path))
                    os.symlink(in_fastq_paths[0], out_fastq_path)

                else:
                    with open(out_fastq_path, 'wb') as out_file_obj:
                        util.info('Concatenating %s reads to %s' %
                                  (barcode_name, out_fastq_path))

                        for fastq_path in in_fastq_paths:
                            shutil.copyfileobj(util.open_file(
                                fastq_path,
                                'rb'), out_file_obj)  # Accepts GZIP input

            fastq_paths.append(out_fastq_path)

        strain_fastq_paths[sample_name] = fastq_paths

    return strain_fastq_paths
def cross_fil_background(strain_vcf_files, out_vcf_path=None, min_num_obs=3):
    
  for file_path in strain_vcf_files:
    is_ok, msg = util.check_regular_file(file_path)
  
    if not is_ok:
      util.critical(msg)
  
  if not out_vcf_path:
    out_vcf_path = 'bg_s%d_m%d.vcf' % (len(strain_vcf_files), min_num_obs)
  
  file_root, file_ext = os.path.splitext(out_vcf_path)
  
  comb_vcf_path = '%s_comb_input.vcf' % file_root
  temp_comb_vcf_path = util.get_temp_path(comb_vcf_path)
  
  util.info('Creating background VCF file for %d input files' % (len(strain_vcf_files)))
  
  # Combine each strain's diploid variants into a combined VCF
  # vcfcombine: Combine multiple VCF files together, handling samples when alternate allele descriptions are identical
  # vcfintersect -u any better?
   
  cmd_args = [exe.EXE['vcfcombine']]
  cmd_args += strain_vcf_files
  util.call(cmd_args, stdout=temp_comb_vcf_path)
  
  # Check chromosome sorting 
  
  cmd_args = [exe.EXE['vcfstreamsort'], '-a']
  util.call(cmd_args, stdin=temp_comb_vcf_path, stdout=comb_vcf_path)
  
  os.unlink(temp_comb_vcf_path)
  
  # Filter on number of samples represented in the genotype fields
  # i.e. for vars that occur at least a given number of times across strains
  
  out_file_obj = open(out_vcf_path, 'w')
  write = out_file_obj.write
  num_samples = None # Filled from header info
  ref_allele = None  # Optional per-site reference alleles; not set in this version
  
  with open(comb_vcf_path) as file_obj:
    for line in file_obj:
      if line[0] == '#':
        if line[1:6] == 'CHROM':
          header = line.split()
          
          if len(header) < 10:
            util.critical('Cannot filter sample genotypes in a VCF file without FORMAT and sample/genotype information')
           
          else: 
            sample_names = header[9:] #Could use in future to track which strains are selected
            num_samples = len(sample_names)
            
        write(line)
      
      else:
        data = line.split()      
        genotypes = data[9:]
        if ref_allele is None:
          num_obs = num_samples - genotypes.count('.')
        else:
          genotypes2 = []
          for r in range(len(ref_allele)):
            genotypes2 = genotypes2 + [ref_allele[r] in x for x in genotypes]
          num_obs = num_samples - genotypes2.count(True)
        
        if num_obs >= min_num_obs:
          write(line)      
  
  util.info('Background VCF file output at "%s"' % (out_vcf_path, ))
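
The genotype filter in cross_fil_background counts a sample as observed whenever its genotype column is not the missing marker '.'. Illustrated on a single made-up VCF data line with three samples:

line = 'chrI\t100\t.\tA\tT\t50\tPASS\t.\tGT\t0/1\t.\t1/1'  # hypothetical record
genotypes = line.split()[9:]                 # ['0/1', '.', '1/1']
num_obs = len(genotypes) - genotypes.count('.')
print(num_obs)                               # 2 -> kept when min_num_obs <= 2
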
Example #14
def genome_map(aligner,
               strain_name,
               strain_num,
               fastq_paths,
               genome_index_path,
               genome_fasta_path,
               num_cpu=util.MAX_CORES):

    dir_name, base_name = os.path.split(fastq_paths[0])

    path_root = os.path.join(dir_name, strain_name)
    sam_file_path = '%s.sam' % path_root

    if os.path.exists(sam_file_path):
        util.info("SAM file %s already exists. Skipping genome mapping" %
                  sam_file_path)

    else:
        util.info("Running aligner %s on %s..." % (aligner, strain_name))

        if aligner == ALIGNER_BWA:
            rg_header = "@RG\\tID:%s\\tSM:sample_%s\\tPL:illumina\\tLB:lib%d\\tPU:unit%d" % (
                strain_name, strain_name, strain_num, strain_num)
            cmd_args = [
                exe.EXE[ALIGNER_BWA],
                'mem',
                '-t',
                str(num_cpu),
                '-M',
                '-R',
                rg_header,
                #genome_index_path] + list(fastq_paths)
                genome_fasta_path
            ] + list(fastq_paths)
            util.call(cmd_args, stdout=open(sam_file_path, 'w'))

        elif aligner == ALIGNER_BT2:
            cmd_args = [
                exe.EXE[ALIGNER_BT2],
                '--sensitive',
                '-x',
                genome_index_path,
                '-p',
                str(num_cpu),
                '-q',  # FASTQ input
                '--rg-id',
                strain_name,
                '--rg',
                "SM:sample_%s\tPL:illumina\tLB:lib%d\tPU:unit%d" %
                (strain_name, strain_num, strain_num),
                '-S',
                sam_file_path
            ]

            if len(fastq_paths) > 1:
                cmd_args += ['-1', fastq_paths[0], '-2', fastq_paths[1]]
            else:
                cmd_args += ['-U', fastq_paths[0]]

            util.call(cmd_args)

        else:  # bbmap
            cmd_args = [
                exe.EXE[ALIGNER_BBMAP],
                'ref=%s' % genome_fasta_path,
                'path=%s' % genome_index_path, 'sam=1.3',
                'in=%s' % fastq_paths[0],
                'out=%s' % sam_file_path,
                't=%d' % num_cpu,
                'rgid=%s' % strain_name,
                'rgsm=sample_%s' % strain_name, 'rgpl=illumina',
                'rglb=lib%d' % strain_num,
                'rgpu=unit%d' % strain_num
            ]

            if len(fastq_paths) > 1:
                cmd_args += ['in2=%s' % fastq_paths[1]]

            util.call(cmd_args)

    util.info('Done %s genome alignment for strain %s' %
              (aligner, strain_name))

    return sam_file_path
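
For the BWA branch, the read-group string expands as below (strain name and numbers are hypothetical). The doubled backslashes are deliberate: bwa mem itself interprets the \t escapes inside the -R argument:

rg_header = "@RG\\tID:%s\\tSM:sample_%s\\tPL:illumina\\tLB:lib%d\\tPU:unit%d" % (
    'strainA', 'strainA', 1, 1)
print(rg_header)
# @RG\tID:strainA\tSM:sample_strainA\tPL:illumina\tLB:lib1\tPU:unit1
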
Example #15
def cross_fil_map(barcode_csv,
                  genome_fasta_path,
                  exon_gff_path,
                  fastq_paths_r1,
                  fastq_paths_r2=None,
                  out_top_dir=None,
                  aligner=ALIGNER_BWA,
                  bowtie2_index=None,
                  bbmap_index=None,
                  num_cpu=util.MAX_CORES,
                  sub_dir_name=None,
                  file_ext=None):

    if not sub_dir_name:
        sub_dir_name = 'strain_%s' % aligner

    if fastq_paths_r2:
        fastq_paths_r2_list = fastq_paths_r2
    else:
        fastq_paths_r2_list = []

    #for file_path in [barcode_csv, genome_fasta_path, exon_gff_path] + fastq_paths_r1 + fastq_paths_r2 or []:
    for file_path in [barcode_csv] + fastq_paths_r1 + fastq_paths_r2_list:
        is_ok, msg = util.check_regular_file(file_path)

        if not is_ok:
            util.critical(msg)

    if not file_ext:
        file_ext = util.get_file_ext(fastq_paths_r1[0])

    if not out_top_dir:
        file_path = fastq_paths_r1[0]
        out_top_dir = os.path.dirname(file_path)

    # # Concatenate FASTQ files

    sample_barcodes = {}
    barcode_samples = {}

    # Read CSV
    with open(barcode_csv, 'rU') as file_obj:
        csv_data = csv.reader(file_obj)
        header = next(csv_data)

        for seq_run_id, barcode_name, barcode_seq, sample_name in csv_data:
            barcode_name = barcode_name.replace('-', '_')
            sample_name = sample_name.replace(' ', '_')

            if sample_name in barcode_samples:
                util.critical(
                    'Multiple samples/strains with name "%s" present in CSV file %s'
                    % (sample_name, barcode_csv))

            barcode_samples[sample_name] = barcode_name
            sample_barcodes[barcode_name] = (seq_run_id, sample_name)

    # Make subdirs
    for sample_name in barcode_samples:
        dir_name = os.path.join(out_top_dir, sub_dir_name, sample_name)
        util.makedirs(dir_name, exist_ok=True)

    # Concatenate FASTQ files with the same barcode for each read
    # and save results in corresponding strain folder
    # - now makes a symbolic link of only one file for a strain and not gzipped

    strain_fastq_paths = {}

    for barcode_name in sample_barcodes:
        seq_run_id, sample_name = sample_barcodes[barcode_name]
        file_pattern = '%s*%s*' % (seq_run_id, barcode_name)

        if fastq_paths_r2:
            out_file_name_1 = '%s_r_1%s' % (sample_name, file_ext)
            out_file_name_2 = '%s_r_2%s' % (sample_name, file_ext)
            in_fastq_paths_1 = util.match_files(
                fastq_paths_r1,
                file_pattern)  # Read pairs files already separated
            in_fastq_paths_2 = util.match_files(fastq_paths_r2, file_pattern)

            io_paths = [(in_fastq_paths_1, out_file_name_1),
                        (in_fastq_paths_2, out_file_name_2)]

        else:
            out_file_name = '%s%s' % (sample_name, file_ext)
            in_fastq_paths = util.match_files(fastq_paths_r1, file_pattern)

            io_paths = [(in_fastq_paths, out_file_name)]

        fastq_paths = []

        for in_fastq_paths, out_file_name in io_paths:
            if not in_fastq_paths:
                util.critical(
                    'No FASTQ read files found for run %s barcode %s' %
                    (seq_run_id, barcode_name))

            out_fastq_path = os.path.join(out_top_dir, sub_dir_name,
                                          sample_name, out_file_name)

            if os.path.exists(out_fastq_path):
                util.warn(
                    'FASTQ file %s already exists and won\'t be overwritten...'
                    % out_fastq_path)

            else:
                # Concatenate or sym link

                if len(in_fastq_paths
                       ) == 1 and not in_fastq_paths[0].endswith('.gz'):
                    util.info('Sym linking %s reads to %s' %
                              (barcode_name, out_fastq_path))
                    os.symlink(in_fastq_paths[0], out_fastq_path)

                else:
                    with open(out_fastq_path, 'wb') as out_file_obj:
                        util.info('Concatenating %s reads to %s' %
                                  (barcode_name, out_fastq_path))

                        for fastq_path in in_fastq_paths:
                            shutil.copyfileobj(util.open_file(
                                fastq_path,
                                'rb'), out_file_obj)  # Accepts GZIP input

            fastq_paths.append(out_fastq_path)

        strain_fastq_paths[sample_name] = fastq_paths

    # Genome alignment/mapping
    if aligner == ALIGNER_BT2:
        genome_index = bowtie2_index
    elif aligner == ALIGNER_BBMAP:
        genome_index = bbmap_index
    else:
        genome_index = None

    sam_paths = []
    for i, strain_name in enumerate(strain_fastq_paths):
        sam_path = genome_map(aligner=aligner,
                              strain_name=strain_name,
                              strain_num=i + 1,
                              fastq_paths=strain_fastq_paths[strain_name],
                              genome_index_path=genome_index,
                              genome_fasta_path=genome_fasta_path,
                              num_cpu=num_cpu)
        sam_paths.append(sam_path)

    # Parallel BAM cleanup
    # samtools sort can use multiple cores but is generally I/O bound in any case
    bam_paths = util.parallel_split_job(sam_cleanup,
                                        sam_paths, [],
                                        num_cpu,
                                        collect_output=True)

    # Parallel coverage with BEDtools
    common_args = [genome_fasta_path, exon_gff_path]
    util.parallel_split_job(bedtools_coverage,
                            bam_paths,
                            common_args,
                            num_cpu,
                            collect_output=False)

    util.info('%s finished for %d input strains' %
              (PROG_NAME, len(strain_fastq_paths)))
def cross_fil_genotype(bam_file_paths,
                       genome_fasta_path,
                       var_caller=CALLER_FREEBAYES,
                       num_cpu=util.MAX_CORES,
                       out_dir=None,
                       sub_dir_name=None,
                       homozygous=True):


    # Main function to call genotypes given (cleaned) input BAM files with the option of different caller programs
    # Requires filtered, duplicate marked BAM file for each sample/strain
    if not sub_dir_name:
        sub_dir_name = 'vcf_%s' % var_caller

    if not out_dir:
        out_dir = os.getcwd()

    for file_path in bam_file_paths + [genome_fasta_path]:
        is_ok, msg = util.check_regular_file(file_path)

        if not is_ok:
            util.critical(msg)

    strain_bam_paths = {}
    for bam_file_path in sorted(bam_file_paths):
        file_name = os.path.basename(bam_file_path)

        if util.FILE_TAG not in file_name:
            msg = 'BAM file name %s does not contain %s; it does not appear to have been created by cross_fil_map' % (
                file_name, util.FILE_TAG)
            util.critical(msg)

        sample_name = file_name.split(util.FILE_TAG)[0]
        strain_bam_paths[sample_name] = bam_file_path

    strain_names = list(strain_bam_paths.keys())
    num_strains = len(strain_names)
    # Code for different variant callers splits early as the pipelines differ somewhat

    if var_caller == CALLER_GATK:
        # Create separate gVCFS, combine and call genotype
        merged_vcf_path = call_genotype_gatk(strain_bam_paths,
                                             genome_fasta_path, num_cpu,
                                             out_dir, sub_dir_name)

    else:
        # Call genotype directly on multiple BAM files
        merged_vcf_path = call_genotype_freebayes(strain_bam_paths,
                                                  genome_fasta_path, num_cpu,
                                                  out_dir, sub_dir_name)

    # Select variants for each strain from combined genotype VCF file - parallelise strains in python
    # Only homozygous
    if homozygous:
        file_tag = 'homozy_%s' % var_caller
    # All variants detected per strain
    else:
        file_tag = 'extracted_%s' % var_caller
    common_args = [merged_vcf_path, genome_fasta_path, file_tag, homozygous]
    util.parallel_split_job(gatk_select_vars, strain_names, common_args,
                            num_cpu)

    util.info('%s finished for %d input strains' % (PROG_NAME, num_strains))
Example #17
def run_aligner(trimmed_fq,
                fastq_dirs,
                aligner='bowtie2',
                guide_library='bassik',
                reference_fasta=None,
                genome_index=None,
                num_cpu=util.MAX_CORES,
                is_single_end=True,
                pair_tags=['r_1','r_2'],
                aligner_args=None,
                convert_to_bam=True):
  # Generate genome indexes if not provided
  if aligner == 'bowtie':
    genome_index_default = os.path.dirname(reference_fasta) + '/bt-genome/'
    index_builder = 'bowtie-build'
  elif aligner == 'bowtie2':
    genome_index_default = os.path.dirname(reference_fasta) + '/bt2-genome/'
    index_builder = 'bowtie2-build'
  if aligner in [ 'bowtie', 'bowtie2']:
    if genome_index is None:
      genome_index = genome_index_default
      util.warn('Folder where %s indices are located hasn\'t been specified. Program will default to %s...' % (aligner,genome_index))
      base = os.path.basename(reference_fasta).split('.')[:-1]
      base = '.'.join(base)
      base = genome_index + base
      if not os.path.exists(genome_index):
        os.mkdir(genome_index)
        util.info('%s indices not found. Generating indices...' % aligner)
        cmdArgs = [index_builder,reference_fasta,base]
        util.call(cmdArgs)
      genome_index = base
    
    # Alignment
    util.info('Aligning reads using %s...' % aligner)
    
    def format_aligner_input(trimmed_fq,aligner,aligner_args,is_single_end,convert_to_bam):
      if aligner == 'bowtie':
        ext = 'bt'
      elif aligner == 'bowtie2':
        ext = 'bt2'
      k = 0
      if is_single_end:
        sam_log_list = []
        for f in trimmed_fq:
          fo = os.path.basename(f)
          fo = fastq_dirs[k] + '/' + fo
          sam = fo + '.%s.sam' % ext
          log = fo + '.%s.log' % ext
          sam_log_list.append([f,sam,log])
          k += 1  # advance to the output dir matching the next FASTQ
        return(sam_log_list)
    
    if aligner == 'bowtie':
      if convert_to_bam:
        sam_args = ['-S','--no-unal']
      else:
        sam_args = []
      if aligner_args is None:
        aligner_args = ['-v', '0', '-m', '1', '--strata', '--best'] # allow no mismatches and report reads that align only once
        if guide_library == 'bassik':
          aligner_args = aligner_args + ['-5','1']
      sam_log_list = format_aligner_input(trimmed_fq=trimmed_fq,aligner=aligner,aligner_args=aligner_args,is_single_end=is_single_end,convert_to_bam=convert_to_bam)
      file_list = []
      for f, sam , log in sam_log_list:
        if convert_to_bam:
          wd = os.path.dirname(sam)
          sam_header = os.path.basename(sam).split('.')[:-1]
          sam_header = '.'.join(sam_header)
          check_exists = wd + '/' + sam_header + '.bam'
        else:
          check_exists = sam
        file_list.append(sam)
        if pragui.exists_skip(check_exists):
          # Function pragui.exists_skip() determines if the next step should go ahead.
          # It skips the next step if the file path provided exists.
          # This prevents overwriting files and also saves processing time.
          cmdArgs = [aligner] + aligner_args + ['-p',str(num_cpu), genome_index,f] + sam_args + [sam]
          util.call(cmdArgs,stderr=log)
          
    if aligner == 'bowtie2':
      if convert_to_bam:
        header_opt = []
      else:
        header_opt = ['--no-hd']
      if aligner_args is None:
        # Allow no mismatches and no pre-alignment before multiseed heuristic
        aligner_args = ['-N','0','--no-1mm-upfront','--score-min', 'L,0,0', '--no-unal'] + header_opt
        if guide_library == 'bassik':
          aligner_args = aligner_args + ['-5','1']
      sam_log_list = format_aligner_input(trimmed_fq=trimmed_fq,aligner=aligner,aligner_args=aligner_args,is_single_end=is_single_end,convert_to_bam=convert_to_bam)
      file_list = []
      for f, sam , log in sam_log_list:
        if convert_to_bam:
          wd = os.path.dirname(sam)
          sam_header = os.path.basename(sam).split('.')[:-1]
          sam_header = '.'.join(sam_header)
          check_exists = wd + '/' + sam_header + '.bam'
        else:
          check_exists = sam
        file_list.append(sam)
        if pragui.exists_skip(check_exists):
          cmdArgs = [aligner] + aligner_args + ['-p',str(num_cpu),'-x', genome_index,'-U', f, '-S', sam]
          util.call(cmdArgs,stderr=log)
            
    # Convert sam to bam
    if convert_to_bam:
      file_list = []
      for f, sam , log in sam_log_list:
        wd = os.path.dirname(sam)
        sam_header = os.path.basename(sam).split('.')[:-1]
        sam_header = '.'.join(sam_header)
        check_exists = wd + '/' + sam_header + '.bam'
        if pragui.exists_skip(check_exists):
          file = convert_sam_to_bam(sam=sam)
        else:
          file = check_exists
        file_list.append(file)
    
    return(file_list)
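
run_aligner relies on pragui.exists_skip() as its resume mechanism. Going by the comment above ("It skips the next step if the file path provided exists"), a minimal hypothetical stand-in with the same contract would be:

import os

def exists_skip(file_path):
    # True means 'go ahead': the output file is missing, so run the step
    return not os.path.exists(file_path)
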
Example #18
def tsv_format(counts_file_list, reference_fasta, software='bagel'):
  # software: 'mageck' or 'bagel'
  
  wd = counts_file_list[0].split('/')[:-1]
  wd = "/".join(wd)
  counts_aggregated_file='%s/counts_aggregated_%s.tsv' % (wd,software)
  
  util.info('Generating guide counts file in %s format. Results saved in %s...' % (software,counts_aggregated_file))
  
  # Generates reference list of all library sgRNAs
  fasta_obj = open(reference_fasta,'r')
  sgrnas_list0 = []
  sgRNA_output = []
  gene_output = []
  for line in fasta_obj:
    if '>' in line:
      line = line.rstrip('\n')
      line = line.lstrip('>')
      s,g = line.split("_", 1)
      sgrnas_list0.append(line)
      sgRNA_output.append(g)
      gene_output.append(s)

  # Generates reference Pandas data frame from sgRNA list library file
  if software == 'mageck':
    d0 = {'sgRNA':pd.Series(sgRNA_output),'gene':pd.Series(gene_output),'sgRNA2':pd.Series(sgrnas_list0)}
  elif software == 'bagel':
    d0 = {'SEQID':pd.Series(sgRNA_output),'GENE':pd.Series(gene_output),'sgRNA2':pd.Series(sgrnas_list0)}
  else:
    util.critical('CRISPR software tool must be either mageck or bagel.')
  
  dfjoin1 = pd.DataFrame(d0) #sgRNA/gene column required for MAGeCK, sgRNA2 is needed for join operation (deleted later)


  counts_file_list.sort()
  counts_file_list2 = [w.replace('.txt','') for w in counts_file_list]
  counts_file_list2 = [w.split('/')[-1] for w in counts_file_list2]
  

  # Number of counts files to aggregate
  txtnumber = len(counts_file_list)

  # Generates one named list of lists per counts file, for the join output
  master_count_list1 = [['count_list%d' % (i + 1)] for i in range(txtnumber)]
  master_sgrna_list1 = [['sgrna_list%d' % (i + 1)] for i in range(txtnumber)]

  if software == 'bagel':
    master_joined_count_list1 = [['joined_count_list%d' % (i + 1)]
                                 for i in range(txtnumber)]
  
  # Generates Pandas data frame and adds each of the count files in the folder to it after joining
  counter = 0
  while counter < txtnumber:
      # Opens count files and extract counts and sgRNA names
      file = list(csv.reader(open(counts_file_list [counter])))
    
      for x in file:
          a = str(x)
          if a.count(' ') > 1:
              z,b,c = a.split()
              bint = int(b)
              cmod = c.replace("']","")
              master_count_list1 [counter].append(bint)
              master_sgrna_list1 [counter].append(cmod)
          else:
              b,c = a.split()
              bint = b.replace("['","")
              bint = int(bint)
              cmod = c.replace("']","")
              master_count_list1 [counter].append(bint)
              master_sgrna_list1 [counter].append(cmod)
    
      # Generates Pandas data frame for the data
      d1 = {'sgRNA2':pd.Series(master_sgrna_list1 [counter]),
          counts_file_list2 [counter]:pd.Series(master_count_list1 [counter])}
      df1 = pd.DataFrame(d1)

      # Performs left join to merge Pandas data frames sets:
      dfjoin1 = pd.merge(dfjoin1, df1, on='sgRNA2', how='left')
      dfjoin1 = dfjoin1.fillna(0) #Replaces nan with zero
       
      counter +=1

  # Deletes sgRNA2 column from dataframe (only needed for joining)
  dfjoin2 = dfjoin1.drop(columns='sgRNA2')
 
  # Writes all data to a single .tsv file, ready for either MAGeCK or Bagel
  dfjoin2.to_csv(counts_aggregated_file, sep='\t',index=False)
  
  return(dfjoin2)
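
The aggregation in tsv_format is a series of left joins onto the library reference frame, followed by dropping the join key. A minimal sketch with made-up guides and a single sample column:

import pandas as pd

ref = pd.DataFrame({'sgRNA': ['g1', 'g2'], 'gene': ['A', 'B'],
                    'sgRNA2': ['A_g1', 'B_g2']})              # library reference
sample = pd.DataFrame({'sgRNA2': ['A_g1'], 'sample1': [42]})  # one counts file
merged = pd.merge(ref, sample, on='sgRNA2', how='left').fillna(0)
print(merged.drop(columns='sgRNA2'))
#   sgRNA gene  sample1
# 0    g1    A     42.0
# 1    g2    B      0.0
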
Example #19
def bedtools_coverage(bam_file_path, genome_fasta_path, exon_gff_file_path):

    dir_name, base_name = os.path.split(bam_file_path)
    file_root = os.path.splitext(base_name)[0]
    dir_name = os.path.join(dir_name, 'coverage')

    util.makedirs(
        dir_name,
        exist_ok=True)  # util version rather than os.makedirs, for Python 2 and 3 compatibility

    genome_cvr_file_path = os.path.join(dir_name, base_name + '.genomecov')
    exon_cvr_file_path = os.path.join(dir_name, base_name + '_exon.coverage')
    exon_cvr_temp_file_path = os.path.join(dir_name,
                                           base_name + '_exon.coverage.temp')
    R_cvr_file_path = os.path.join(dir_name, base_name + '_R_coverage.out')

    if os.path.exists(R_cvr_file_path):
        util.info(
            "Coverage file %s already exists. Skipping coverage calculations" %
            (R_cvr_file_path, ))
        return

    bedtools_exe = exe.EXE['bedtools']

    util.info("Running bedtools genomecov...")

    #  cmd_args = [bedtools_exe, 'genomecov', '-ibam', bam_file_path, '-g', genome_fasta_path]
    cmd_args = [bedtools_exe, 'genomecov', '-pc', '-ibam', bam_file_path]
    util.call(cmd_args, stdout=genome_cvr_file_path)

    util.info("Done... Results saved in: %s" % genome_cvr_file_path)
    util.info("Converting %s into a sorted bed file..." % bam_file_path)

    temp_dir = os.path.join(dir_name, 'TEMP_%s' % uuid.uuid4())
    os.makedirs(temp_dir)

    temp_bed_file1 = os.path.join(temp_dir, '%s.bed' % file_root)
    temp_bed_file2 = os.path.join(temp_dir, '%s_sortBed.bed' % file_root)

    cmd_args = [bedtools_exe, 'bamtobed', '-i', bam_file_path]
    util.call(cmd_args, stdout=temp_bed_file1)

    #  cmd_args = [bedtools_exe, 'sort', '-i', temp_bed_file1]
    cmd_args = ['sort', '-k1,1', '-k2,2n', '--batch-size=5',
                temp_bed_file1]  # Update to reduce RAM usage
    util.call(cmd_args, stdout=temp_bed_file2)

    util.info("Done... Results saved in temporary directory: %s" % temp_dir)
    util.info("Running bedtools coverage...")

    #  cmd_args = [bedtools_exe, 'coverage', '-hist' ,'-a', exon_gff_file_path,'-b', temp_bed_file2]
    cmd_args = [
        bedtools_exe, 'coverage', '-sorted', '-hist', '-a', exon_gff_file_path,
        '-b', temp_bed_file2
    ]  # Update to reduce RAM usage
    util.call(cmd_args, stdout=exon_cvr_file_path)

    util.info("Done... Results saved in: %s" % exon_cvr_file_path)

    # In order to calculate exon coverage in R, we need to extract all lines starting with "all" from exon.coverage file
    # This file is saved in a temporary directory

    cmd_args = ['grep', 'all', exon_cvr_file_path]
    util.call(cmd_args, stdout=exon_cvr_temp_file_path)

    util.info(
        "Running R to compute mean genome coverage and mean exon coverage...")

    cmd_args = [
        'Rscript', '--vanilla', exe.EXE['mgcr'], genome_cvr_file_path,
        exon_cvr_temp_file_path
    ]
    util.call(cmd_args, stdout=R_cvr_file_path)

    util.info("Delete temporary directory and files...")
Example #20
def subtract_background(strain_vcf_path, background_vcf_path,
                        genome_fasta_path, genome_version, out_dir,
                        interval_length, output_tag):
    # Parameter order matches common_args as built in cross_fil_subtract

    bg_path_root, bg_file_ext = os.path.splitext(background_vcf_path)
    bg_file_root = os.path.basename(bg_path_root)

    util.info('Filtering VCF file %s' % strain_vcf_path)

    path_root, file_ext = os.path.splitext(strain_vcf_path)

    if out_dir:
        file_root = os.path.basename(path_root)
        path_root = os.path.join(out_dir, file_root)

    path_root = '%s%s%s' % (
        path_root, output_tag, bg_file_root
    )  # Combines background name and sample/strain name

    out_vcf_path = path_root + '.vcf'
    out_vcf_path = util.get_safe_file_path(out_vcf_path)  # Avoid overwrites
    path_root, file_ext = os.path.splitext(
        out_vcf_path
    )  # Path root should have changed if a substitute name was used

    snpeff_vcf_path = path_root + '_SnpEff.vcf'
    snpeff_summ_path = path_root + '_summary.html'
    snpsift_tab_path = path_root + '_SnpSift.tabular'

    cmd_args = list(util.JAVA) + [
        '-jar',
        exe.EXE['gatk'],
        '-T',
        'SelectVariants',
        '-R',
        genome_fasta_path,
        '-V',
        strain_vcf_path,
        '--discordance',
        background_vcf_path,
        '-o',
        out_vcf_path,
    ]

    util.call(cmd_args)

    util.info('Running SnpEff on %s' % out_vcf_path)

    # Run SnpEff on resulting VCF file

    cmd_args = list(util.JAVA) + [
        '-jar', exe.EXE['snpeff'], '-v', '-upDownStreamLen',
        str(interval_length), '-stats', snpeff_summ_path, genome_version,
        out_vcf_path
    ]

    util.call(cmd_args, stdout=snpeff_vcf_path)

    # Create tabular output from VCF file using SnpSift

    util.info('Running SnpSift on %s' % snpeff_vcf_path)

    cmd_args = list(util.JAVA) + [
        '-jar', exe.EXE['snpsift'], 'extractFields', snpeff_vcf_path, '-s',
        ',', '-e', '.', 'CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'DP',
        'ANN[*].ERRORS', 'ANN[*].GENEID', 'ANN[*].GENE', 'ANN[*].BIOTYPE',
        'ANN[*].TRID', 'ANN[*].RANK', 'ANN[*].EFFECT', 'ANN[*].IMPACT',
        'ANN[*].HGVS_P', 'ANN[*].HGVS_C', 'ANN[*].CDS_POS', 'ANN[*].CDS_LEN',
        'ANN[*].DISTANCE'
    ]

    util.call(cmd_args, stdout=snpsift_tab_path)

    util.info('Results saved to %s and similarly named analysis files' %
              out_vcf_path)